From c4f9e898ef17b27c8e0f130af818dc9435d0d3a1 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Thu, 14 Mar 2024 10:36:52 +0530
Subject: [PATCH 01/10] =?UTF-8?q?add:=20utility=20to=20format=20our=20docs?=
 =?UTF-8?q?=20too=20=F0=9F=93=9C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/pr_test_peft_backend.yml | 6 +-
 .github/workflows/pr_tests.yml | 6 +-
 Makefile | 2 +
 setup.py | 6 +-
 src/diffusers/dependency_versions_table.py | 1 +
 src/diffusers/image_processor.py | 52 +++++----
 src/diffusers/loaders/ip_adapter.py | 13 ++-
 src/diffusers/loaders/peft.py | 7 +-
 src/diffusers/loaders/single_file.py | 19 +--
 src/diffusers/loaders/textual_inversion.py | 27 ++++-
 src/diffusers/models/attention_processor.py | 6 +-
 .../models/autoencoders/autoencoder_kl.py | 4 +-
 src/diffusers/models/controlnet_flax.py | 8 +-
 src/diffusers/models/embeddings.py | 17 ++-
 src/diffusers/models/resnet.py | 4 +-
 .../transformers/dual_transformer_2d.py | 3 +-
 .../transformers/transformer_temporal.py | 4 +-
 .../models/unets/unet_2d_condition.py | 8 +-
 .../models/unets/unet_2d_condition_flax.py | 11 +-
 .../models/unets/unet_3d_condition.py | 4 +-
 src/diffusers/models/unets/unet_i2vgen_xl.py | 11 +-
 .../models/unets/unet_motion_model.py | 4 +-
 .../unets/unet_spatio_temporal_condition.py | 15 +--
 .../models/unets/unet_stable_cascade.py | 10 +-
 .../pipelines/amused/pipeline_amused.py | 12 +-
 .../amused/pipeline_amused_img2img.py | 8 +-
 .../amused/pipeline_amused_inpaint.py | 8 +-
 .../animatediff/pipeline_animatediff.py | 8 +-
 .../pipeline_animatediff_video2video.py | 45 ++++---
 .../pipelines/animatediff/pipeline_output.py | 3 +-
 .../pipelines/audioldm2/modeling_audioldm2.py | 4 +-
 .../controlnet/pipeline_controlnet.py | 18 +--
 .../controlnet/pipeline_controlnet_img2img.py | 8 +-
 .../controlnet/pipeline_controlnet_inpaint.py | 19 +--
 .../pipeline_controlnet_inpaint_sd_xl.py | 19 +--
 .../controlnet/pipeline_controlnet_sd_xl.py | 8 +-
 .../pipeline_controlnet_sd_xl_img2img.py | 8 +-
 .../alt_diffusion/pipeline_alt_diffusion.py | 4 +-
 .../pipeline_alt_diffusion_img2img.py | 4 +-
 .../versatile_diffusion/modeling_text_unet.py | 8 +-
 src/diffusers/pipelines/free_init_utils.py | 16 +--
 .../pipelines/i2vgen_xl/pipeline_i2vgen_xl.py | 22 ++--
 .../kandinsky3/convert_kandinsky3_unet.py | 6 +-
 .../kandinsky3/pipeline_kandinsky3.py | 4 +-
 .../kandinsky3/pipeline_kandinsky3_img2img.py | 8 +-
 .../pipeline_latent_consistency_img2img.py | 12 +-
 .../pipeline_latent_consistency_text2img.py | 12 +-
 .../pipeline_leditspp_stable_diffusion.py | 110 +++++++++---------
 .../pipeline_leditspp_stable_diffusion_xl.py | 92 +++++++--------
 .../pipelines/ledits_pp/pipeline_output.py | 4 +-
 src/diffusers/pipelines/pia/pipeline_pia.py | 25 ++--
 src/diffusers/pipelines/pipeline_utils.py | 7 +-
 .../pixart_alpha/pipeline_pixart_alpha.py | 4 +-
 .../stable_cascade/pipeline_stable_cascade.py | 4 +-
 .../pipeline_stable_cascade_combined.py | 5 +-
 .../pipeline_stable_cascade_prior.py | 17 +--
 .../pipeline_stable_diffusion.py | 12 +-
 .../pipeline_stable_diffusion_img2img.py | 12 +-
 .../pipeline_stable_diffusion_inpaint.py | 23 ++--
 .../pipeline_stable_diffusion_ldm3d.py | 12 +-
 .../pipeline_stable_diffusion_panorama.py | 8 +-
 .../pipeline_stable_diffusion_xl.py | 12 +-
 .../pipeline_stable_diffusion_xl_img2img.py | 12 +-
 .../pipeline_stable_diffusion_xl_inpaint.py | 23 ++--
 .../pipeline_stable_video_diffusion.py | 42 ++++---
 .../pipeline_stable_diffusion_adapter.py | 4 +-
 .../pipeline_stable_diffusion_xl_adapter.py | 12 +-
 .../pipeline_output.py | 3 +-
 .../pipelines/unidiffuser/modeling_uvit.py | 3 +-
 .../schedulers/scheduling_ddim_flax.py | 3 +-
 .../scheduling_dpmsolver_multistep.py | 4 +-
 .../scheduling_dpmsolver_singlestep.py | 14 +--
 .../scheduling_edm_dpmsolver_multistep.py | 11 +-
 .../schedulers/scheduling_edm_euler.py | 3 +-
 .../schedulers/scheduling_sasolver.py | 29 ++---
 src/diffusers/schedulers/scheduling_tcd.py | 9 +-
 src/diffusers/utils/dynamic_modules_utils.py | 8 +-
 src/diffusers/utils/hub_utils.py | 3 +-
 src/diffusers/utils/loading_utils.py | 4 +-
 src/diffusers/utils/state_dict_utils.py | 4 +-
 tests/fixtures/custom_pipeline/pipeline.py | 5 +-
 tests/lora/test_lora_layers_peft.py | 59 +++++-----
 tests/others/test_check_copies.py | 3 +-
 tests/others/test_check_dummies.py | 16 +--
 .../test_stable_diffusion_adapter.py | 11 +-
 .../test_stable_diffusion_xl_adapter.py | 11 +-
 tests/pipelines/test_pipelines_common.py | 23 ++--
 tests/pipelines/test_pipelines_onnx_common.py | 5 +-
 88 files changed, 619 insertions(+), 564 deletions(-)

diff --git a/.github/workflows/pr_test_peft_backend.yml b/.github/workflows/pr_test_peft_backend.yml
index b9fb06d78124..a99648bd7dd7 100644
--- a/.github/workflows/pr_test_peft_backend.yml
+++ b/.github/workflows/pr_test_peft_backend.yml
@@ -32,9 +32,7 @@ jobs:
           python -m pip install --upgrade pip
           pip install .[quality]
       - name: Check quality
-        run: |
-          ruff check examples tests src utils scripts
-          ruff format examples tests src utils scripts --check
+        run: make quality
 
   check_repository_consistency:
     needs: check_code_quality
@@ -49,7 +47,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install .[quality]
-      - name: Check quality
+      - name: Check repo consistency
         run: |
           python utils/check_copies.py
           python utils/check_dummies.py
diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index d196bb7ff445..b6402a744be7 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -40,9 +40,7 @@ jobs:
           python -m pip install --upgrade pip
           pip install .[quality]
       - name: Check quality
-        run: |
-          ruff check examples tests src utils scripts
-          ruff format examples tests src utils scripts --check
+        run: make quality
 
   check_repository_consistency:
     needs: check_code_quality
@@ -57,7 +55,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install .[quality]
-      - name: Check quality
+      - name: Check repo consistency
         run: |
           python utils/check_copies.py
           python utils/check_dummies.py
diff --git a/Makefile b/Makefile
index c92285b48c71..e2618f4d8925 100644
--- a/Makefile
+++ b/Makefile
@@ -42,6 +42,7 @@ repo-consistency:
 quality:
 	ruff check $(check_dirs) setup.py
 	ruff format --check $(check_dirs) setup.py
+	doc-builder style src/diffusers tests docs/source --max_len 119 --check_only
 	python utils/check_doc_toc.py
 
 # Format source code automatically and check is there are any problems left that need manual fixing
@@ -55,6 +56,7 @@ extra_style_checks:
 
 style:
 	ruff check $(check_dirs) setup.py --fix
 	ruff format $(check_dirs) setup.py
+	doc-builder style src/diffusers tests docs/source --max_len 119
 	${MAKE} autogenerate_code
 	${MAKE} extra_style_checks
diff --git a/setup.py b/setup.py
index b97b01fb4a9f..b0e63ee56c94 100644
--- a/setup.py
+++ b/setup.py
@@ -130,6 +130,7 @@
     "torchvision",
     "transformers>=4.25.1",
     "urllib3<=2.0.0",
+    "black",
 ]
 
 # this is a lookup table with items like:
@@ -201,8 +202,9 @@ def run(self):
 
 extras = {}
-extras["quality"] = deps_list("urllib3", "isort", "ruff",
"hf-doc-builder") -extras["docs"] = deps_list("hf-doc-builder") +# `hf-doc-builder` has a dependency on `black`. See huggingface/doc-builder#434. +extras["quality"] = deps_list("urllib3", "isort", "ruff", "hf-doc-builder", "black") +extras["docs"] = deps_list("hf-doc-builder", "black") extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft") extras["test"] = deps_list( "compel", diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index e92a486bffc1..c542d51fb3f2 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -42,4 +42,5 @@ "torchvision": "torchvision", "transformers": "transformers>=4.25.1", "urllib3": "urllib3<=2.0.0", + "black": "black", } diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index daeb8fd6fa6d..eac3f9b7d578 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -173,8 +173,9 @@ def blur(image: PIL.Image.Image, blur_factor: int = 4) -> PIL.Image.Image: @staticmethod def get_crop_region(mask_image: PIL.Image.Image, width: int, height: int, pad=0): """ - Finds a rectangular region that contains all masked ares in an image, and expands region to match the aspect ratio of the original image; - for example, if user drew mask in a 128x32 region, and the dimensions for processing are 512x512, the region will be expanded to 128x128. + Finds a rectangular region that contains all masked ares in an image, and expands region to match the aspect + ratio of the original image; for example, if user drew mask in a 128x32 region, and the dimensions for + processing are 512x512, the region will be expanded to 128x128. Args: mask_image (PIL.Image.Image): Mask image. @@ -183,7 +184,8 @@ def get_crop_region(mask_image: PIL.Image.Image, width: int, height: int, pad=0) pad (int, optional): Padding to be added to the crop region. Defaults to 0. Returns: - tuple: (x1, y1, x2, y2) represent a rectangular region that contains all masked ares in an image and matches the original aspect ratio. + tuple: (x1, y1, x2, y2) represent a rectangular region that contains all masked ares in an image and + matches the original aspect ratio. """ mask_image = mask_image.convert("L") @@ -265,7 +267,8 @@ def _resize_and_fill( height: int, ) -> PIL.Image.Image: """ - Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, filling empty with data from image. + Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center + the image within the dimensions, filling empty with data from image. Args: image: The image to resize. @@ -309,7 +312,8 @@ def _resize_and_crop( height: int, ) -> PIL.Image.Image: """ - Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, cropping the excess. + Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center + the image within the dimensions, cropping the excess. Args: image: The image to resize. @@ -346,12 +350,12 @@ def resize( The width to resize to. resize_mode (`str`, *optional*, defaults to `default`): The resize mode to use, can be one of `default` or `fill`. If `default`, will resize the image to fit - within the specified width and height, and it may not maintaining the original aspect ratio. 
- If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image - within the dimensions, filling empty with data from image. - If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image - within the dimensions, cropping the excess. - Note that resize_mode `fill` and `crop` are only supported for PIL image input. + within the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, + will resize the image to fit within the specified width and height, maintaining the aspect ratio, and + then center the image within the dimensions, filling empty with data from image. If `crop`, will resize + the image to fit within the specified width and height, maintaining the aspect ratio, and then center + the image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only + supported for PIL image input. Returns: `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`: @@ -456,19 +460,21 @@ def preprocess( Args: image (`pipeline_image_input`): - The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of supported formats. + The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of + supported formats. height (`int`, *optional*, defaults to `None`): - The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default height. + The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default + height. width (`int`, *optional*`, defaults to `None`): - The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width. + The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width. resize_mode (`str`, *optional*, defaults to `default`): - The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit - within the specified width and height, and it may not maintaining the original aspect ratio. - If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image - within the dimensions, filling empty with data from image. - If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image - within the dimensions, cropping the excess. - Note that resize_mode `fill` and `crop` are only supported for PIL image input. + The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within + the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will + resize the image to fit within the specified width and height, maintaining the aspect ratio, and then + center the image within the dimensions, filling empty with data from image. If `crop`, will resize the + image to fit within the specified width and height, maintaining the aspect ratio, and then center the + image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only + supported for PIL image input. crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`): The crop coordinates for each image in the batch. If `None`, will not crop the image. 
""" @@ -930,8 +936,8 @@ def __init__( @staticmethod def downsample(mask: torch.FloatTensor, batch_size: int, num_queries: int, value_embed_dim: int): """ - Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. - If the aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued. + Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. If the + aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued. Args: mask (`torch.FloatTensor`): diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index 93959b9f0a6d..47f995da2f0a 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -67,17 +67,18 @@ def load_ip_adapter( - A [torch state dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). subfolder (`str` or `List[str]`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - If a list is passed, it should have the same length as `weight_name`. + The subfolder location of a model file within a larger model repository on the Hub or locally. If a + list is passed, it should have the same length as `weight_name`. weight_name (`str` or `List[str]`): The name of the weight file to load. If a list is passed, it should have the same length as `weight_name`. image_encoder_folder (`str`, *optional*, defaults to `image_encoder`): The subfolder location of the image encoder within a larger model repository on the Hub or locally. - Pass `None` to not load the image encoder. If the image encoder is located in a folder inside `subfolder`, - you only need to pass the name of the folder that contains image encoder weights, e.g. `image_encoder_folder="image_encoder"`. - If the image encoder is located in a folder other than `subfolder`, you should pass the path to the folder that contains image encoder weights, - for example, `image_encoder_folder="different_subfolder/image_encoder"`. + Pass `None` to not load the image encoder. If the image encoder is located in a folder inside + `subfolder`, you only need to pass the name of the folder that contains image encoder weights, e.g. + `image_encoder_folder="image_encoder"`. If the image encoder is located in a folder other than + `subfolder`, you should pass the path to the folder that contains image encoder weights, for example, + `image_encoder_folder="different_subfolder/image_encoder"`. cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. diff --git a/src/diffusers/loaders/peft.py b/src/diffusers/loaders/peft.py index 01dbd3494a4c..5892c2865374 100644 --- a/src/diffusers/loaders/peft.py +++ b/src/diffusers/loaders/peft.py @@ -20,7 +20,8 @@ class PeftAdapterMixin: """ A class containing all functions for loading and using adapters weights that are supported in PEFT library. For - more details about adapters and injecting them in a transformer-based model, check out the PEFT [documentation](https://huggingface.co/docs/peft/index). + more details about adapters and injecting them in a transformer-based model, check out the PEFT + [documentation](https://huggingface.co/docs/peft/index). 
Install the latest version of PEFT, and use this mixin to: @@ -143,8 +144,8 @@ def disable_adapters(self) -> None: def enable_adapters(self) -> None: """ - Enable adapters that are attached to the model. The model uses `self.active_adapters()` to retrieve the - list of adapters to enable. + Enable adapters that are attached to the model. The model uses `self.active_adapters()` to retrieve the list of + adapters to enable. If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT [documentation](https://huggingface.co/docs/peft). diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py index 0d384b1647d5..752ef18c7a0b 100644 --- a/src/diffusers/loaders/single_file.py +++ b/src/diffusers/loaders/single_file.py @@ -198,19 +198,24 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): model_type (`str`, *optional*): The type of model to load. If not provided, the model type will be inferred from the checkpoint file. image_size (`int`, *optional*): - The size of the image output. It's used to configure the `sample_size` parameter of the UNet and VAE model. + The size of the image output. It's used to configure the `sample_size` parameter of the UNet and VAE + model. load_safety_checker (`bool`, *optional*, defaults to `False`): - Whether to load the safety checker model or not. By default, the safety checker is not loaded unless a `safety_checker` component is passed to the `kwargs`. + Whether to load the safety checker model or not. By default, the safety checker is not loaded unless a + `safety_checker` component is passed to the `kwargs`. num_in_channels (`int`, *optional*): - Specify the number of input channels for the UNet model. Read more about how to configure UNet model with this parameter + Specify the number of input channels for the UNet model. Read more about how to configure UNet model + with this parameter [here](https://huggingface.co/docs/diffusers/training/adapt_a_model#configure-unet2dconditionmodel-parameters). scaling_factor (`float`, *optional*): - The scaling factor to use for the VAE model. If not provided, it is inferred from the config file first. - If the scaling factor is not found in the config file, the default value 0.18215 is used. + The scaling factor to use for the VAE model. If not provided, it is inferred from the config file + first. If the scaling factor is not found in the config file, the default value 0.18215 is used. scheduler_type (`str`, *optional*): - The type of scheduler to load. If not provided, the scheduler type will be inferred from the checkpoint file. + The type of scheduler to load. If not provided, the scheduler type will be inferred from the checkpoint + file. prediction_type (`str`, *optional*): - The type of prediction to load. If not provided, the prediction type will be inferred from the checkpoint file. + The type of prediction to load. If not provided, the prediction type will be inferred from the + checkpoint file. kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline class). The overwritten components are passed directly to the pipelines `__init__` method. 
See example diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py index aaaf4b68bb5f..653630860fcf 100644 --- a/src/diffusers/loaders/textual_inversion.py +++ b/src/diffusers/loaders/textual_inversion.py @@ -486,20 +486,35 @@ def unload_textual_inversion( # Example 3: unload from SDXL pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") - embedding_path = hf_hub_download(repo_id="linoyts/web_y2k", filename="web_y2k_emb.safetensors", repo_type="model") + embedding_path = hf_hub_download( + repo_id="linoyts/web_y2k", filename="web_y2k_emb.safetensors", repo_type="model" + ) # load embeddings to the text encoders state_dict = load_file(embedding_path) # load embeddings of text_encoder 1 (CLIP ViT-L/14) - pipeline.load_textual_inversion(state_dict["clip_l"], token=["", ""], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer) + pipeline.load_textual_inversion( + state_dict["clip_l"], + token=["", ""], + text_encoder=pipeline.text_encoder, + tokenizer=pipeline.tokenizer, + ) # load embeddings of text_encoder 2 (CLIP ViT-G/14) - pipeline.load_textual_inversion(state_dict["clip_g"], token=["", ""], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2) + pipeline.load_textual_inversion( + state_dict["clip_g"], + token=["", ""], + text_encoder=pipeline.text_encoder_2, + tokenizer=pipeline.tokenizer_2, + ) # Unload explicitly from both text encoders abd tokenizers - pipeline.unload_textual_inversion(tokens=["", ""], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer) - pipeline.unload_textual_inversion(tokens=["", ""], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2) - + pipeline.unload_textual_inversion( + tokens=["", ""], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer + ) + pipeline.unload_textual_inversion( + tokens=["", ""], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2 + ) ``` """ diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 44fbd584cd7c..4d5968d3b632 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -1301,9 +1301,9 @@ def __call__( class FusedAttnProcessor2_0: r""" - Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). - It uses fused projection layers. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). It uses + fused projection layers. For self-attention modules, all projection matrices (i.e., query, key, value) are fused. + For cross-attention modules, key and value projection matrices are fused. diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py index 9bbf2023eb99..b286453de424 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -453,8 +453,8 @@ def forward( # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections def fuse_qkv_projections(self): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. 
For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. diff --git a/src/diffusers/models/controlnet_flax.py b/src/diffusers/models/controlnet_flax.py index 6f9b201aa1e3..0540850a9e61 100644 --- a/src/diffusers/models/controlnet_flax.py +++ b/src/diffusers/models/controlnet_flax.py @@ -329,15 +329,15 @@ def __call__( controlnet_cond (`jnp.ndarray`): (batch, channel, height, width) the conditional input tensor conditioning_scale (`float`, *optional*, defaults to `1.0`): the scale factor for controlnet outputs return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of a - plain tuple. + Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of + a plain tuple. train (`bool`, *optional*, defaults to `False`): Use deterministic functions and disable dropout when not training. Returns: [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`: - [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is the sample tensor. + [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise + a `tuple`. When returning a tuple, the first element is the sample tensor. """ channel_order = self.controlnet_conditioning_channel_order if channel_order == "bgr": diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index c15ff24cbcda..6e1f9059cf97 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -796,16 +796,13 @@ class IPAdapterPlusImageProjection(nn.Module): Args: ---- - embed_dims (int): The feature dimension. Defaults to 768. - output_dims (int): The number of output channels, that is the same - number of the channels in the - `unet.config.cross_attention_dim`. Defaults to 1024. - hidden_dims (int): The number of hidden channels. Defaults to 1280. - depth (int): The number of blocks. Defaults to 8. - dim_head (int): The number of head channels. Defaults to 64. - heads (int): Parallel attention heads. Defaults to 16. - num_queries (int): The number of queries. Defaults to 8. - ffn_ratio (float): The expansion ratio of feedforward network hidden + embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels, + that is the same + number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024. + hidden_dims (int): The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults + to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads. + Defaults to 16. num_queries (int): The number of queries. Defaults to 8. ffn_ratio (float): The expansion ratio + of feedforward network hidden layer channels. Defaults to 4. """ diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index ec75861e2da0..ccebc68e45c2 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -204,8 +204,8 @@ class ResnetBlock2D(nn.Module): eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization. 
non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use. time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config. - By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" - for a stronger conditioning with scale and shift. + By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a + stronger conditioning with scale and shift. kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`]. output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output. diff --git a/src/diffusers/models/transformers/dual_transformer_2d.py b/src/diffusers/models/transformers/dual_transformer_2d.py index 96849bd28bb1..e2f1b8538ca0 100644 --- a/src/diffusers/models/transformers/dual_transformer_2d.py +++ b/src/diffusers/models/transformers/dual_transformer_2d.py @@ -120,7 +120,8 @@ def forward( `self.processor` in [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. Returns: [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`: diff --git a/src/diffusers/models/transformers/transformer_temporal.py b/src/diffusers/models/transformers/transformer_temporal.py index 9c61eaee2d21..e4f9a39d84bd 100644 --- a/src/diffusers/models/transformers/transformer_temporal.py +++ b/src/diffusers/models/transformers/transformer_temporal.py @@ -294,8 +294,8 @@ def forward( A tensor indicating whether the input contains only images. 1 indicates that the input contains only images, 0 indicates that the input contains video frames. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of a plain - tuple. + Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of a + plain tuple. Returns: [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`: diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py index 9f69b03462dc..100b6194c09f 100644 --- a/src/diffusers/models/unets/unet_2d_condition.py +++ b/src/diffusers/models/unets/unet_2d_condition.py @@ -865,8 +865,8 @@ def disable_freeu(self): def fuse_qkv_projections(self): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. @@ -1103,8 +1103,8 @@ def forward( Returns: [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: - If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise - a `tuple` is returned where the first element is the sample tensor. 
+ If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, + otherwise a `tuple` is returned where the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). diff --git a/src/diffusers/models/unets/unet_2d_condition_flax.py b/src/diffusers/models/unets/unet_2d_condition_flax.py index a5ec2875ca0e..edbbcbaeda73 100644 --- a/src/diffusers/models/unets/unet_2d_condition_flax.py +++ b/src/diffusers/models/unets/unet_2d_condition_flax.py @@ -76,7 +76,8 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin): up_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D")`): The tuple of upsample blocks to use. mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): - Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`. If `None`, the mid block layer is skipped. + Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`. If `None`, the mid block layer + is skipped. block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): The tuple of output channels for each block. layers_per_block (`int`, *optional*, defaults to 2): @@ -350,15 +351,15 @@ def __call__( mid_block_additional_residual: (`torch.Tensor`, *optional*): A tensor that if specified is added to the residual of the middle unet block. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of a - plain tuple. + Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of + a plain tuple. train (`bool`, *optional*, defaults to `False`): Use deterministic functions and disable dropout when not training. Returns: [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`: - [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. - When returning a tuple, the first element is the sample tensor. + [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is the sample tensor. """ # 1. time if not isinstance(timesteps, jnp.ndarray): diff --git a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py index b7641a96a7a1..f9e8640836d6 100644 --- a/src/diffusers/models/unets/unet_3d_condition.py +++ b/src/diffusers/models/unets/unet_3d_condition.py @@ -507,8 +507,8 @@ def disable_freeu(self): # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections def fuse_qkv_projections(self): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. 
diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index 5c5c6a2cc5ec..0a5f71ed0029 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -99,8 +99,8 @@ def forward( class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): r""" - I2VGenXL UNet. It is a conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep - and returns a sample-shaped output. + I2VGenXL UNet. It is a conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and + returns a sample-shaped output. This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented for all models (such as downloading or saving). @@ -477,8 +477,8 @@ def disable_freeu(self): # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections def fuse_qkv_projections(self): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. @@ -533,7 +533,8 @@ def forward( timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. fps (`torch.Tensor`): Frames per second for the video being generated. Used as a "micro-condition". image_latents (`torch.FloatTensor`): Image encodings from the VAE. - image_embeddings (`torch.FloatTensor`): Projection embeddings of the conditioning image computed with a vision encoder. + image_embeddings (`torch.FloatTensor`): + Projection embeddings of the conditioning image computed with a vision encoder. encoder_hidden_states (`torch.FloatTensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. cross_attention_kwargs (`dict`, *optional*): diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py index ab2eac4c9a9a..3e8973def4ee 100644 --- a/src/diffusers/models/unets/unet_motion_model.py +++ b/src/diffusers/models/unets/unet_motion_model.py @@ -705,8 +705,8 @@ def disable_freeu(self) -> None: # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections def fuse_qkv_projections(self): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. 
diff --git a/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/src/diffusers/models/unets/unet_spatio_temporal_condition.py index 5fe265e63fc5..0f89df8c6bff 100644 --- a/src/diffusers/models/unets/unet_spatio_temporal_condition.py +++ b/src/diffusers/models/unets/unet_spatio_temporal_condition.py @@ -31,8 +31,8 @@ class UNetSpatioTemporalConditionOutput(BaseOutput): class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): r""" - A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and returns a sample - shaped output. + A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and + returns a sample shaped output. This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented for all models (such as downloading or saving). @@ -57,7 +57,8 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL The dimension of the cross attention features. transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1): The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for - [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`], + [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], + [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`], [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`]. num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`): The number of attention heads. @@ -374,12 +375,12 @@ def forward( The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal embeddings and added to the time embeddings. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead of a plain - tuple. + Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead + of a plain tuple. Returns: [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`: - If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is returned, otherwise - a `tuple` is returned where the first element is the sample tensor. + If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is + returned, otherwise a `tuple` is returned where the first element is the sample tensor. """ # 1. time timesteps = timestep diff --git a/src/diffusers/models/unets/unet_stable_cascade.py b/src/diffusers/models/unets/unet_stable_cascade.py index 9f81e50241a9..b93cb805adf2 100644 --- a/src/diffusers/models/unets/unet_stable_cascade.py +++ b/src/diffusers/models/unets/unet_stable_cascade.py @@ -187,7 +187,8 @@ def __init__( block_out_channels (Tuple[int], defaults to (2048, 2048)): Tuple of output channels for each block. num_attention_heads (Tuple[int], defaults to (32, 32)): - Number of attention heads in each attention block. Set to -1 to if block types in a layer do not have attention. + Number of attention heads in each attention block. Set to -1 to if block types in a layer do not have + attention. down_num_layers_per_block (Tuple[int], defaults to [8, 24]): Number of layers in each down block. 
up_num_layers_per_block (Tuple[int], defaults to [24, 8]): @@ -198,10 +199,9 @@ def __init__( Number of 1x1 Convolutional layers to repeat in each up block. block_types_per_layer (Tuple[Tuple[str]], optional, defaults to ( - ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"), - ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock") - ): - Block types used in each layer of the up/down blocks. + ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"), ("SDCascadeResBlock", + "SDCascadeTimestepBlock", "SDCascadeAttnBlock") + ): Block types used in each layer of the up/down blocks. clip_text_in_channels (`int`, *optional*, defaults to `None`): Number of input channels for CLIP based text conditioning. clip_text_pooled_in_channels (`int`, *optional*, defaults to 1280): diff --git a/src/diffusers/pipelines/amused/pipeline_amused.py b/src/diffusers/pipelines/amused/pipeline_amused.py index aa682b46fe70..994455ff29db 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused.py +++ b/src/diffusers/pipelines/amused/pipeline_amused.py @@ -30,9 +30,7 @@ >>> import torch >>> from diffusers import AmusedPipeline - >>> pipe = AmusedPipeline.from_pretrained( - ... "amused/amused-512", variant="fp16", torch_dtype=torch.float16 - ... ) + >>> pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16) >>> pipe = pipe.to("cuda") >>> prompt = "a photo of an astronaut riding a horse on mars" @@ -150,10 +148,12 @@ def __call__( A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): - The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/ - and the micro-conditioning section of https://arxiv.org/abs/2307.01952. + The targeted aesthetic score according to the laion aesthetic classifier. See + https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of + https://arxiv.org/abs/2307.01952. micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): - The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952. + The targeted height, width crop coordinates. See the micro-conditioning section of + https://arxiv.org/abs/2307.01952. temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`. diff --git a/src/diffusers/pipelines/amused/pipeline_amused_img2img.py b/src/diffusers/pipelines/amused/pipeline_amused_img2img.py index 444d6354b7da..99decbfd94f1 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused_img2img.py +++ b/src/diffusers/pipelines/amused/pipeline_amused_img2img.py @@ -167,10 +167,12 @@ def __call__( A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): - The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/ - and the micro-conditioning section of https://arxiv.org/abs/2307.01952. 
+ The targeted aesthetic score according to the laion aesthetic classifier. See + https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of + https://arxiv.org/abs/2307.01952. micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): - The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952. + The targeted height, width crop coordinates. See the micro-conditioning section of + https://arxiv.org/abs/2307.01952. temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`. diff --git a/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py b/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py index 423f5734b478..ab0a55cdd388 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py +++ b/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py @@ -191,10 +191,12 @@ def __call__( A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): - The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/ - and the micro-conditioning section of https://arxiv.org/abs/2307.01952. + The targeted aesthetic score according to the laion aesthetic classifier. See + https://laion.ai/blog/laion-aesthetics/ and the micro-conditioning section of + https://arxiv.org/abs/2307.01952. micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): - The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952. + The targeted height, width crop coordinates. See the micro-conditioning section of + https://arxiv.org/abs/2307.01952. temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`. diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index cd7f0a283b63..7dba61562dcd 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -639,10 +639,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. 
output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index cb6b71351faf..a569bdadd34d 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -52,14 +52,21 @@ >>> from io import BytesIO >>> from PIL import Image - >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16) - >>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter).to("cuda") - >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") + >>> adapter = MotionAdapter.from_pretrained( + ... "guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16 + ... ) + >>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained( + ... "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter + ... ).to("cuda") + >>> pipe.scheduler = DDIMScheduler( + ... beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace" + ... ) + >>> def load_video(file_path: str): ... images = [] - ... - ... if file_path.startswith(('http://', 'https://')): + + ... if file_path.startswith(("http://", "https://")): ... # If the file_path is a URL ... response = requests.get(file_path) ... response.raise_for_status() @@ -68,15 +75,20 @@ ... else: ... # Assuming it's a local file path ... vid = imageio.get_reader(file_path) - ... + ... for frame in vid: ... pil_image = Image.fromarray(frame) ... images.append(pil_image) - ... + ... return images - >>> video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif") - >>> output = pipe(video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5) + + >>> video = load_video( + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif" + ... ) + >>> output = pipe( + ... video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5 + ... ) >>> frames = output.frames[0] >>> export_to_gif(frames, "animation.gif") ``` @@ -135,8 +147,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -799,16 +811,15 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. 
It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`AnimateDiffPipelineOutput`] instead - of a plain tuple. + Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). diff --git a/src/diffusers/pipelines/animatediff/pipeline_output.py b/src/diffusers/pipelines/animatediff/pipeline_output.py index 184a45848a37..97e7c87ad7f7 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_output.py +++ b/src/diffusers/pipelines/animatediff/pipeline_output.py @@ -15,7 +15,8 @@ class AnimateDiffPipelineOutput(BaseOutput): Args: frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): - List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing + denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape `(batch_size, num_frames, channels, height, width)` """ diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py index c0b85e4db5f6..70bab832eea2 100644 --- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py @@ -701,8 +701,8 @@ def forward( Returns: [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: - If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise - a `tuple` is returned where the first element is the sample tensor. + If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, + otherwise a `tuple` is returned where the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 8f31dfc2678a..8ed7b7a99b92 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -107,8 +107,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. 
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -920,9 +920,9 @@ def __call__( accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single ControlNet, - each will be paired with each prompt in the `prompt` list. This also applies to multiple ControlNets, - where a list of image lists can be passed to batch for each prompt and each ControlNet. + input to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single + ControlNet, each will be paired with each prompt in the `prompt` list. This also applies to multiple + ControlNets, where a list of image lists can be passed to batch for each prompt and each ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -960,10 +960,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 9d2c76fd7483..14131d01d9c5 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -978,10 +978,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. 
- Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index c4f1bff5efcd..3cbd94d8dae8 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -1167,11 +1167,12 @@ def __call__( width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. padding_mask_crop (`int`, *optional*, defaults to `None`): - The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If - `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and - contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on - the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large - and contain information inreleant for inpainging, such as background. + The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to + image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region + with the same aspect ratio as the image that contains all of the masked area, and then expand that area based + on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before + resizing to the original image size for inpainting. This is useful when the masked area is small while + the image is large and contains information irrelevant for inpainting, such as the background. strength (`float`, *optional*, defaults to 1.0): Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a starting point and more noise is added the higher the `strength`. The number of denoising steps depends @@ -1207,10 +1208,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`.
- If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index 52ffe5a3f356..b035b0622bc0 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -1194,11 +1194,12 @@ def __call__( width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The width in pixels of the generated image. padding_mask_crop (`int`, *optional*, defaults to `None`): - The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If - `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and - contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on - the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large - and contain information inreleant for inpainging, such as background. + The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to + image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region + with the same aspect ratio as the image that contains all of the masked area, and then expand that area based + on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before + resizing to the original image size for inpainting. This is useful when the masked area is small while + the image is large and contains information irrelevant for inpainting, such as the background. strength (`float`, *optional*, defaults to 0.9999): Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the @@ -1247,10 +1248,10 @@ def __call__( argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`.
It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. pooled_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index eca81083be7b..8b85f02b8d0a 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -1037,10 +1037,10 @@ def __call__( argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index 86a0e2c570d8..5ec584e0fd1e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -1176,10 +1176,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
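The `ip_adapter_image_embeds` docstrings reflowed above all describe the same contract: a list with one entry per loaded IP-Adapter, each entry a tensor of shape `(batch_size, num_images, emb_dim)`, with the negative image embedding included when classifier-free guidance is enabled. A minimal sketch of that contract follows; the checkpoint names, the embedding dimension (1024 for a hypothetical SD 1.5 IP-Adapter), and the zero tensors are illustrative assumptions, not part of this changeset.

```py
>>> import torch
>>> from diffusers import AutoPipelineForText2Image

>>> pipe = AutoPipelineForText2Image.from_pretrained(
...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
... ).to("cuda")
>>> pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

>>> # One list entry per loaded IP-Adapter, each of shape (batch_size, num_images, emb_dim).
>>> # Zeros are used here only to illustrate the expected layout; real embeddings would come from
>>> # the image encoder. With classifier-free guidance the negative embedding is included as well.
>>> image_embeds = [torch.zeros(2, 1, 1024, dtype=torch.float16, device="cuda")]
>>> image = pipe(
...     "a photo of a cat", ip_adapter_image_embeds=image_embeds, num_inference_steps=25
... ).images[0]
```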
diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py index e4583699e79e..9a1bb5e78fb1 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py @@ -89,8 +89,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py index 156e52c249d9..48b3b96483d5 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -129,8 +129,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py index 62a3a8728a2a..1082b742c8ae 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py @@ -1000,8 +1000,8 @@ def disable_freeu(self): def fuse_qkv_projections(self): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. @@ -1112,8 +1112,8 @@ def forward( Returns: [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: - If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise - a `tuple` is returned where the first element is the sample tensor. + If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, + otherwise a `tuple` is returned where the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). 
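Several hunks in this changeset reflow the `fuse_qkv_projections` docstrings. For reference, a minimal usage sketch is given below; it assumes a standard Stable Diffusion checkpoint and that the matching `unfuse_qkv_projections` call is available to restore the original attention modules.

```py
>>> import torch
>>> from diffusers import StableDiffusionPipeline

>>> pipe = StableDiffusionPipeline.from_pretrained(
...     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
... ).to("cuda")

>>> # Fuse the projection matrices (self-attention: query, key, value; cross-attention: key, value)
>>> # before running inference, then unfuse to restore the original modules.
>>> pipe.fuse_qkv_projections()
>>> image = pipe("an astronaut riding a horse on mars", num_inference_steps=25).images[0]
>>> pipe.unfuse_qkv_projections()
```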
diff --git a/src/diffusers/pipelines/free_init_utils.py b/src/diffusers/pipelines/free_init_utils.py index 50c28cc69f44..d842f4116ee7 100644 --- a/src/diffusers/pipelines/free_init_utils.py +++ b/src/diffusers/pipelines/free_init_utils.py @@ -41,20 +41,20 @@ def enable_free_init( num_iters (`int`, *optional*, defaults to `3`): Number of FreeInit noise re-initialization iterations. use_fast_sampling (`bool`, *optional*, defaults to `False`): - Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables - the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`. + Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables the + "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`. method (`str`, *optional*, defaults to `butterworth`): - Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the - FreeInit low pass filter. + Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the FreeInit low + pass filter. order (`int`, *optional*, defaults to `4`): Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour whereas lower values lead to `gaussian` method behaviour. spatial_stop_frequency (`float`, *optional*, defaults to `0.25`): - Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in - the original implementation. + Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in the + original implementation. temporal_stop_frequency (`float`, *optional*, defaults to `0.25`): - Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in - the original implementation. + Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in the + original implementation. """ self._free_init_num_iters = num_iters self._free_init_use_fast_sampling = use_fast_sampling diff --git a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py index cb6f3e300904..a6b9499f5542 100644 --- a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +++ b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py @@ -43,10 +43,14 @@ >>> from diffusers import I2VGenXLPipeline >>> from diffusers.utils import export_to_gif, load_image - >>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16") + >>> pipeline = I2VGenXLPipeline.from_pretrained( + ... "ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16" + ... ) >>> pipeline.enable_model_cpu_offload() - >>> image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png" + >>> image_url = ( + ... "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png" + ... ) >>> image = load_image(image_url).convert("RGB") >>> prompt = "Papers were floating in the air on a table in the library" @@ -59,7 +63,7 @@ ... num_inference_steps=50, ... negative_prompt=negative_prompt, ... guidance_scale=9.0, - ... generator=generator + ... generator=generator, ... 
).frames[0] >>> video_path = export_to_gif(frames, "i2v.gif") ``` @@ -95,7 +99,8 @@ class I2VGenXLPipelineOutput(BaseOutput): Args: frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): - List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing + denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape `(batch_size, num_frames, channels, height, width)` """ @@ -551,7 +556,8 @@ def __call__( width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. target_fps (`int`, *optional*): - Frames per second. The rate at which the generated images shall be exported to a video after generation. This is also used as a "micro-condition" while generation. + Frames per second. The rate at which the generated images shall be exported to a video after + generation. This is also used as a "micro-condition" while generation. num_frames (`int`, *optional*): The number of video frames to generate. num_inference_steps (`int`, *optional*): @@ -568,9 +574,9 @@ def __call__( num_videos_per_prompt (`int`, *optional*): The number of images to generate per prompt. decode_chunk_size (`int`, *optional*): - The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency - between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once - for maximal quality. Reduce `decode_chunk_size` to reduce memory usage. + The number of frames to decode at a time. The higher the chunk size, the higher the temporal + consistency between frames, but also the higher the memory consumption. By default, the decoder will + decode all frames at once for maximal quality. Reduce `decode_chunk_size` to reduce memory usage. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. diff --git a/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py b/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py index 4fe8c54eb7fc..5360632275b4 100755 --- a/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py +++ b/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py @@ -35,10 +35,10 @@ def convert_state_dict(unet_state_dict): """ - Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model. Args: - unet_model (torch.nn.Module): The original U-Net model. - unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet model to match keys with. + Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model. + unet_model (torch.nn.Module): The original U-Net model. unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet + model to match keys with. Returns: OrderedDict: The converted state dictionary. 
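The `enable_free_init` arguments documented in the `free_init_utils.py` hunk above map directly onto a call like the following. This is a minimal sketch assuming an AnimateDiff-style pipeline (which mixes in the FreeInit utilities); the checkpoint names are placeholders and the argument values simply repeat the documented defaults.

```py
>>> import torch
>>> from diffusers import AnimateDiffPipeline, MotionAdapter
>>> from diffusers.utils import export_to_gif

>>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
>>> pipe = AnimateDiffPipeline.from_pretrained(
...     "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
... ).to("cuda")

>>> # FreeInit noise re-initialization: 3 iterations, Butterworth low-pass filter of order 4,
>>> # normalized spatial/temporal stop frequencies of 0.25 (the documented defaults).
>>> pipe.enable_free_init(
...     num_iters=3,
...     use_fast_sampling=False,
...     method="butterworth",
...     order=4,
...     spatial_stop_frequency=0.25,
...     temporal_stop_frequency=0.25,
... )
>>> frames = pipe("a panda playing a guitar", num_frames=16).frames[0]
>>> pipe.disable_free_init()
>>> export_to_gif(frames, "freeinit.gif")
```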
diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py index fcf7ddcb9966..076b3f77d477 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py @@ -24,7 +24,9 @@ >>> from diffusers import AutoPipelineForText2Image >>> import torch - >>> pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16) + >>> pipe = AutoPipelineForText2Image.from_pretrained( + ... "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16 + ... ) >>> pipe.enable_model_cpu_offload() >>> prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background." diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py index 7f4164a04d1e..755e5089299c 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py @@ -29,11 +29,15 @@ >>> from diffusers.utils import load_image >>> import torch - >>> pipe = AutoPipelineForImage2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16) + >>> pipe = AutoPipelineForImage2Image.from_pretrained( + ... "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16 + ... ) >>> pipe.enable_model_cpu_offload() >>> prompt = "A painting of the inside of a subway train with tiny raccoons." - >>> image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png") + >>> image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png" + ... ) >>> generator = torch.Generator(device="cpu").manual_seed(0) >>> image = pipe(prompt, image=image, strength=0.75, num_inference_steps=25, generator=generator).images[0] diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index f64854ea982b..07b659cc1b5e 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -73,8 +73,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -747,10 +747,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. 
It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py index e9bacaa89ba5..2697dc6ec244 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -77,8 +77,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -679,10 +679,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py index a6357c4cd3a1..619be13a8f36 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py @@ -40,30 +40,21 @@ >>> from io import BytesIO >>> from diffusers import LEditsPPPipelineStableDiffusion + >>> from diffusers.utils import load_image >>> pipe = LEditsPPPipelineStableDiffusion.from_pretrained( ... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16 ... 
) >>> pipe = pipe.to("cuda") - >>> def download_image(url): - ... response = requests.get(url) - ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") - >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png" - >>> image = download_image(img_url) + >>> image = load_image(img_url).convert("RGB") - >>> _ = pipe.invert( - ... image = image, - ... num_inversion_steps=50, - ... skip=0.1 - ... ) + >>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.1) >>> edited_image = pipe( - ... editing_prompt=["cherry blossom"], - ... edit_guidance_scale=10.0, - ... edit_threshold=0.75, - ).images[0] + ... editing_prompt=["cherry blossom"], edit_guidance_scale=10.0, edit_threshold=0.75 + ... ).images[0] ``` """ @@ -279,8 +270,8 @@ class LEditsPPPipelineStableDiffusion( unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of - [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically - be set to [`DPMSolverMultistepScheduler`]. + [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will + automatically be set to [`DPMSolverMultistepScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. @@ -531,8 +522,7 @@ def encode_prompt( `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). editing_prompt (`str` or `List[str]`, *optional*): - Editing prompt(s) to be encoded. If not defined, one has to pass - `editing_prompt_embeds` instead. + Editing prompt(s) to be encoded. If not defined, one has to pass `editing_prompt_embeds` instead. editing_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. @@ -734,8 +724,9 @@ def __call__( **kwargs, ): r""" - The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`] - method has to be called beforehand. Edits will always be performed for the last inverted image(s). + The call function to the pipeline for editing. The + [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`] method has to be called beforehand. Edits will + always be performed for the last inverted image(s). Args: negative_prompt (`str` or `List[str]`, *optional*): @@ -748,49 +739,51 @@ def __call__( The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a - plain tuple. + Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a plain + tuple. editing_prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. The image is reconstructed by setting - `editing_prompt = None`. 
Guidance direction of prompt should be specified via `reverse_editing_direction`. + `editing_prompt = None`. Guidance direction of prompt should be specified via + `reverse_editing_direction`. editing_prompt_embeds (`torch.Tensor>`, *optional*): - Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should be - specified via `reverse_editing_direction`. + Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should + be specified via `reverse_editing_direction`. negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`): Whether the corresponding prompt in `editing_prompt` should be increased or decreased. edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5): - Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`. - `edit_guidance_scale` is defined as `s_e` of equation 12 of - [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + Guidance scale for guiding the image generation. If provided as list values should correspond to + `editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++ + Paper](https://arxiv.org/abs/2301.12247). edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10): Number of diffusion steps (for each prompt) for which guidance will not be applied. edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`): Number of diffusion steps (for each prompt) after which guidance will no longer be applied. edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9): Masking threshold of guidance. Threshold should be proportional to the image region that is modified. - 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ + Paper](https://arxiv.org/abs/2301.12247). user_mask (`torch.FloatTensor`, *optional*): - User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit - masks do not meet user preferences. + User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s + implicit masks do not meet user preferences. sem_guidance (`List[torch.Tensor]`, *optional*): List of pre-generated guidance vectors to be applied at generation. Length of the list has to correspond to `num_inference_steps`. use_cross_attn_mask (`bool`, defaults to `False`): Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask - is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of - [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). + is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++ + paper](https://arxiv.org/pdf/2311.16711.pdf). use_intersect_mask (`bool`, defaults to `True`): - Whether the masking term is calculated as intersection of cross-attention masks and masks derived - from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise - estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). 
+ Whether the masking term is calculated as intersection of cross-attention masks and masks derived from + the noise estimate. Cross-attention masks are defined as 'M^1' and masks derived from the noise estimate + are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). attn_store_steps (`List[int]`, *optional*): Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes. store_averaged_over_steps (`bool`, defaults to `True`): - Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. - If False, attention maps for each step are stores separately. Just for visualization purposes. + Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If + False, attention maps for each step are stored separately. Just for visualization purposes. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). @@ -815,10 +808,10 @@ def __call__( Returns: [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, - otherwise a `tuple. When returning a tuple, the first element is a list with the generated images, and the - second element is a list of `bool`s denoting whether the corresponding generated image likely represents - "not-safe-for-work" (nsfw) content, according to the `safety_checker`. + [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + content, according to the `safety_checker`. """ if self.inversion_steps is None: @@ -1219,9 +1212,9 @@ def invert( crops_coords: Optional[Tuple[int, int, int, int]] = None, ): r""" - The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). - If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) - will be performed instead. + The function of the pipeline for image inversion as described by the [LEDITS++ + Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the + inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead. Args: image (`PipelineImageInput`): @@ -1238,8 +1231,8 @@ def invert( Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values will lead to stronger changes to the input image. `skip` has to be between `0` and `1`. generator (`torch.Generator`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - inversion deterministic. + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion + deterministic.
cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). @@ -1247,23 +1240,24 @@ def invert( Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. height (`int`, *optional*, defaults to `None`): - The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default height. + The height of the preprocessed image. If `None`, will use `get_default_height_width()` to get the default + height. width (`int`, *optional*`, defaults to `None`): - The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width. + The width of the preprocessed image. If `None`, will use `get_default_height_width()` to get the default width. resize_mode (`str`, *optional*, defaults to `default`): - The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit - within the specified width and height, and it may not maintaining the original aspect ratio. - If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image - within the dimensions, filling empty with data from image. - If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image - within the dimensions, cropping the excess. - Note that resize_mode `fill` and `crop` are only supported for PIL image input. + The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within + the specified width and height, and it may not maintain the original aspect ratio. If `fill`, will + resize the image to fit within the specified width and height, maintaining the aspect ratio, and then + center the image within the dimensions, filling empty space with data from the image. If `crop`, will resize the + image to fit within the specified width and height, maintaining the aspect ratio, and then center the + image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only + supported for PIL image input. crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`): The crop coordinates for each image in the batch. If `None`, will not crop the image. Returns: - [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: - Output will contain the resized input image(s) and respective VAE reconstruction(s). + [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s) + and respective VAE reconstruction(s). """ # Reset attn processor, we do not want to store attn maps during inversion self.unet.set_attn_processor(AttnProcessor()) diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py index 874a10a7ccd5..c203e35af59d 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py @@ -85,25 +85,23 @@ ... ) >>> pipe = pipe.to("cuda") + >>> def download_image(url): ... response = requests.get(url) ...
return PIL.Image.open(BytesIO(response.content)).convert("RGB") + >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/tennis.jpg" >>> image = download_image(img_url) - >>> _ = pipe.invert( - ... image = image, - ... num_inversion_steps=50, - ... skip=0.2 - ... ) + >>> _ = pipe.invert(image=image, num_inversion_steps=50, skip=0.2) >>> edited_image = pipe( - ... editing_prompt=["tennis ball","tomato"], - ... reverse_editing_direction=[True,False], - ... edit_guidance_scale=[5.0,10.0], - ... edit_threshold=[0.9,0.85], - ).images[0] + ... editing_prompt=["tennis ball", "tomato"], + ... reverse_editing_direction=[True, False], + ... edit_guidance_scale=[5.0, 10.0], + ... edit_threshold=[0.9, 0.85], + ... ).images[0] ``` """ @@ -292,9 +290,9 @@ class LEditsPPPipelineStableDiffusionXL( """ Pipeline for textual image editing using LEDits++ with Stable Diffusion XL. - This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the superclass - documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular - device, etc.). + This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the + superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a + particular device, etc.). In addition the pipeline inherits the following loading methods: - *LoRA*: [`LEditsPPPipelineStableDiffusionXL.load_lora_weights`] @@ -325,8 +323,8 @@ class LEditsPPPipelineStableDiffusionXL( unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of - [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically - be set to [`DPMSolverMultistepScheduler`]. + [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will + automatically be set to [`DPMSolverMultistepScheduler`]. force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of `stabilityai/stable-diffusion-xl-base-1-0`. @@ -453,9 +451,9 @@ def encode_prompt( Editing prompt(s) to be encoded. If not defined and 'enable_edit_guidance' is True, one has to pass `editing_prompt_embeds` instead. editing_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from `editing_prompt` input - argument. + Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from + `editing_prompt` input argument. editing_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated edit pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled editing_pooled_prompt_embeds will be generated from `editing_prompt` @@ -833,8 +831,9 @@ def __call__( **kwargs, ): r""" - The call function to the pipeline for editing. 
The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`] - method has to be called beforehand. Edits will always be performed for the last inverted image(s). + The call function to the pipeline for editing. The + [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`] method has to be called beforehand. Edits + will always be performed for the last inverted image(s). Args: denoising_end (`float`, *optional*): @@ -892,11 +891,11 @@ def __call__( section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). editing_prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. The image is reconstructed by setting - `editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`. + `editing_prompt = None`. Guidance direction of prompt should be specified via + `reverse_editing_direction`. editing_prompt_embeddings (`torch.Tensor`, *optional*): - Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input - argument. + Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input argument. editing_pooled_prompt_embeddings (`torch.Tensor`, *optional*): Pre-generated pooled edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input @@ -904,35 +903,36 @@ def __call__( reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`): Whether the corresponding prompt in `editing_prompt` should be increased or decreased. edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5): - Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`. - `edit_guidance_scale` is defined as `s_e` of equation 12 of - [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + Guidance scale for guiding the image generation. If provided as list values should correspond to + `editing_prompt`. `edit_guidance_scale` is defined as `s_e` of equation 12 of [LEDITS++ + Paper](https://arxiv.org/abs/2301.12247). edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10): Number of diffusion steps (for each prompt) for which guidance is not applied. edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`): Number of diffusion steps (for each prompt) after which guidance is no longer applied. edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9): Masking threshold of guidance. Threshold should be proportional to the image region that is modified. - 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ + Paper](https://arxiv.org/abs/2301.12247). sem_guidance (`List[torch.Tensor]`, *optional*): List of pre-generated guidance vectors to be applied at generation. Length of the list has to correspond to `num_inference_steps`. use_cross_attn_mask: Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask - is set to true. 
Cross-attention masks are defined as 'M^1' of equation 12 of - [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). + is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of [LEDITS++ + paper](https://arxiv.org/pdf/2311.16711.pdf). use_intersect_mask: - Whether the masking term is calculated as intersection of cross-attention masks and masks derived - from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise - estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). + Whether the masking term is calculated as intersection of cross-attention masks and masks derived from + the noise estimate. Cross-attention masks are defined as 'M^1' and masks derived from the noise estimate + are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). user_mask: - User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit - masks do not meet user preferences. + User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s + implicit masks do not meet user preferences. attn_store_steps: Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes. store_averaged_over_steps: - Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. - If False, attention maps for each step are stores separately. Just for visualization purposes. + Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. If + False, attention maps for each step are stored separately. Just for visualization purposes. clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. @@ -950,8 +950,8 @@ def __call__( Returns: [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, - otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. + [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is a list with the generated images. """ if self.inversion_steps is None: raise ValueError( @@ -1444,9 +1444,9 @@ def invert( cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): r""" - The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). - If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) - will be performed instead. + The function of the pipeline for image inversion as described by the [LEDITS++ + Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the + inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead. Args: image (`PipelineImageInput`): @@ -1470,8 +1470,8 @@ def invert( Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values will lead to stronger changes to the input image. `skip` has to be between `0` and `1`.
generator (`torch.Generator`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - inversion deterministic. + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make inversion + deterministic. crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting @@ -1486,8 +1486,8 @@ def invert( [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). Returns: - [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: - Output will contain the resized input image(s) and respective VAE reconstruction(s). + [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: Output will contain the resized input image(s) + and respective VAE reconstruction(s). """ # Reset attn processor, we do not want to store attn maps during inversion diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_output.py b/src/diffusers/pipelines/ledits_pp/pipeline_output.py index b90005c97c4a..756be82b0069 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_output.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_output.py @@ -35,8 +35,8 @@ class LEditsPPInversionPipelineOutput(BaseOutput): List of the cropped and resized input images as PIL images of length `batch_size` or NumPy array of shape ` (batch_size, height, width, num_channels)`. vae_reconstruction_images (`List[PIL.Image.Image]` or `np.ndarray`) - List of VAE reconstruction of all input images as PIL images of length `batch_size` or NumPy array of shape ` - (batch_size, height, width, num_channels)`. + List of VAE reconstruction of all input images as PIL images of length `batch_size` or NumPy array of shape + ` (batch_size, height, width, num_channels)`. """ images: Union[List[PIL.Image.Image], np.ndarray] diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index 507088991a5e..b9d1f17811ae 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -61,6 +61,7 @@ ... PIAPipeline, ... ) >>> from diffusers.utils import export_to_gif, load_image + >>> adapter = MotionAdapter.from_pretrained("../checkpoints/pia-diffusers") >>> pipe = PIAPipeline.from_pretrained("SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter) >>> pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) @@ -788,7 +789,8 @@ def __call__( The input image to be used for video generation. prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - strength (`float`, *optional*, defaults to 1.0): Indicates extent to transform the reference `image`. Must be between 0 and 1. + strength (`float`, *optional*, defaults to 1.0): + Indicates extent to transform the reference `image`. Must be between 0 and 1. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated video. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -825,16 +827,15 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. 
ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. motion_scale: (`int`, *optional*, defaults to 0): - Parameter that controls the amount and type of motion that is added to the image. Increasing the value increases the amount of motion, while specific - ranges of values control the type of motion that is added. Must be between 0 and 8. - Set between 0-2 to only increase the amount of motion. - Set between 3-5 to create looping motion. - Set between 6-8 to perform motion with image style transfer. + Parameter that controls the amount and type of motion that is added to the image. Increasing the value + increases the amount of motion, while specific ranges of values control the type of motion that is + added. Must be between 0 and 8. Set between 0-2 to only increase the amount of motion. Set between 3-5 + to create looping motion. Set between 6-8 to perform motion with image style transfer. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. @@ -861,8 +862,8 @@ def __call__( Returns: [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is - returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is returned, otherwise a + `tuple` is returned where the first element is a list with the generated frames. """ # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 341360d4f7eb..b042caf76b84 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -539,7 +539,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P allowed by Git. custom_revision (`str`, *optional*): The specific model version to use. It can be a branch name, a tag name, or a commit id similar to - `revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers version. + `revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers + version. mirror (`str`, *optional*): Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not guarantee the timeliness or safety of the source, and you should refer to the mirror site for more @@ -1713,8 +1714,8 @@ def disable_freeu(self): def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): """ - Enables fused QKV projections. 
For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py index e7213a38bcad..608aa4eb1905 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py @@ -186,8 +186,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py index a05fb9001c0e..305bdacd1c77 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py @@ -332,8 +332,8 @@ def __call__( argument. negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input - argument. + weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` + input argument. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py index 07afdedac446..167a02da4d96 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py @@ -31,7 +31,10 @@ ```py >>> import torch >>> from diffusers import StableCascadeCombinedPipeline - >>> pipe = StableCascadeCombinedPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16) + + >>> pipe = StableCascadeCombinedPipeline.from_pretrained( + ... "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16 + ... 
) >>> pipe.enable_model_cpu_offload() >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" >>> images = pipe(prompt=prompt) diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py index 24ccc4b882e9..55fb4c28f6dd 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py @@ -80,7 +80,8 @@ class StableCascadePriorPipeline(DiffusionPipeline): prior ([`StableCascadeUNet`]): The Stable Cascade prior to approximate the image embedding from the text and/or image embedding. text_encoder ([`CLIPTextModelWithProjection`]): - Frozen text-encoder ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)). + Frozen text-encoder + ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)). feature_extractor ([`~transformers.CLIPImageProcessor`]): Model that extracts features from generated images to be used as inputs for the `image_encoder`. image_encoder ([`CLIPVisionModelWithProjection`]): @@ -420,11 +421,11 @@ def __call__( argument. negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input - argument. + weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` + input argument. image_embeds (`torch.FloatTensor`, *optional*): - Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. - If not provided, image embeddings will be generated from `image` input argument if existing. + Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. If + not provided, image embeddings will be generated from `image` input argument if existing. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): @@ -452,9 +453,9 @@ def __call__( Examples: Returns: - [`StableCascadePriorPipelineOutput`] or `tuple` [`StableCascadePriorPipelineOutput`] if - `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the - generated image embeddings. + [`StableCascadePriorPipelineOutput`] or `tuple` [`StableCascadePriorPipelineOutput`] if `return_dict` is + True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image + embeddings. """ # 0. Define commonly used variables diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 9e4e6c186ffa..51bed94ef5b2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -85,8 +85,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. 
If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -799,10 +799,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index b43e0eb2abcd..01bac84aa78a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -125,8 +125,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -895,10 +895,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. 
output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 221d5c2cfd3f..828f14abced6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -189,8 +189,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -1020,11 +1020,12 @@ def __call__( width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. padding_mask_crop (`int`, *optional*, defaults to `None`): - The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If - `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and - contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on - the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large - and contain information inreleant for inpainging, such as background. + The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to + image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region + with the same aspect ration of the image and contains all masked area, and then expand that area based + on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before + resizing to the original image size for inpainting. This is useful when the masked area is small while + the image is large and contain information inreleant for inpainging, such as background. strength (`float`, *optional*, defaults to 1.0): Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a starting point and more noise is added the higher the `strength`. The number of denoising steps depends @@ -1064,10 +1065,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. 
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index c7c05feaf013..f4643cc0deee 100644 --- a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -90,8 +90,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -771,10 +771,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py index feda710e0049..ae8b3aae1269 100644 --- a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -671,10 +671,10 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. 
It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 776696e9d486..860f0741b75f 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -117,8 +117,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -917,10 +917,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index fd4c412f48cb..135293938e6a 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -134,8 +134,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. 
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -1065,10 +1065,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index c25628c22c7b..53b125923b83 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -279,8 +279,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -1253,11 +1253,12 @@ def __call__( [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and checkpoints that are not specifically fine-tuned on low resolutions. padding_mask_crop (`int`, *optional*, defaults to `None`): - The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If - `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and - contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on - the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large - and contain information inreleant for inpainging, such as background. + The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to + image and mask_image. 
If `padding_mask_crop` is not `None`, it will first find a rectangular region + with the same aspect ration of the image and contains all masked area, and then expand that area based + on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before + resizing to the original image size for inpainting. This is useful when the masked area is small while + the image is large and contain information inreleant for inpainging, such as background. strength (`float`, *optional*, defaults to 0.9999): Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the @@ -1317,10 +1318,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. eta (`float`, *optional*, defaults to 0.0): diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index 1342fe429145..ae4e12642242 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -37,10 +37,14 @@ >>> from diffusers import StableVideoDiffusionPipeline >>> from diffusers.utils import load_image, export_to_video - >>> pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") + >>> pipe = StableVideoDiffusionPipeline.from_pretrained( + ... "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16" + ... ) >>> pipe.to("cuda") - >>> image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg") + >>> image = load_image( + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg" + ... ) >>> image = image.resize((1024, 576)) >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] @@ -86,8 +90,8 @@ class StableVideoDiffusionPipelineOutput(BaseOutput): Args: frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.FloatTensor`]): - List of denoised PIL images of length `batch_size` or numpy array or torch tensor - of shape `(batch_size, num_frames, height, width, num_channels)`. 
+ List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size, + num_frames, height, width, num_channels)`. """ frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor] @@ -104,7 +108,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline): vae ([`AutoencoderKLTemporalDecoder`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. image_encoder ([`~transformers.CLIPVisionModelWithProjection`]): - Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)). + Frozen CLIP image-encoder + ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)). unet ([`UNetSpatioTemporalConditionModel`]): A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents. scheduler ([`EulerDiscreteScheduler`]): @@ -357,14 +362,15 @@ def __call__( Args: image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): - Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`. + Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, + 1]`. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. num_frames (`int`, *optional*): - The number of video frames to generate. Defaults to `self.unet.config.num_frames` - (14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`). + The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for + `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`). num_inference_steps (`int`, *optional*, defaults to 25): The number of denoising steps. More denoising steps usually lead to a higher quality video at the expense of slower inference. This parameter is modulated by `strength`. @@ -373,16 +379,18 @@ def __call__( max_guidance_scale (`float`, *optional*, defaults to 3.0): The maximum guidance scale. Used for the classifier free guidance with last frame. fps (`int`, *optional*, defaults to 7): - Frames per second. The rate at which the generated images shall be exported to a video after generation. - Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training. + Frames per second. The rate at which the generated images shall be exported to a video after + generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training. motion_bucket_id (`int`, *optional*, defaults to 127): Used for conditioning the amount of motion for the generation. The higher the number the more motion will be in the video. noise_aug_strength (`float`, *optional*, defaults to 0.02): - The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion. + The amount of noise added to the init image, the higher it is the less the video will look like the + init image. Increase it for more motion. decode_chunk_size (`int`, *optional*): - The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the expense of more memory usage. 
By default, the decoder decodes all frames at once for maximal - quality. For lower memory usage, reduce `decode_chunk_size`. + The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the + expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality. + For lower memory usage, reduce `decode_chunk_size`. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of videos to generate per prompt. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): @@ -398,7 +406,8 @@ def __call__( A function that is called at the end of each denoising step during inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. - `callback_kwargs` will include a list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -411,8 +420,9 @@ def __call__( Returns: [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned, - otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) is returned. + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is + returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) + is returned. """ # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index 0b55bb38b5eb..225943d1dbf1 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -134,8 +134,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 4e0cc61f5c1d..181b0ce5d561 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -150,8 +150,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. 
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -941,10 +941,10 @@ def __call__( input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py index c155386cf173..2dae5b4ead69 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py @@ -17,7 +17,8 @@ class TextToVideoSDPipelineOutput(BaseOutput): Args: frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): - List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing + denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape `(batch_size, num_frames, channels, height, width)` """ diff --git a/src/diffusers/pipelines/unidiffuser/modeling_uvit.py b/src/diffusers/pipelines/unidiffuser/modeling_uvit.py index c074b9916301..6579e272a3bf 100644 --- a/src/diffusers/pipelines/unidiffuser/modeling_uvit.py +++ b/src/diffusers/pipelines/unidiffuser/modeling_uvit.py @@ -752,7 +752,8 @@ def forward( cross_attention_kwargs (*optional*): Keyword arguments to supply to the cross attention layers, if used. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. hidden_states_is_embedding (`bool`, *optional*, defaults to `False`): Whether or not hidden_states is an embedding directly usable by the transformer. In this case we will ignore input handling (e.g. continuous, vectorized, etc.) 
and directly feed hidden_states into the diff --git a/src/diffusers/schedulers/scheduling_ddim_flax.py b/src/diffusers/schedulers/scheduling_ddim_flax.py index dc3d8455bdfe..23c71a61452a 100644 --- a/src/diffusers/schedulers/scheduling_ddim_flax.py +++ b/src/diffusers/schedulers/scheduling_ddim_flax.py @@ -85,7 +85,8 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin): trained_betas (`jnp.ndarray`, optional): option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. clip_sample (`bool`, default `True`): - option to clip predicted sample between for numerical stability. The clip range is determined by `clip_sample_range`. + option to clip predicted sample between for numerical stability. The clip range is determined by + `clip_sample_range`. clip_sample_range (`float`, default `1.0`): the maximum magnitude for sample clipping. Valid only when `clip_sample=True`. set_alpha_to_one (`bool`, default `True`): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index 3bbfc65e2ab9..34fa83d0bc95 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -166,8 +166,8 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): the sampling process. If `True`, the sigmas and time steps are determined according to a sequence of `lambda(t)`. final_sigmas_type (`str`, defaults to `"zero"`): - The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma - is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final + sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. lambda_min_clipped (`float`, defaults to `-inf`): Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the cosine (`squaredcos_cap_v2`) noise schedule. diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 7bb201de4896..da7d15fc60e3 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -108,11 +108,11 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `algorithm_type="dpmsolver++"`. algorithm_type (`str`, defaults to `dpmsolver++`): - Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++`. The - `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) - paper, and the `dpmsolver++` type implements the algorithms in the - [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or - `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion. + Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the + algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) paper, and the `dpmsolver++` type + implements the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. 
It is + recommended to use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling like in + Stable Diffusion. solver_type (`str`, defaults to `midpoint`): Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers. @@ -123,8 +123,8 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, the sigmas are determined according to a sequence of noise levels {σi}. final_sigmas_type (`str`, *optional*, defaults to `"zero"`): - The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma - is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final + sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. lambda_min_clipped (`float`, defaults to `-inf`): Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the cosine (`squaredcos_cap_v2`) noise schedule. diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index 5fea89bb887b..ce87fa3c5eae 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -62,10 +62,9 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `algorithm_type="dpmsolver++"`. algorithm_type (`str`, defaults to `dpmsolver++`): - Algorithm type for the solver; can be `dpmsolver++` or `sde-dpmsolver++`. The - `dpmsolver++` type implements the algorithms in the - [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or - `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion. + Algorithm type for the solver; can be `dpmsolver++` or `sde-dpmsolver++`. The `dpmsolver++` type implements + the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to + use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion. solver_type (`str`, defaults to `midpoint`): Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers. @@ -77,8 +76,8 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference steps, but sometimes may result in blurring. final_sigmas_type (`str`, defaults to `"zero"`): - The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma - is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final + sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. 
""" _compatibles = [] diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py index e62a486cc214..78bb3200ae1b 100644 --- a/src/diffusers/schedulers/scheduling_edm_euler.py +++ b/src/diffusers/schedulers/scheduling_edm_euler.py @@ -278,8 +278,7 @@ def step( generator (`torch.Generator`, *optional*): A random number generator. return_dict (`bool`): - Whether or not to return a [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or - tuple. + Whether or not to return a [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or tuple. Returns: [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or `tuple`: diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py index b46f6de8aedc..4012cd9db17c 100644 --- a/src/diffusers/schedulers/scheduling_sasolver.py +++ b/src/diffusers/schedulers/scheduling_sasolver.py @@ -92,19 +92,20 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): trained_betas (`np.ndarray`, *optional*): Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. predictor_order (`int`, defaults to 2): - The predictor order which can be `1` or `2` or `3` or '4'. It is recommended to use `predictor_order=2` for guided - sampling, and `predictor_order=3` for unconditional sampling. + The predictor order which can be `1` or `2` or `3` or '4'. It is recommended to use `predictor_order=2` for + guided sampling, and `predictor_order=3` for unconditional sampling. corrector_order (`int`, defaults to 2): - The corrector order which can be `1` or `2` or `3` or '4'. It is recommended to use `corrector_order=2` for guided - sampling, and `corrector_order=3` for unconditional sampling. + The corrector order which can be `1` or `2` or `3` or '4'. It is recommended to use `corrector_order=2` for + guided sampling, and `corrector_order=3` for unconditional sampling. prediction_type (`str`, defaults to `epsilon`, *optional*): Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen Video](https://imagen.research.google/video/paper.pdf) paper). tau_func (`Callable`, *optional*): - Stochasticity during the sampling. Default in init is `lambda t: 1 if t >= 200 and t <= 800 else 0`. SA-Solver - will sample from vanilla diffusion ODE if tau_func is set to `lambda t: 0`. SA-Solver will sample from vanilla - diffusion SDE if tau_func is set to `lambda t: 1`. For more details, please check https://arxiv.org/abs/2309.05019 + Stochasticity during the sampling. Default in init is `lambda t: 1 if t >= 200 and t <= 800 else 0`. + SA-Solver will sample from vanilla diffusion ODE if tau_func is set to `lambda t: 0`. SA-Solver will sample + from vanilla diffusion SDE if tau_func is set to `lambda t: 1`. For more details, please check + https://arxiv.org/abs/2309.05019 thresholding (`bool`, defaults to `False`): Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such as Stable Diffusion. @@ -114,8 +115,8 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin): The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `algorithm_type="dpmsolver++"`. algorithm_type (`str`, defaults to `data_prediction`): - Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. 
It is recommended to use `data_prediction` - with `solver_order=2` for guided sampling like in Stable Diffusion. + Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use + `data_prediction` with `solver_order=2` for guided sampling like in Stable Diffusion. lower_order_final (`bool`, defaults to `True`): Whether to use lower-order solvers in the final steps. Default = True. use_karras_sigmas (`bool`, *optional*, defaults to `False`): @@ -402,14 +403,14 @@ def convert_model_output( **kwargs, ) -> torch.FloatTensor: """ - Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs. Noise_prediction is - designed to discretize an integral of the noise prediction model, and data_prediction is designed to discretize an - integral of the data prediction model. + Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs. + Noise_prediction is designed to discretize an integral of the noise prediction model, and data_prediction is + designed to discretize an integral of the data prediction model. - The algorithm and model type are decoupled. You can use either data_prediction or noise_prediction for both noise - prediction and data prediction models. + The algorithm and model type are decoupled. You can use either data_prediction or noise_prediction for both + noise prediction and data prediction models. diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py index 7eb01b382673..9d1c0754a4c1 100644 --- a/src/diffusers/schedulers/scheduling_tcd.py +++ b/src/diffusers/schedulers/scheduling_tcd.py @@ -132,8 +132,8 @@ def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor: class TCDScheduler(SchedulerMixin, ConfigMixin): """ - `TCDScheduler` incorporates the `Strategic Stochastic Sampling` introduced by the paper `Trajectory Consistency Distillation`, - extending the original Multistep Consistency Sampling to enable unrestricted trajectory traversal. + `TCDScheduler` incorporates the `Strategic Stochastic Sampling` introduced by the paper `Trajectory Consistency + Distillation`, extending the original Multistep Consistency Sampling to enable unrestricted trajectory traversal. This code is based on the official repo of TCD(https://github.com/jabir-zheng/TCD). @@ -540,8 +540,9 @@ def step( sample (`torch.FloatTensor`): A current instance of a sample created by the diffusion process. eta (`float`): - A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every step. - When eta = 0, it represents deterministic sampling, whereas eta = 1 indicates full stochastic sampling. + A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every + step. When eta = 0, it represents deterministic sampling, whereas eta = 1 indicates full stochastic + sampling. generator (`torch.Generator`, *optional*): A random number generator. 
return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py index a4c704a91c05..211029f7b9c3 100644 --- a/src/diffusers/utils/dynamic_modules_utils.py +++ b/src/diffusers/utils/dynamic_modules_utils.py @@ -246,8 +246,8 @@ def get_cached_module_file( - You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private - or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models). + You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private or + [gated models](https://huggingface.co/docs/hub/models-gated#gated-models). @@ -422,8 +422,8 @@ def get_class_from_dynamic_module( - You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private - or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models). + You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private or + [gated models](https://huggingface.co/docs/hub/models-gated#gated-models). diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py index e554b42ddd31..d70ee53aaa41 100644 --- a/src/diffusers/utils/hub_utils.py +++ b/src/diffusers/utils/hub_utils.py @@ -112,7 +112,8 @@ def load_or_create_model_card( repo_id_or_path (`str`): The repo id (e.g., "runwayml/stable-diffusion-v1-5") or local path where to look for the model card. token (`str`, *optional*): - Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more details. + Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more + details. is_pipeline (`bool`): Boolean to indicate if we're adding tag to a [`DiffusionPipeline`]. from_training: (`bool`): Boolean flag to denote if the model card is being created from a training script. diff --git a/src/diffusers/utils/loading_utils.py b/src/diffusers/utils/loading_utils.py index 18f6ead64c4e..aa087e981731 100644 --- a/src/diffusers/utils/loading_utils.py +++ b/src/diffusers/utils/loading_utils.py @@ -16,8 +16,8 @@ def load_image( image (`str` or `PIL.Image.Image`): The image to convert to the PIL Image format. convert_method (Callable[[PIL.Image.Image], PIL.Image.Image], optional): - A conversion method to apply to the image after loading it. - When set to `None` the image will be converted "RGB". + A conversion method to apply to the image after loading it. When set to `None` the image will be converted + "RGB". 
Returns: `PIL.Image.Image`: diff --git a/src/diffusers/utils/state_dict_utils.py b/src/diffusers/utils/state_dict_utils.py index c4566636da30..8d41aa399c7d 100644 --- a/src/diffusers/utils/state_dict_utils.py +++ b/src/diffusers/utils/state_dict_utils.py @@ -247,8 +247,8 @@ def convert_unet_state_dict_to_peft(state_dict): def convert_all_state_dict_to_peft(state_dict): r""" - Attempts to first `convert_state_dict_to_peft`, and if it doesn't detect `lora_linear_layer` - for a valid `DIFFUSERS` LoRA for example, attempts to exclusively convert the Unet `convert_unet_state_dict_to_peft` + Attempts to first `convert_state_dict_to_peft`, and if it doesn't detect `lora_linear_layer` for a valid + `DIFFUSERS` LoRA for example, attempts to exclusively convert the Unet `convert_unet_state_dict_to_peft` """ try: peft_dict = convert_state_dict_to_peft(state_dict) diff --git a/tests/fixtures/custom_pipeline/pipeline.py b/tests/fixtures/custom_pipeline/pipeline.py index 601f51b1263e..41d69706b956 100644 --- a/tests/fixtures/custom_pipeline/pipeline.py +++ b/tests/fixtures/custom_pipeline/pipeline.py @@ -66,9 +66,8 @@ def __call__( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if - `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the - generated images. + [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is + True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. """ # Sample gaussian noise to begin loop diff --git a/tests/lora/test_lora_layers_peft.py b/tests/lora/test_lora_layers_peft.py index 67d28fe19e7e..1ee64e4a8fee 100644 --- a/tests/lora/test_lora_layers_peft.py +++ b/tests/lora/test_lora_layers_peft.py @@ -202,8 +202,7 @@ def test_simple_inference(self): def test_simple_inference_with_text_lora(self): """ - Tests a simple inference with lora attached on the text encoder - and makes sure it works as expected + Tests a simple inference with lora attached on the text encoder and makes sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) @@ -233,8 +232,8 @@ def test_simple_inference_with_text_lora(self): def test_simple_inference_with_text_lora_and_scale(self): """ - Tests a simple inference with lora attached on the text encoder + scale argument - and makes sure it works as expected + Tests a simple inference with lora attached on the text encoder + scale argument and makes sure it works as + expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) @@ -280,8 +279,8 @@ def test_simple_inference_with_text_lora_and_scale(self): def test_simple_inference_with_text_lora_fused(self): """ - Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model - and makes sure it works as expected + Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model and + makes sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) @@ -322,8 +321,8 @@ def test_simple_inference_with_text_lora_fused(self): def test_simple_inference_with_text_lora_unloaded(self): """ 
- Tests a simple inference with lora attached to text encoder, then unloads the lora weights - and makes sure it works as expected + Tests a simple inference with lora attached to text encoder, then unloads the lora weights and makes sure it + works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) @@ -554,8 +553,8 @@ def test_simple_inference_with_text_unet_lora_save_load(self): def test_simple_inference_with_text_unet_lora_and_scale(self): """ - Tests a simple inference with lora attached on the text encoder + Unet + scale argument - and makes sure it works as expected + Tests a simple inference with lora attached on the text encoder + Unet + scale argument and makes sure it works + as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -608,8 +607,8 @@ def test_simple_inference_with_text_unet_lora_and_scale(self): def test_simple_inference_with_text_lora_unet_fused(self): """ - Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model - and makes sure it works as expected - with unet + Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model and + makes sure it works as expected - with unet """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -654,8 +653,8 @@ def test_simple_inference_with_text_lora_unet_fused(self): def test_simple_inference_with_text_unet_lora_unloaded(self): """ - Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights - and makes sure it works as expected + Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights and makes + sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -701,8 +700,8 @@ def test_simple_inference_with_text_unet_lora_unloaded(self): def test_simple_inference_with_text_unet_lora_unfused(self): """ - Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights - and makes sure it works as expected + Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights and makes + sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -751,8 +750,7 @@ def test_simple_inference_with_text_unet_lora_unfused(self): def test_simple_inference_with_text_unet_multi_adapter(self): """ - Tests a simple inference with lora attached to text encoder and unet, attaches - multiple adapters and set them + Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters and set them """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -819,8 +817,8 @@ def test_simple_inference_with_text_unet_multi_adapter(self): def test_simple_inference_with_text_unet_multi_adapter_delete_adapter(self): """ - Tests a simple inference with lora attached to text encoder and unet, attaches - multiple adapters and set/delete them + Tests a simple inference with lora attached to text 
encoder and unet, attaches multiple adapters and set/delete + them """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -909,8 +907,7 @@ def test_simple_inference_with_text_unet_multi_adapter_delete_adapter(self): def test_simple_inference_with_text_unet_multi_adapter_weighted(self): """ - Tests a simple inference with lora attached to text encoder and unet, attaches - multiple adapters and set them + Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters and set them """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -1019,8 +1016,7 @@ def test_lora_fuse_nan(self): def test_get_adapters(self): """ - Tests a simple usecase where we attach multiple adapters and check if the results - are the expected results + Tests a simple usecase where we attach multiple adapters and check if the results are the expected results """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -1046,8 +1042,7 @@ def test_get_adapters(self): def test_get_list_adapters(self): """ - Tests a simple usecase where we attach multiple adapters and check if the results - are the expected results + Tests a simple usecase where we attach multiple adapters and check if the results are the expected results """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -1084,8 +1079,8 @@ def test_get_list_adapters(self): @require_peft_version_greater(peft_version="0.6.2") def test_simple_inference_with_text_lora_unet_fused_multi(self): """ - Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model - and makes sure it works as expected - with unet and multi-adapter case + Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model and + makes sure it works as expected - with unet and multi-adapter case """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -1146,8 +1141,8 @@ def test_simple_inference_with_text_lora_unet_fused_multi(self): @unittest.skip("This is failing for now - need to investigate") def test_simple_inference_with_text_unet_lora_unfused_torch_compile(self): """ - Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights - and makes sure it works as expected + Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights and makes + sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -2211,8 +2206,8 @@ def test_sdxl_1_0_lora_with_sequential_cpu_offloading(self): def test_sd_load_civitai_empty_network_alpha(self): """ - This test simply checks that loading a LoRA with an empty network alpha works fine - See: https://github.com/huggingface/diffusers/issues/5606 + This test simply checks that loading a LoRA with an empty network alpha works fine See: + https://github.com/huggingface/diffusers/issues/5606 """ pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("cuda") 
pipeline.enable_sequential_cpu_offload() diff --git a/tests/others/test_check_copies.py b/tests/others/test_check_copies.py index 6e1c8fcfa54b..8d9b79e01a1f 100644 --- a/tests/others/test_check_copies.py +++ b/tests/others/test_check_copies.py @@ -40,8 +40,7 @@ `pred_original_sample` can be used to preview progress or for guidance. \""" - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.FloatTensor pred_original_sample: Optional[torch.FloatTensor] = None """ diff --git a/tests/others/test_check_dummies.py b/tests/others/test_check_dummies.py index 1890ffaecd8d..f29958d436e5 100644 --- a/tests/others/test_check_dummies.py +++ b/tests/others/test_check_dummies.py @@ -81,20 +81,18 @@ class FakeClass(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, 'torch') - @classmethod - def from_config(cls, *args, **kwargs): + @classmethod def from_config(cls, *args, **kwargs): requires_backends(cls, 'torch') - @classmethod - def from_pretrained(cls, *args, **kwargs): + @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, 'torch') """ dummy_class = create_dummy_object("FakeClass", "'torch'") self.assertEqual(dummy_class, expected_dummy_class) def test_create_dummy_files(self): - expected_dummy_pytorch_file = """# This file is autogenerated by the command `make fix-copies`, do not edit. -from ..utils import DummyObject, requires_backends + expected_dummy_pytorch_file = """# This file is autogenerated by the command `make fix-copies`, do not edit. from ..utils import DummyObject, +requires_backends CONSTANT = None @@ -110,12 +108,10 @@ class FakeClass(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) - @classmethod - def from_config(cls, *args, **kwargs): + @classmethod def from_config(cls, *args, **kwargs): requires_backends(cls, ["torch"]) - @classmethod - def from_pretrained(cls, *args, **kwargs): + @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) """ dummy_files = create_dummy_files({"torch": ["CONSTANT", "function", "FakeClass"]}) diff --git a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py index f1b61c3364f0..bdf5fd3f15f6 100644 --- a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py +++ b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py @@ -143,8 +143,7 @@ def get_dummy_components(self, adapter_type, time_cond_proj_dim=None): def get_dummy_components_with_full_downscaling(self, adapter_type): """Get dummy components with x8 VAE downscaling and 4 UNet down blocks. - These dummy components are intended to fully-exercise the T2I-Adapter - downscaling behavior. + These dummy components are intended to fully-exercise the T2I-Adapter downscaling behavior. """ torch.manual_seed(0) unet = UNet2DConditionModel( @@ -277,12 +276,10 @@ def test_inference_batch_single_identical(self): ) def test_multiple_image_dimensions(self, dim): """Test that the T2I-Adapter pipeline supports any input dimension that - is divisible by the adapter's `downscale_factor`. This test was added in - response to an issue where the T2I Adapter's downscaling padding - behavior did not match the UNet's behavior. + is divisible by the adapter's `downscale_factor`. 
This test was added in response to an issue where the T2I + Adapter's downscaling padding behavior did not match the UNet's behavior. - Note that we have selected `dim` values to produce odd resolutions at - each downscaling level. + Note that we have selected `dim` values to produce odd resolutions at each downscaling level. """ components = self.get_dummy_components_with_full_downscaling() sd_pipe = StableDiffusionAdapterPipeline(**components) diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py index 0bcffeb078b8..2d2cbad8bf87 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py @@ -167,8 +167,7 @@ def get_dummy_components(self, adapter_type="full_adapter_xl", time_cond_proj_di def get_dummy_components_with_full_downscaling(self, adapter_type="full_adapter_xl"): """Get dummy components with x8 VAE downscaling and 3 UNet down blocks. - These dummy components are intended to fully-exercise the T2I-Adapter - downscaling behavior. + These dummy components are intended to fully-exercise the T2I-Adapter downscaling behavior. """ torch.manual_seed(0) unet = UNet2DConditionModel( @@ -321,12 +320,10 @@ def test_stable_diffusion_adapter_default_case(self): ) def test_multiple_image_dimensions(self, dim): """Test that the T2I-Adapter pipeline supports any input dimension that - is divisible by the adapter's `downscale_factor`. This test was added in - response to an issue where the T2I Adapter's downscaling padding - behavior did not match the UNet's behavior. + is divisible by the adapter's `downscale_factor`. This test was added in response to an issue where the T2I + Adapter's downscaling padding behavior did not match the UNet's behavior. - Note that we have selected `dim` values to produce odd resolutions at - each downscaling level. + Note that we have selected `dim` values to produce odd resolutions at each downscaling level. """ components = self.get_dummy_components_with_full_downscaling() sd_pipe = StableDiffusionXLAdapterPipeline(**components) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 2b29e3ae9eeb..e249fa81ca0f 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -67,8 +67,8 @@ def check_same_shape(tensor_list): class SDFunctionTesterMixin: """ - This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. - It provides a set of common tests for PyTorch pipeline that inherit from StableDiffusionMixin, e.g. vae_slicing, vae_tiling, freeu, etc. + This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. It provides a set of + common tests for PyTorch pipeline that inherit from StableDiffusionMixin, e.g. vae_slicing, vae_tiling, freeu, etc. """ def test_vae_slicing(self): @@ -209,8 +209,8 @@ def test_fused_qkv_projections(self): class IPAdapterTesterMixin: """ - This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. - It provides a set of common tests for pipelines that support IP Adapters. + This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. It provides a set of + common tests for pipelines that support IP Adapters. 
""" def test_pipeline_signature(self): @@ -350,9 +350,8 @@ def test_ip_adapter_cfg(self, expected_max_diff: float = 1e-4): class PipelineLatentTesterMixin: """ - This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. - It provides a set of common tests for PyTorch pipeline that has vae, e.g. - equivalence of different input and output types, etc. + This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. It provides a set of + common tests for PyTorch pipeline that has vae, e.g. equivalence of different input and output types, etc. """ @property @@ -508,9 +507,8 @@ def test_multi_vae(self): @require_torch class PipelineKarrasSchedulerTesterMixin: """ - This mixin is designed to be used with unittest.TestCase classes. - It provides a set of common tests for each PyTorch pipeline that makes use of KarrasDiffusionSchedulers - equivalence of dict and tuple outputs, etc. + This mixin is designed to be used with unittest.TestCase classes. It provides a set of common tests for each + PyTorch pipeline that makes use of KarrasDiffusionSchedulers equivalence of dict and tuple outputs, etc. """ def test_karras_schedulers_shape(self): @@ -548,9 +546,8 @@ def test_karras_schedulers_shape(self): @require_torch class PipelineTesterMixin: """ - This mixin is designed to be used with unittest.TestCase classes. - It provides a set of common tests for each PyTorch pipeline, e.g. saving and loading the pipeline, - equivalence of dict and tuple outputs, etc. + This mixin is designed to be used with unittest.TestCase classes. It provides a set of common tests for each + PyTorch pipeline, e.g. saving and loading the pipeline, equivalence of dict and tuple outputs, etc. """ # Canonical parameters that are passed to `__call__` regardless diff --git a/tests/pipelines/test_pipelines_onnx_common.py b/tests/pipelines/test_pipelines_onnx_common.py index 575ecd007531..69889efa3743 100644 --- a/tests/pipelines/test_pipelines_onnx_common.py +++ b/tests/pipelines/test_pipelines_onnx_common.py @@ -4,9 +4,8 @@ @require_onnxruntime class OnnxPipelineTesterMixin: """ - This mixin is designed to be used with unittest.TestCase classes. - It provides a set of common tests for each ONNXRuntime pipeline, e.g. saving and loading the pipeline, - equivalence of dict and tuple outputs, etc. + This mixin is designed to be used with unittest.TestCase classes. It provides a set of common tests for each + ONNXRuntime pipeline, e.g. saving and loading the pipeline, equivalence of dict and tuple outputs, etc. 
""" pass From 15dd7c2a735634dfec6ff50764fb5f9e8d5f7d1c Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 14 Mar 2024 16:18:59 +0530 Subject: [PATCH 02/10] debugging saga --- tests/others/test_check_dummies.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/others/test_check_dummies.py b/tests/others/test_check_dummies.py index f29958d436e5..8fb06915e5c4 100644 --- a/tests/others/test_check_dummies.py +++ b/tests/others/test_check_dummies.py @@ -115,4 +115,6 @@ def __init__(self, *args, **kwargs): requires_backends(cls, ["torch"]) """ dummy_files = create_dummy_files({"torch": ["CONSTANT", "function", "FakeClass"]}) + print(f"Dummy file: {dummy_files['torch']}") + print(f"expected_dummy_pytorch_file: {expected_dummy_pytorch_file}") self.assertEqual(dummy_files["torch"], expected_dummy_pytorch_file) From 52aa0b0f38e833f57109884b524af8b5e01b730c Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 14 Mar 2024 16:20:58 +0530 Subject: [PATCH 03/10] fix: message --- tests/others/test_check_dummies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/others/test_check_dummies.py b/tests/others/test_check_dummies.py index 8fb06915e5c4..368119f6faab 100644 --- a/tests/others/test_check_dummies.py +++ b/tests/others/test_check_dummies.py @@ -91,8 +91,8 @@ def __init__(self, *args, **kwargs): self.assertEqual(dummy_class, expected_dummy_class) def test_create_dummy_files(self): - expected_dummy_pytorch_file = """# This file is autogenerated by the command `make fix-copies`, do not edit. from ..utils import DummyObject, -requires_backends + expected_dummy_pytorch_file = """# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends CONSTANT = None From 09ae822220504bf9de3e5c0f81a29ce05684d63e Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 14 Mar 2024 16:24:18 +0530 Subject: [PATCH 04/10] checking --- tests/others/test_check_dummies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/others/test_check_dummies.py b/tests/others/test_check_dummies.py index 368119f6faab..518e6a4668b8 100644 --- a/tests/others/test_check_dummies.py +++ b/tests/others/test_check_dummies.py @@ -91,8 +91,8 @@ def __init__(self, *args, **kwargs): self.assertEqual(dummy_class, expected_dummy_class) def test_create_dummy_files(self): - expected_dummy_pytorch_file = """# This file is autogenerated by the command `make fix-copies`, do not edit. -from ..utils import DummyObject, requires_backends + expected_dummy_pytorch_file = """# This file is autogenerated by the command `make fix-copies`, do not edit.\nfrom ..utils import DummyObject, +requires_backends CONSTANT = None From b1a4e4533ad7da7f19d6cc7190275c985283e774 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 14 Mar 2024 16:32:53 +0530 Subject: [PATCH 05/10] should be fixed. 
--- Makefile | 4 +- tests/lora/test_lora_layers_peft.py | 59 ++++++++++--------- tests/others/test_check_copies.py | 3 +- tests/others/test_check_dummies.py | 18 +++--- .../test_stable_diffusion_adapter.py | 11 ++-- .../test_stable_diffusion_xl_adapter.py | 11 ++-- tests/pipelines/test_pipelines_common.py | 23 ++++---- tests/pipelines/test_pipelines_onnx_common.py | 5 +- 8 files changed, 76 insertions(+), 58 deletions(-) diff --git a/Makefile b/Makefile index e2618f4d8925..9af2e8b1a5c9 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ repo-consistency: quality: ruff check $(check_dirs) setup.py ruff format --check $(check_dirs) setup.py - doc-builder style src/diffusers tests docs/source --max_len 119 --check_only + doc-builder style src/diffusers docs/source --max_len 119 --check_only python utils/check_doc_toc.py # Format source code automatically and check is there are any problems left that need manual fixing @@ -56,7 +56,7 @@ extra_style_checks: style: ruff check $(check_dirs) setup.py --fix ruff format $(check_dirs) setup.py - doc-builder style src/diffusers tests docs/source --max_len 119 + doc-builder style src/diffusers docs/source --max_len 119 ${MAKE} autogenerate_code ${MAKE} extra_style_checks diff --git a/tests/lora/test_lora_layers_peft.py b/tests/lora/test_lora_layers_peft.py index 1ee64e4a8fee..67d28fe19e7e 100644 --- a/tests/lora/test_lora_layers_peft.py +++ b/tests/lora/test_lora_layers_peft.py @@ -202,7 +202,8 @@ def test_simple_inference(self): def test_simple_inference_with_text_lora(self): """ - Tests a simple inference with lora attached on the text encoder and makes sure it works as expected + Tests a simple inference with lora attached on the text encoder + and makes sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) @@ -232,8 +233,8 @@ def test_simple_inference_with_text_lora(self): def test_simple_inference_with_text_lora_and_scale(self): """ - Tests a simple inference with lora attached on the text encoder + scale argument and makes sure it works as - expected + Tests a simple inference with lora attached on the text encoder + scale argument + and makes sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) @@ -279,8 +280,8 @@ def test_simple_inference_with_text_lora_and_scale(self): def test_simple_inference_with_text_lora_fused(self): """ - Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model and - makes sure it works as expected + Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model + and makes sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) @@ -321,8 +322,8 @@ def test_simple_inference_with_text_lora_fused(self): def test_simple_inference_with_text_lora_unloaded(self): """ - Tests a simple inference with lora attached to text encoder, then unloads the lora weights and makes sure it - works as expected + Tests a simple inference with lora attached to text encoder, then unloads the lora weights + and makes sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) @@ -553,8 +554,8 @@ def test_simple_inference_with_text_unet_lora_save_load(self): def 
test_simple_inference_with_text_unet_lora_and_scale(self): """ - Tests a simple inference with lora attached on the text encoder + Unet + scale argument and makes sure it works - as expected + Tests a simple inference with lora attached on the text encoder + Unet + scale argument + and makes sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -607,8 +608,8 @@ def test_simple_inference_with_text_unet_lora_and_scale(self): def test_simple_inference_with_text_lora_unet_fused(self): """ - Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model and - makes sure it works as expected - with unet + Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model + and makes sure it works as expected - with unet """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -653,8 +654,8 @@ def test_simple_inference_with_text_lora_unet_fused(self): def test_simple_inference_with_text_unet_lora_unloaded(self): """ - Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights and makes - sure it works as expected + Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights + and makes sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -700,8 +701,8 @@ def test_simple_inference_with_text_unet_lora_unloaded(self): def test_simple_inference_with_text_unet_lora_unfused(self): """ - Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights and makes - sure it works as expected + Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights + and makes sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -750,7 +751,8 @@ def test_simple_inference_with_text_unet_lora_unfused(self): def test_simple_inference_with_text_unet_multi_adapter(self): """ - Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters and set them + Tests a simple inference with lora attached to text encoder and unet, attaches + multiple adapters and set them """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -817,8 +819,8 @@ def test_simple_inference_with_text_unet_multi_adapter(self): def test_simple_inference_with_text_unet_multi_adapter_delete_adapter(self): """ - Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters and set/delete - them + Tests a simple inference with lora attached to text encoder and unet, attaches + multiple adapters and set/delete them """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -907,7 +909,8 @@ def test_simple_inference_with_text_unet_multi_adapter_delete_adapter(self): def test_simple_inference_with_text_unet_multi_adapter_weighted(self): """ - Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters 
and set them + Tests a simple inference with lora attached to text encoder and unet, attaches + multiple adapters and set them """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -1016,7 +1019,8 @@ def test_lora_fuse_nan(self): def test_get_adapters(self): """ - Tests a simple usecase where we attach multiple adapters and check if the results are the expected results + Tests a simple usecase where we attach multiple adapters and check if the results + are the expected results """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -1042,7 +1046,8 @@ def test_get_adapters(self): def test_get_list_adapters(self): """ - Tests a simple usecase where we attach multiple adapters and check if the results are the expected results + Tests a simple usecase where we attach multiple adapters and check if the results + are the expected results """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -1079,8 +1084,8 @@ def test_get_list_adapters(self): @require_peft_version_greater(peft_version="0.6.2") def test_simple_inference_with_text_lora_unet_fused_multi(self): """ - Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model and - makes sure it works as expected - with unet and multi-adapter case + Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model + and makes sure it works as expected - with unet and multi-adapter case """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -1141,8 +1146,8 @@ def test_simple_inference_with_text_lora_unet_fused_multi(self): @unittest.skip("This is failing for now - need to investigate") def test_simple_inference_with_text_unet_lora_unfused_torch_compile(self): """ - Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights and makes - sure it works as expected + Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights + and makes sure it works as expected """ for scheduler_cls in [DDIMScheduler, LCMScheduler]: components, text_lora_config, unet_lora_config = self.get_dummy_components(scheduler_cls) @@ -2206,8 +2211,8 @@ def test_sdxl_1_0_lora_with_sequential_cpu_offloading(self): def test_sd_load_civitai_empty_network_alpha(self): """ - This test simply checks that loading a LoRA with an empty network alpha works fine See: - https://github.com/huggingface/diffusers/issues/5606 + This test simply checks that loading a LoRA with an empty network alpha works fine + See: https://github.com/huggingface/diffusers/issues/5606 """ pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("cuda") pipeline.enable_sequential_cpu_offload() diff --git a/tests/others/test_check_copies.py b/tests/others/test_check_copies.py index 8d9b79e01a1f..6e1c8fcfa54b 100644 --- a/tests/others/test_check_copies.py +++ b/tests/others/test_check_copies.py @@ -40,7 +40,8 @@ `pred_original_sample` can be used to preview progress or for guidance. 
\""" - prev_sample: torch.FloatTensor pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None """ diff --git a/tests/others/test_check_dummies.py b/tests/others/test_check_dummies.py index 518e6a4668b8..1890ffaecd8d 100644 --- a/tests/others/test_check_dummies.py +++ b/tests/others/test_check_dummies.py @@ -81,18 +81,20 @@ class FakeClass(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, 'torch') - @classmethod def from_config(cls, *args, **kwargs): + @classmethod + def from_config(cls, *args, **kwargs): requires_backends(cls, 'torch') - @classmethod def from_pretrained(cls, *args, **kwargs): + @classmethod + def from_pretrained(cls, *args, **kwargs): requires_backends(cls, 'torch') """ dummy_class = create_dummy_object("FakeClass", "'torch'") self.assertEqual(dummy_class, expected_dummy_class) def test_create_dummy_files(self): - expected_dummy_pytorch_file = """# This file is autogenerated by the command `make fix-copies`, do not edit.\nfrom ..utils import DummyObject, -requires_backends + expected_dummy_pytorch_file = """# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends CONSTANT = None @@ -108,13 +110,13 @@ class FakeClass(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) - @classmethod def from_config(cls, *args, **kwargs): + @classmethod + def from_config(cls, *args, **kwargs): requires_backends(cls, ["torch"]) - @classmethod def from_pretrained(cls, *args, **kwargs): + @classmethod + def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) """ dummy_files = create_dummy_files({"torch": ["CONSTANT", "function", "FakeClass"]}) - print(f"Dummy file: {dummy_files['torch']}") - print(f"expected_dummy_pytorch_file: {expected_dummy_pytorch_file}") self.assertEqual(dummy_files["torch"], expected_dummy_pytorch_file) diff --git a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py index bdf5fd3f15f6..f1b61c3364f0 100644 --- a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py +++ b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py @@ -143,7 +143,8 @@ def get_dummy_components(self, adapter_type, time_cond_proj_dim=None): def get_dummy_components_with_full_downscaling(self, adapter_type): """Get dummy components with x8 VAE downscaling and 4 UNet down blocks. - These dummy components are intended to fully-exercise the T2I-Adapter downscaling behavior. + These dummy components are intended to fully-exercise the T2I-Adapter + downscaling behavior. """ torch.manual_seed(0) unet = UNet2DConditionModel( @@ -276,10 +277,12 @@ def test_inference_batch_single_identical(self): ) def test_multiple_image_dimensions(self, dim): """Test that the T2I-Adapter pipeline supports any input dimension that - is divisible by the adapter's `downscale_factor`. This test was added in response to an issue where the T2I - Adapter's downscaling padding behavior did not match the UNet's behavior. + is divisible by the adapter's `downscale_factor`. This test was added in + response to an issue where the T2I Adapter's downscaling padding + behavior did not match the UNet's behavior. - Note that we have selected `dim` values to produce odd resolutions at each downscaling level. 
+ Note that we have selected `dim` values to produce odd resolutions at + each downscaling level. """ components = self.get_dummy_components_with_full_downscaling() sd_pipe = StableDiffusionAdapterPipeline(**components) diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py index 2d2cbad8bf87..0bcffeb078b8 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py @@ -167,7 +167,8 @@ def get_dummy_components(self, adapter_type="full_adapter_xl", time_cond_proj_di def get_dummy_components_with_full_downscaling(self, adapter_type="full_adapter_xl"): """Get dummy components with x8 VAE downscaling and 3 UNet down blocks. - These dummy components are intended to fully-exercise the T2I-Adapter downscaling behavior. + These dummy components are intended to fully-exercise the T2I-Adapter + downscaling behavior. """ torch.manual_seed(0) unet = UNet2DConditionModel( @@ -320,10 +321,12 @@ def test_stable_diffusion_adapter_default_case(self): ) def test_multiple_image_dimensions(self, dim): """Test that the T2I-Adapter pipeline supports any input dimension that - is divisible by the adapter's `downscale_factor`. This test was added in response to an issue where the T2I - Adapter's downscaling padding behavior did not match the UNet's behavior. + is divisible by the adapter's `downscale_factor`. This test was added in + response to an issue where the T2I Adapter's downscaling padding + behavior did not match the UNet's behavior. - Note that we have selected `dim` values to produce odd resolutions at each downscaling level. + Note that we have selected `dim` values to produce odd resolutions at + each downscaling level. """ components = self.get_dummy_components_with_full_downscaling() sd_pipe = StableDiffusionXLAdapterPipeline(**components) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index e249fa81ca0f..2b29e3ae9eeb 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -67,8 +67,8 @@ def check_same_shape(tensor_list): class SDFunctionTesterMixin: """ - This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. It provides a set of - common tests for PyTorch pipeline that inherit from StableDiffusionMixin, e.g. vae_slicing, vae_tiling, freeu, etc. + This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. + It provides a set of common tests for PyTorch pipeline that inherit from StableDiffusionMixin, e.g. vae_slicing, vae_tiling, freeu, etc. """ def test_vae_slicing(self): @@ -209,8 +209,8 @@ def test_fused_qkv_projections(self): class IPAdapterTesterMixin: """ - This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. It provides a set of - common tests for pipelines that support IP Adapters. + This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. + It provides a set of common tests for pipelines that support IP Adapters. """ def test_pipeline_signature(self): @@ -350,8 +350,9 @@ def test_ip_adapter_cfg(self, expected_max_diff: float = 1e-4): class PipelineLatentTesterMixin: """ - This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. It provides a set of - common tests for PyTorch pipeline that has vae, e.g. 
equivalence of different input and output types, etc. + This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. + It provides a set of common tests for PyTorch pipeline that has vae, e.g. + equivalence of different input and output types, etc. """ @property @@ -507,8 +508,9 @@ def test_multi_vae(self): @require_torch class PipelineKarrasSchedulerTesterMixin: """ - This mixin is designed to be used with unittest.TestCase classes. It provides a set of common tests for each - PyTorch pipeline that makes use of KarrasDiffusionSchedulers equivalence of dict and tuple outputs, etc. + This mixin is designed to be used with unittest.TestCase classes. + It provides a set of common tests for each PyTorch pipeline that makes use of KarrasDiffusionSchedulers + equivalence of dict and tuple outputs, etc. """ def test_karras_schedulers_shape(self): @@ -546,8 +548,9 @@ def test_karras_schedulers_shape(self): @require_torch class PipelineTesterMixin: """ - This mixin is designed to be used with unittest.TestCase classes. It provides a set of common tests for each - PyTorch pipeline, e.g. saving and loading the pipeline, equivalence of dict and tuple outputs, etc. + This mixin is designed to be used with unittest.TestCase classes. + It provides a set of common tests for each PyTorch pipeline, e.g. saving and loading the pipeline, + equivalence of dict and tuple outputs, etc. """ # Canonical parameters that are passed to `__call__` regardless diff --git a/tests/pipelines/test_pipelines_onnx_common.py b/tests/pipelines/test_pipelines_onnx_common.py index 69889efa3743..575ecd007531 100644 --- a/tests/pipelines/test_pipelines_onnx_common.py +++ b/tests/pipelines/test_pipelines_onnx_common.py @@ -4,8 +4,9 @@ @require_onnxruntime class OnnxPipelineTesterMixin: """ - This mixin is designed to be used with unittest.TestCase classes. It provides a set of common tests for each - ONNXRuntime pipeline, e.g. saving and loading the pipeline, equivalence of dict and tuple outputs, etc. + This mixin is designed to be used with unittest.TestCase classes. + It provides a set of common tests for each ONNXRuntime pipeline, e.g. saving and loading the pipeline, + equivalence of dict and tuple outputs, etc. """ pass From 610444b144125dcc2948fa0c941a920669822cba Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 14 Mar 2024 16:35:43 +0530 Subject: [PATCH 06/10] revert pipeline_fixture --- tests/fixtures/custom_pipeline/pipeline.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/fixtures/custom_pipeline/pipeline.py b/tests/fixtures/custom_pipeline/pipeline.py index 41d69706b956..8af001bb774a 100644 --- a/tests/fixtures/custom_pipeline/pipeline.py +++ b/tests/fixtures/custom_pipeline/pipeline.py @@ -1,3 +1,4 @@ + # Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -66,8 +67,9 @@ def __call__( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is - True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. + [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if + `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the + generated images. 
""" # Sample gaussian noise to begin loop From 6012cd67bb7edee62c32f85ed64b2ddf6e16bd3d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 14 Mar 2024 16:36:38 +0530 Subject: [PATCH 07/10] remove empty line --- tests/fixtures/custom_pipeline/pipeline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/fixtures/custom_pipeline/pipeline.py b/tests/fixtures/custom_pipeline/pipeline.py index 8af001bb774a..601f51b1263e 100644 --- a/tests/fixtures/custom_pipeline/pipeline.py +++ b/tests/fixtures/custom_pipeline/pipeline.py @@ -1,4 +1,3 @@ - # Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); From ab13de38749983fd1655f9c368f8961be3d1c1a6 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 29 Mar 2024 21:06:30 +0530 Subject: [PATCH 08/10] make style --- src/diffusers/pipelines/pia/pipeline_pia.py | 6 +++--- .../pipeline_stable_diffusion_panorama.py | 18 +++++++++--------- src/diffusers/utils/testing_utils.py | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index 2134619f2d45..aceb95ae0451 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -136,9 +136,9 @@ class PIAPipelineOutput(BaseOutput): Args: frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): - Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`, - NumPy array of shape `(batch_size, num_frames, channels, height, width, - Torch tensor of shape `(batch_size, num_frames, channels, height, width)`. + Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`, NumPy array of + shape `(batch_size, num_frames, channels, height, width, Torch tensor of shape `(batch_size, num_frames, + channels, height, width)`. """ frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] diff --git a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py index 046e1e656ff7..bd7cc443fecb 100644 --- a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -90,8 +90,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -694,9 +694,9 @@ def get_views( circular_padding: bool = False, ) -> List[Tuple[int, int, int, int]]: """ - Generates a list of views based on the given parameters. - Here, we define the mappings F_i (see Eq. 7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113). - If panorama's height/width < window_size, num_blocks of height/width should return 1. + Generates a list of views based on the given parameters. Here, we define the mappings F_i (see Eq. 7 in the + MultiDiffusion paper https://arxiv.org/abs/2302.08113). 
If panorama's height/width < window_size, num_blocks of + height/width should return 1. Args: panorama_height (int): The height of the panorama. @@ -706,8 +706,8 @@ def get_views( circular_padding (bool, optional): Whether to apply circular padding. Defaults to False. Returns: - List[Tuple[int, int, int, int]]: A list of tuples representing the views. Each tuple contains - four integers representing the start and end coordinates of the window in the panorama. + List[Tuple[int, int, int, int]]: A list of tuples representing the views. Each tuple contains four integers + representing the start and end coordinates of the window in the panorama. """ panorama_height /= 8 @@ -800,8 +800,8 @@ def __call__( The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. timesteps (`List[int]`, *optional*): - The timesteps at which to generate the images. If not specified, then the default - timestep spacing strategy of the scheduler is used. + The timesteps at which to generate the images. If not specified, then the default timestep spacing + strategy of the scheduler is used. guidance_scale (`float`, *optional*, defaults to 7.5): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 4ea541dac356..8fa16ddb9d9f 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -156,8 +156,8 @@ def get_tests_dir(append_path=None): # https://github.com/huggingface/accelerate/pull/1964 def str_to_bool(value) -> int: """ - Converts a string representation of truth to `True` (1) or `False` (0). - True values are `y`, `yes`, `t`, `true`, `on`, and `1`; False value are `n`, `no`, `f`, `false`, `off`, and `0`; + Converts a string representation of truth to `True` (1) or `False` (0). True values are `y`, `yes`, `t`, `true`, + `on`, and `1`; False value are `n`, `no`, `f`, `false`, `off`, and `0`; """ value = value.lower() if value in ("y", "yes", "t", "true", "on", "1"): From d7f87fa993a3df2b429745ba876b97f7a740cbdc Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 30 Mar 2024 06:53:41 +0530 Subject: [PATCH 09/10] fix: setup.py --- setup.py | 5 ++- src/diffusers/loaders/unet_loader_utils.py | 41 ++++++++-------------- 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/setup.py b/setup.py index ce707f4ac7f6..ee2b17b60c17 100644 --- a/setup.py +++ b/setup.py @@ -205,9 +205,8 @@ def run(self): extras = {} -# `hf-doc-builder` has a dependency on `black`. See huggingface/doc-builder#434. -extras["quality"] = deps_list("urllib3", "isort", "ruff", "hf-doc-builder", "black") -extras["docs"] = deps_list("hf-doc-builder", "black") +extras["quality"] = deps_list("urllib3", "isort", "ruff", "hf-doc-builder") +extras["docs"] = deps_list("hf-doc-builder") extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft") extras["test"] = deps_list( "compel", diff --git a/src/diffusers/loaders/unet_loader_utils.py b/src/diffusers/loaders/unet_loader_utils.py index 918a0fca06c8..3ee4a96fad0a 100644 --- a/src/diffusers/loaders/unet_loader_utils.py +++ b/src/diffusers/loaders/unet_loader_utils.py @@ -74,37 +74,24 @@ def _maybe_expand_lora_scales_for_one_adapter( E.g. 
turns ```python - scales = { - 'down': 2, - 'mid': 3, - 'up': { - 'block_0': 4, - 'block_1': [5, 6, 7] - } - } - blocks_with_transformer = { - 'down': [1,2], - 'up': [0,1] - } - transformer_per_block = { - 'down': 2, - 'up': 3 - } + scales = {"down": 2, "mid": 3, "up": {"block_0": 4, "block_1": [5, 6, 7]}} + blocks_with_transformer = {"down": [1, 2], "up": [0, 1]} + transformer_per_block = {"down": 2, "up": 3} ``` into ```python { - 'down.block_1.0': 2, - 'down.block_1.1': 2, - 'down.block_2.0': 2, - 'down.block_2.1': 2, - 'mid': 3, - 'up.block_0.0': 4, - 'up.block_0.1': 4, - 'up.block_0.2': 4, - 'up.block_1.0': 5, - 'up.block_1.1': 6, - 'up.block_1.2': 7, + "down.block_1.0": 2, + "down.block_1.1": 2, + "down.block_2.0": 2, + "down.block_2.1": 2, + "mid": 3, + "up.block_0.0": 4, + "up.block_0.1": 4, + "up.block_0.2": 4, + "up.block_1.0": 5, + "up.block_1.1": 6, + "up.block_1.2": 7, } ``` """ From d9ec1ea2882182619900264d6819e2749e34e341 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 2 Apr 2024 20:17:10 +0530 Subject: [PATCH 10/10] style. --- src/diffusers/pipelines/pipeline_utils.py | 3 ++- .../stable_diffusion_sag/pipeline_stable_diffusion_sag.py | 4 ++-- src/diffusers/schedulers/scheduling_unipc_multistep.py | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 65927c1844e6..0ed27293c178 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1670,7 +1670,8 @@ def set_attention_slice(self, slice_size: Optional[int]): @classmethod def from_pipe(cls, pipeline, **kwargs): r""" - Create a new pipeline from a given pipeline. This method is useful to create a new pipeline from the existing pipeline components without reallocating additional memory. + Create a new pipeline from a given pipeline. This method is useful to create a new pipeline from the existing + pipeline components without reallocating additional memory. Arguments: pipeline (`DiffusionPipeline`): diff --git a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py index 2e7a1fa41b58..1c1464a4271e 100644 --- a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py @@ -619,8 +619,8 @@ def __call__( ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. If not - provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the + `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 70e63a64c0a8..c95ea43e55e3 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -128,8 +128,8 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): steps_offset (`int`, defaults to 0): An offset added to the inference steps, as required by some model families. final_sigmas_type (`str`, defaults to `"zero"`): - The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma - is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final + sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers]