diff --git a/docs/README.md b/docs/README.md index e7aa8c4f682b..f36b76fb0789 100644 --- a/docs/README.md +++ b/docs/README.md @@ -242,10 +242,10 @@ Here's an example of a tuple return, comprising several objects: ``` Returns: - `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs: - - ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` -- + `tuple(torch.Tensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs: + - ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.Tensor` of shape `(1,)` -- Total loss is the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) -- + - **prediction_scores** (`torch.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`) -- Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). ``` diff --git a/docs/source/en/optimization/memory.md b/docs/source/en/optimization/memory.md index 6b2a22b315c8..e3f4d2652d0f 100644 --- a/docs/source/en/optimization/memory.md +++ b/docs/source/en/optimization/memory.md @@ -261,7 +261,7 @@ from dataclasses import dataclass @dataclass class UNet2DConditionOutput: - sample: torch.FloatTensor + sample: torch.Tensor pipe = StableDiffusionPipeline.from_pretrained( diff --git a/docs/source/ko/optimization/fp16.md b/docs/source/ko/optimization/fp16.md index 2e58421c35b8..f7b2cf809552 100644 --- a/docs/source/ko/optimization/fp16.md +++ b/docs/source/ko/optimization/fp16.md @@ -339,7 +339,7 @@ from dataclasses import dataclass @dataclass class UNet2DConditionOutput: - sample: torch.FloatTensor + sample: torch.Tensor pipe = StableDiffusionPipeline.from_pretrained( diff --git a/examples/community/bit_diffusion.py b/examples/community/bit_diffusion.py index 18d5fca5619e..71d8f31163f2 100644 --- a/examples/community/bit_diffusion.py +++ b/examples/community/bit_diffusion.py @@ -44,9 +44,9 @@ def bits_to_decimal(x, bits=BITS): # modified scheduler step functions for clamping the predicted x_0 between -bit_scale and +bit_scale def ddim_bit_scheduler_step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = True, generator=None, @@ -56,9 +56,9 @@ def ddim_bit_scheduler_step( Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. eta (`float`): weight of noise for added noise in diffusion step. use_clipped_model_output (`bool`): TODO @@ -134,9 +134,9 @@ def ddim_bit_scheduler_step( def ddpm_bit_scheduler_step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, prediction_type="epsilon", generator=None, return_dict: bool = True, @@ -145,9 +145,9 @@ def ddpm_bit_scheduler_step( Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. prediction_type (`str`, default `epsilon`): indicates whether the model predicts the noise (epsilon), or the samples (`sample`). diff --git a/examples/community/clip_guided_images_mixing_stable_diffusion.py b/examples/community/clip_guided_images_mixing_stable_diffusion.py index 16dcecd7b22a..75b7df84dc77 100644 --- a/examples/community/clip_guided_images_mixing_stable_diffusion.py +++ b/examples/community/clip_guided_images_mixing_stable_diffusion.py @@ -233,8 +233,8 @@ def cond_fn( @torch.no_grad() def __call__( self, - style_image: Union[torch.FloatTensor, PIL.Image.Image], - content_image: Union[torch.FloatTensor, PIL.Image.Image], + style_image: Union[torch.Tensor, PIL.Image.Image], + content_image: Union[torch.Tensor, PIL.Image.Image], style_prompt: Optional[str] = None, content_prompt: Optional[str] = None, height: Optional[int] = 512, diff --git a/examples/community/clip_guided_stable_diffusion.py b/examples/community/clip_guided_stable_diffusion.py index 4205718802de..1350650113da 100644 --- a/examples/community/clip_guided_stable_diffusion.py +++ b/examples/community/clip_guided_stable_diffusion.py @@ -180,7 +180,7 @@ def __call__( num_cutouts: Optional[int] = 4, use_cutouts: Optional[bool] = True, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, ): diff --git a/examples/community/clip_guided_stable_diffusion_img2img.py b/examples/community/clip_guided_stable_diffusion_img2img.py index c8e0a9094f22..9a77458be5ad 100644 --- a/examples/community/clip_guided_stable_diffusion_img2img.py +++ b/examples/community/clip_guided_stable_diffusion_img2img.py @@ -306,7 +306,7 @@ def __call__( prompt: Union[str, List[str]], height: Optional[int] = 512, width: Optional[int] = 512, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -317,7 +317,7 @@ def __call__( num_cutouts: Optional[int] = 4, use_cutouts: Optional[bool] = True, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, ): diff --git a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py index 2ad37df5abe3..46d12ba1f2aa 100644 --- a/examples/community/composable_stable_diffusion.py +++ b/examples/community/composable_stable_diffusion.py @@ -354,10 +354,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, weights: Optional[str] = "", ): @@ -391,7 +391,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -403,7 +403,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/ddim_noise_comparative_analysis.py b/examples/community/ddim_noise_comparative_analysis.py index 482c0a5826d2..829106c47f65 100644 --- a/examples/community/ddim_noise_comparative_analysis.py +++ b/examples/community/ddim_noise_comparative_analysis.py @@ -103,7 +103,7 @@ def prepare_latents(self, image, timestep, batch_size, dtype, device, generator= @torch.no_grad() def __call__( self, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 0.8, batch_size: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, @@ -115,7 +115,7 @@ def __call__( ) -> Union[ImagePipelineOutput, Tuple]: r""" Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): diff --git a/examples/community/gluegen.py b/examples/community/gluegen.py index c656dce55a0d..1ad6911905db 100644 --- a/examples/community/gluegen.py +++ b/examples/community/gluegen.py @@ -205,7 +205,7 @@ def __init__( safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPImageProcessor, language_adapter: TranslatorNoLN = None, - tensor_norm: torch.FloatTensor = None, + tensor_norm: torch.Tensor = None, requires_safety_checker: bool = True, ): super().__init__() @@ -231,7 +231,7 @@ def load_language_adapter( num_token: int, dim: int, dim_out: int, - tensor_norm: torch.FloatTensor, + tensor_norm: torch.Tensor, mult: int = 2, depth: int = 5, ): @@ -242,7 +242,7 @@ def load_language_adapter( ) self.language_adapter.load_state_dict(torch.load(model_path)) - def _adapt_language(self, prompt_embeds: torch.FloatTensor): + def _adapt_language(self, prompt_embeds: torch.Tensor): prompt_embeds = prompt_embeds / 3 prompt_embeds = self.language_adapter(prompt_embeds) * (self.tensor_norm / 2) return prompt_embeds @@ -254,8 +254,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -275,10 +275,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -535,7 +535,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -594,9 +594,9 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -635,14 +635,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. diff --git a/examples/community/hd_painter.py b/examples/community/hd_painter.py index c157e1688941..df41be9ef7b1 100644 --- a/examples/community/hd_painter.py +++ b/examples/community/hd_painter.py @@ -28,10 +28,10 @@ def __init__(self, mask, token_idx, scale_factor): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, ) -> torch.Tensor: # Same as the default AttnProcessor up untill the part where similarity matrix gets saved @@ -111,10 +111,10 @@ def __init__(self, transformer_block, mask, token_idx, do_classifier_free_guidan def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, ) -> torch.Tensor: # Automatically recognize the resolution of the current attention layer and resize the masks accordingly @@ -454,7 +454,7 @@ def __call__( prompt: Union[str, List[str]] = None, image: PipelineImageInput = None, mask_image: PipelineImageInput = None, - masked_image_latents: torch.FloatTensor = None, + masked_image_latents: torch.Tensor = None, height: Optional[int] = None, width: Optional[int] = None, padding_mask_crop: Optional[int] = None, @@ -467,9 +467,9 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.01, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, diff --git a/examples/community/iadb.py b/examples/community/iadb.py index 6089e49fc621..81e9e8d89dc6 100644 --- a/examples/community/iadb.py +++ b/examples/community/iadb.py @@ -17,21 +17,21 @@ class IADBScheduler(SchedulerMixin, ConfigMixin): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - x_alpha: torch.FloatTensor, - ) -> torch.FloatTensor: + x_alpha: torch.Tensor, + ) -> torch.Tensor: """ Predict the sample at the previous timestep by reversing the ODE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. It is the direction from x0 to x1. + model_output (`torch.Tensor`): direct output from learned diffusion model. It is the direction from x0 to x1. timestep (`float`): current timestep in the diffusion chain. - x_alpha (`torch.FloatTensor`): x_alpha sample for the current timestep + x_alpha (`torch.Tensor`): x_alpha sample for the current timestep Returns: - `torch.FloatTensor`: the sample at the previous timestep + `torch.Tensor`: the sample at the previous timestep """ if self.num_inference_steps is None: @@ -53,10 +53,10 @@ def set_timesteps(self, num_inference_steps: int): def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - alpha: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + alpha: torch.Tensor, + ) -> torch.Tensor: return original_samples * alpha + noise * (1 - alpha) def __len__(self): diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index 719e8f12bfd4..cea55dd3837b 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -110,7 +110,7 @@ def __init__( def train( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], height: Optional[int] = 512, width: Optional[int] = 512, generator: Optional[torch.Generator] = None, @@ -144,7 +144,7 @@ def train( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/examples/community/img2img_inpainting.py b/examples/community/img2img_inpainting.py index 71dc3cf712ed..4dfb7a39155f 100644 --- a/examples/community/img2img_inpainting.py +++ b/examples/community/img2img_inpainting.py @@ -133,9 +133,9 @@ def __init__( def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], - inner_image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], + inner_image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image], height: int = 512, width: int = 512, num_inference_steps: int = 50, @@ -144,10 +144,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -194,7 +194,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -206,7 +206,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/instaflow_one_step.py b/examples/community/instaflow_one_step.py index b0476d3afe38..ab0393c8f76c 100644 --- a/examples/community/instaflow_one_step.py +++ b/examples/community/instaflow_one_step.py @@ -189,8 +189,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." @@ -219,8 +219,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -239,10 +239,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -501,12 +501,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -538,14 +538,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -555,7 +555,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py index 1b859c35f174..52b2707f33f7 100644 --- a/examples/community/interpolate_stable_diffusion.py +++ b/examples/community/interpolate_stable_diffusion.py @@ -132,12 +132,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, - text_embeddings: Optional[torch.FloatTensor] = None, + text_embeddings: Optional[torch.Tensor] = None, **kwargs, ): r""" @@ -170,7 +170,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -182,11 +182,11 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. - text_embeddings (`torch.FloatTensor`, *optional*, defaults to `None`): + text_embeddings (`torch.Tensor`, *optional*, defaults to `None`): Pre-generated text embeddings to be used as inputs for image generation. Can be used in place of `prompt` to avoid re-computing the embeddings. If not provided, the embeddings will be generated from the supplied `prompt`. diff --git a/examples/community/ip_adapter_face_id.py b/examples/community/ip_adapter_face_id.py index bb5a2a4fe5a9..befb48c7391e 100644 --- a/examples/community/ip_adapter_face_id.py +++ b/examples/community/ip_adapter_face_id.py @@ -62,7 +62,7 @@ def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_t self.ff = FeedForward(image_embed_dim, cross_attention_dim * num_tokens, mult=mult, activation_fn="gelu") self.norm = nn.LayerNorm(cross_attention_dim) - def forward(self, image_embeds: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor): x = self.ff(image_embeds) x = x.reshape(-1, self.num_tokens, self.cross_attention_dim) return self.norm(x) @@ -452,8 +452,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -484,8 +484,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -505,10 +505,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -788,7 +788,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -847,10 +847,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -891,17 +891,17 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. diff --git a/examples/community/latent_consistency_img2img.py b/examples/community/latent_consistency_img2img.py index 3c5ffa845699..97089e7d192c 100644 --- a/examples/community/latent_consistency_img2img.py +++ b/examples/community/latent_consistency_img2img.py @@ -88,7 +88,7 @@ def _encode_prompt( torch device num_images_per_prompt (`int`): number of images that should be generated per prompt - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. """ @@ -282,10 +282,10 @@ def __call__( width: Optional[int] = 768, guidance_scale: float = 7.5, num_images_per_prompt: Optional[int] = 1, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, num_inference_steps: int = 4, lcm_origin_steps: int = 50, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -395,16 +395,16 @@ class LCMSchedulerOutput(BaseOutput): """ Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - denoised: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + denoised: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -452,10 +452,10 @@ def rescale_zero_terminal_snr(betas): """ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -587,17 +587,17 @@ def __init__( self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -613,7 +613,7 @@ def _get_variance(self, timestep, prev_timestep): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -685,25 +685,25 @@ def get_scalings_for_boundary_condition_discrete(self, t): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timeindex: int, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[LCMSchedulerOutput, Tuple]: """ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): The weight of noise for added noise in diffusion step. @@ -714,7 +714,7 @@ def step( `use_clipped_model_output` has no effect. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. return_dict (`bool`, *optional*, defaults to `True`): @@ -777,10 +777,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) timesteps = timesteps.to(original_samples.device) @@ -799,9 +799,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype) timesteps = timesteps.to(sample.device) diff --git a/examples/community/latent_consistency_interpolate.py b/examples/community/latent_consistency_interpolate.py index 3d2413c99189..8db70d3b9508 100644 --- a/examples/community/latent_consistency_interpolate.py +++ b/examples/community/latent_consistency_interpolate.py @@ -281,8 +281,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -302,10 +302,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -506,7 +506,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -546,7 +546,7 @@ def check_inputs( height: int, width: int, callback_steps: int, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, callback_on_step_end_tensor_inputs=None, ): if height % 8 != 0 or width % 8 != 0: @@ -580,11 +580,11 @@ def check_inputs( @torch.no_grad() def interpolate_embedding( self, - start_embedding: torch.FloatTensor, - end_embedding: torch.FloatTensor, + start_embedding: torch.Tensor, + end_embedding: torch.Tensor, num_interpolation_steps: Union[int, List[int]], interpolation_type: str, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if interpolation_type == "lerp": interpolation_fn = lerp elif interpolation_type == "slerp": @@ -611,11 +611,11 @@ def interpolate_embedding( @torch.no_grad() def interpolate_latent( self, - start_latent: torch.FloatTensor, - end_latent: torch.FloatTensor, + start_latent: torch.Tensor, + end_latent: torch.Tensor, num_interpolation_steps: Union[int, List[int]], interpolation_type: str, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if interpolation_type == "lerp": interpolation_fn = lerp elif interpolation_type == "slerp": @@ -663,8 +663,8 @@ def __call__( guidance_scale: float = 8.5, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -705,11 +705,11 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/examples/community/latent_consistency_txt2img.py b/examples/community/latent_consistency_txt2img.py index c31d6abae368..279b56d1e142 100755 --- a/examples/community/latent_consistency_txt2img.py +++ b/examples/community/latent_consistency_txt2img.py @@ -86,7 +86,7 @@ def _encode_prompt( torch device num_images_per_prompt (`int`): number of images that should be generated per prompt - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. """ @@ -208,10 +208,10 @@ def __call__( width: Optional[int] = 768, guidance_scale: float = 7.5, num_images_per_prompt: Optional[int] = 1, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, num_inference_steps: int = 4, lcm_origin_steps: int = 50, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -310,16 +310,16 @@ class LCMSchedulerOutput(BaseOutput): """ Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - denoised: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + denoised: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -367,10 +367,10 @@ def rescale_zero_terminal_snr(betas): """ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -499,17 +499,17 @@ def __init__( self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -525,7 +525,7 @@ def _get_variance(self, timestep, prev_timestep): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -593,25 +593,25 @@ def get_scalings_for_boundary_condition_discrete(self, t): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timeindex: int, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[LCMSchedulerOutput, Tuple]: """ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): The weight of noise for added noise in diffusion step. @@ -622,7 +622,7 @@ def step( `use_clipped_model_output` has no effect. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. return_dict (`bool`, *optional*, defaults to `True`): @@ -685,10 +685,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) timesteps = timesteps.to(original_samples.device) @@ -707,9 +707,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype) timesteps = timesteps.to(sample.device) diff --git a/examples/community/llm_grounded_diffusion.py b/examples/community/llm_grounded_diffusion.py index c3b07ade7ba7..24b0a9d0e209 100644 --- a/examples/community/llm_grounded_diffusion.py +++ b/examples/community/llm_grounded_diffusion.py @@ -756,13 +756,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -807,14 +807,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. @@ -825,7 +825,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -1194,8 +1194,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -1227,8 +1227,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -1248,10 +1248,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1509,7 +1509,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py index 184ba30c4b85..9f496330a0ec 100644 --- a/examples/community/lpw_stable_diffusion.py +++ b/examples/community/lpw_stable_diffusion.py @@ -378,7 +378,7 @@ def preprocess_image(image, batch_size): def preprocess_mask(mask, batch_size, scale_factor=8): - if not isinstance(mask, torch.FloatTensor): + if not isinstance(mask, torch.Tensor): mask = mask.convert("L") w, h = mask.size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 @@ -543,8 +543,8 @@ def _encode_prompt( do_classifier_free_guidance, negative_prompt=None, max_embeddings_multiples=3, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -767,8 +767,8 @@ def __call__( self, prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, height: int = 512, width: int = 512, num_inference_steps: int = 50, @@ -778,13 +778,13 @@ def __call__( add_predicted_noise: Optional[bool] = False, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -798,10 +798,10 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + mask_image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should @@ -836,14 +836,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -857,7 +857,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. is_cancelled_callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. If the function returns `True`, the inference will be cancelled. @@ -1032,13 +1032,13 @@ def text2img( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1072,14 +1072,14 @@ def text2img( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1093,7 +1093,7 @@ def text2img( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. is_cancelled_callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. If the function returns `True`, the inference will be cancelled. @@ -1137,7 +1137,7 @@ def text2img( def img2img( self, - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, strength: float = 0.8, @@ -1146,12 +1146,12 @@ def img2img( num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1159,7 +1159,7 @@ def img2img( r""" Function for image-to-image generation. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. prompt (`str` or `List[str]`): @@ -1190,10 +1190,10 @@ def img2img( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1207,7 +1207,7 @@ def img2img( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. is_cancelled_callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. If the function returns `True`, the inference will be cancelled. @@ -1249,8 +1249,8 @@ def img2img( def inpaint( self, - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image], prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, strength: float = 0.8, @@ -1260,12 +1260,12 @@ def inpaint( add_predicted_noise: Optional[bool] = False, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, max_embeddings_multiples: Optional[int] = 3, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, is_cancelled_callback: Optional[Callable[[], bool]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1273,10 +1273,10 @@ def inpaint( r""" Function for inpaint. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. This is the image whose masked region will be inpainted. - mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + mask_image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should @@ -1311,10 +1311,10 @@ def inpaint( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1328,7 +1328,7 @@ def inpaint( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. is_cancelled_callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. If the function returns `True`, the inference will be cancelled. diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py index 64b7973e894b..4ea98b1306fb 100644 --- a/examples/community/lpw_stable_diffusion_xl.py +++ b/examples/community/lpw_stable_diffusion_xl.py @@ -694,10 +694,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -722,17 +722,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1320,7 +1320,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -1378,7 +1378,7 @@ def __call__( prompt_2: Optional[str] = None, image: Optional[PipelineImageInput] = None, mask_image: Optional[PipelineImageInput] = None, - masked_image_latents: Optional[torch.FloatTensor] = None, + masked_image_latents: Optional[torch.Tensor] = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 0.8, @@ -1392,12 +1392,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1481,23 +1481,23 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1926,12 +1926,12 @@ def text2img( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -2001,12 +2001,12 @@ def img2img( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -2066,7 +2066,7 @@ def inpaint( prompt_2: Optional[str] = None, image: Optional[PipelineImageInput] = None, mask_image: Optional[PipelineImageInput] = None, - masked_image_latents: Optional[torch.FloatTensor] = None, + masked_image_latents: Optional[torch.Tensor] = None, height: Optional[int] = None, width: Optional[int] = None, strength: float = 0.8, @@ -2080,12 +2080,12 @@ def inpaint( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, diff --git a/examples/community/masked_stable_diffusion_img2img.py b/examples/community/masked_stable_diffusion_img2img.py index 0b08086c7da9..a210c167a2f3 100644 --- a/examples/community/masked_stable_diffusion_img2img.py +++ b/examples/community/masked_stable_diffusion_img2img.py @@ -16,10 +16,10 @@ def __call__( self, prompt: Union[str, List[str]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -30,18 +30,18 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, mask: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -52,7 +52,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can also accept image latents as `image`, but if passing latents directly it is not encoded again. strength (`float`, *optional*, defaults to 0.8): @@ -78,10 +78,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -91,14 +91,14 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - mask (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`, *optional*): + mask (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`, *optional*): A mask with non-zero elements for the area to be inpainted. If not specified, no mask is applied. Examples: diff --git a/examples/community/mixture_canvas.py b/examples/community/mixture_canvas.py index 2083c7acad38..7196ee9587f2 100644 --- a/examples/community/mixture_canvas.py +++ b/examples/community/mixture_canvas.py @@ -154,7 +154,7 @@ def encode_prompt(self, text_encoder, device): class Image2ImageRegion(DiffusionRegion): """Class defining a region where an image guided diffusion process is acting""" - reference_image: torch.FloatTensor = None + reference_image: torch.Tensor = None strength: float = 0.8 # Strength of the image def __post_init__(self): diff --git a/examples/community/multilingual_stable_diffusion.py b/examples/community/multilingual_stable_diffusion.py index f3b0540cf4d3..dc335e0b585e 100644 --- a/examples/community/multilingual_stable_diffusion.py +++ b/examples/community/multilingual_stable_diffusion.py @@ -147,10 +147,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -184,7 +184,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -196,7 +196,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/pipeline_animatediff_controlnet.py b/examples/community/pipeline_animatediff_controlnet.py index 4b1a64142704..ac0aa38254e5 100644 --- a/examples/community/pipeline_animatediff_controlnet.py +++ b/examples/community/pipeline_animatediff_controlnet.py @@ -198,8 +198,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -219,10 +219,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -752,9 +752,9 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, ip_adapter_image_embeds: Optional[PipelineImageInput] = None, conditioning_frames: Optional[List[PipelineImageInput]] = None, @@ -798,20 +798,20 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. @@ -821,7 +821,7 @@ def __call__( are specified, images must be passed as a list such that each element of the list can be correctly batched for input to a single ControlNet. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead diff --git a/examples/community/pipeline_animatediff_img2video.py b/examples/community/pipeline_animatediff_img2video.py index d9209122262f..7546fbd9bcb0 100644 --- a/examples/community/pipeline_animatediff_img2video.py +++ b/examples/community/pipeline_animatediff_img2video.py @@ -315,8 +315,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -336,10 +336,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -746,14 +746,14 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, ip_adapter_image_embeds: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -791,33 +791,33 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`AnimateDiffImgToVideoPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/examples/community/pipeline_demofusion_sdxl.py b/examples/community/pipeline_demofusion_sdxl.py index f46d635dae2b..b4d47f1856ba 100644 --- a/examples/community/pipeline_demofusion_sdxl.py +++ b/examples/community/pipeline_demofusion_sdxl.py @@ -187,10 +187,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -215,17 +215,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -642,14 +642,14 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = False, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -720,21 +720,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -746,7 +746,7 @@ def __call__( of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/pipeline_fabric.py b/examples/community/pipeline_fabric.py index 46692a96840b..f17c8e52f595 100644 --- a/examples/community/pipeline_fabric.py +++ b/examples/community/pipeline_fabric.py @@ -190,8 +190,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -210,10 +210,10 @@ def _encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -512,7 +512,7 @@ def __call__( neg_scale: float = 0.5, pos_bottleneck_scale: float = 1.0, neg_bottleneck_scale: float = 1.0, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ): r""" The call function to the pipeline for generation. Generate a trajectory of images with binary feedback. The diff --git a/examples/community/pipeline_prompt2prompt.py b/examples/community/pipeline_prompt2prompt.py index db9f215b6bed..8e9bcddfef99 100644 --- a/examples/community/pipeline_prompt2prompt.py +++ b/examples/community/pipeline_prompt2prompt.py @@ -217,8 +217,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -250,8 +250,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -271,10 +271,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -564,12 +564,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -604,7 +604,7 @@ def __call__( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -616,7 +616,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/pipeline_sdxl_style_aligned.py b/examples/community/pipeline_sdxl_style_aligned.py index 88edeeb7ee4c..5ad85dc90aff 100644 --- a/examples/community/pipeline_sdxl_style_aligned.py +++ b/examples/community/pipeline_sdxl_style_aligned.py @@ -514,10 +514,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -543,17 +543,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1325,7 +1325,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -1387,7 +1387,7 @@ def __call__( prompt_2: Optional[Union[str, List[str]]] = None, image: Optional[PipelineImageInput] = None, mask_image: Optional[PipelineImageInput] = None, - masked_image_latents: Optional[torch.FloatTensor] = None, + masked_image_latents: Optional[torch.Tensor] = None, strength: float = 0.3, height: Optional[int] = None, width: Optional[int] = None, @@ -1401,11 +1401,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -1474,21 +1474,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. diff --git a/examples/community/pipeline_stable_diffusion_pag.py b/examples/community/pipeline_stable_diffusion_pag.py index cdb7bd99cb29..5c588adc4f0d 100644 --- a/examples/community/pipeline_stable_diffusion_pag.py +++ b/examples/community/pipeline_stable_diffusion_pag.py @@ -57,13 +57,13 @@ def __init__(self): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -171,13 +171,13 @@ def __init__(self): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -493,8 +493,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -525,8 +525,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -545,10 +545,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -966,7 +966,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 dtype: data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -1078,11 +1078,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1122,18 +1122,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py b/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py index 61c4fbc13e07..0622db005d76 100644 --- a/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py +++ b/examples/community/pipeline_stable_diffusion_upscale_ldm3d.py @@ -164,8 +164,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -197,8 +197,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -218,10 +218,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -535,12 +535,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, target_res: Optional[List[int]] = [1024, 1024], @@ -551,7 +551,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be upscaled. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -570,14 +570,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -587,7 +587,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py index 82c522b4489a..6cb4928f7673 100644 --- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py +++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py @@ -248,10 +248,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -277,17 +277,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -887,14 +887,14 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -922,14 +922,14 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - adapter_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + adapter_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -973,21 +973,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -999,7 +999,7 @@ def __call__( instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py index a85f1c3da6fb..c5495f756c9f 100644 --- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py +++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py @@ -396,10 +396,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -425,17 +425,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1229,14 +1229,14 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[Union[torch.FloatTensor]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[Union[torch.Tensor]] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -1270,14 +1270,14 @@ def __call__( repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - adapter_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + adapter_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -1330,21 +1330,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1356,7 +1356,7 @@ def __call__( instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py b/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py index 49fed61254c6..ae9708a6faf6 100644 --- a/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py +++ b/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py @@ -280,10 +280,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -309,17 +309,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -868,7 +868,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -881,7 +881,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -942,10 +942,10 @@ def __call__( prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -960,13 +960,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -982,12 +982,12 @@ def __call__( clip_skip: Optional[int] = None, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], - map: torch.FloatTensor = None, + map: torch.Tensor = None, original_image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -1003,7 +1003,7 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): The image(s) to modify with the pipeline. strength (`float`, *optional*, defaults to 0.3): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -1051,26 +1051,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. @@ -1083,7 +1083,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py b/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py index f1b8dfa96f4e..fb46ff3f3897 100644 --- a/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py +++ b/examples/community/pipeline_stable_diffusion_xl_instandid_img2img.py @@ -561,12 +561,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -596,10 +596,10 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -632,24 +632,24 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled text embeddings are generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. diff --git a/examples/community/pipeline_stable_diffusion_xl_instantid.py b/examples/community/pipeline_stable_diffusion_xl_instantid.py index 147ba46a07ae..6e77261f512d 100644 --- a/examples/community/pipeline_stable_diffusion_xl_instantid.py +++ b/examples/community/pipeline_stable_diffusion_xl_instantid.py @@ -559,12 +559,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -592,10 +592,10 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -628,24 +628,24 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled text embeddings are generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. diff --git a/examples/community/pipeline_stable_diffusion_xl_ipex.py b/examples/community/pipeline_stable_diffusion_xl_ipex.py index a44ccf89eadd..791b077a84ca 100644 --- a/examples/community/pipeline_stable_diffusion_xl_ipex.py +++ b/examples/community/pipeline_stable_diffusion_xl_ipex.py @@ -276,10 +276,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -305,17 +305,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -687,7 +687,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -750,11 +750,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -826,21 +826,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1190,11 +1190,11 @@ def prepare_for_ipex( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, diff --git a/examples/community/pipeline_zero1to3.py b/examples/community/pipeline_zero1to3.py index 5e02ba286679..af1c82dfad95 100644 --- a/examples/community/pipeline_zero1to3.py +++ b/examples/community/pipeline_zero1to3.py @@ -192,8 +192,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -211,10 +211,10 @@ def _encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -481,7 +481,7 @@ def check_inputs(self, image, height, width, callback_steps): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -595,8 +595,8 @@ def prepare_img_latents(self, image, batch_size, dtype, device, generator=None, @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - input_imgs: Union[torch.FloatTensor, PIL.Image.Image] = None, - prompt_imgs: Union[torch.FloatTensor, PIL.Image.Image] = None, + input_imgs: Union[torch.Tensor, PIL.Image.Image] = None, + prompt_imgs: Union[torch.Tensor, PIL.Image.Image] = None, poses: Union[List[float], List[List[float]]] = None, torch_dtype=torch.float32, height: Optional[int] = None, @@ -607,12 +607,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: float = 1.0, @@ -650,14 +650,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -669,7 +669,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/regional_prompting_stable_diffusion.py b/examples/community/regional_prompting_stable_diffusion.py index 71f24a81bd15..19715a4fb6c1 100644 --- a/examples/community/regional_prompting_stable_diffusion.py +++ b/examples/community/regional_prompting_stable_diffusion.py @@ -104,7 +104,7 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, rp_args: Dict[str, str] = None, @@ -168,7 +168,7 @@ def getcompelembs(prps): orig_hw = (height, width) revers = True - def pcallback(s_self, step: int, timestep: int, latents: torch.FloatTensor, selfs=None): + def pcallback(s_self, step: int, timestep: int, latents: torch.Tensor, selfs=None): if "PRO" in mode: # in Prompt mode, make masks from sum of attension maps self.step = step @@ -198,10 +198,10 @@ def pcallback(s_self, step: int, timestep: int, latents: torch.FloatTensor, self def hook_forward(module): # diffusers==0.23.2 def forward( - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, ) -> torch.Tensor: attn = module diff --git a/examples/community/rerender_a_video.py b/examples/community/rerender_a_video.py index ad414390c9b6..6e25b92603d5 100644 --- a/examples/community/rerender_a_video.py +++ b/examples/community/rerender_a_video.py @@ -142,12 +142,12 @@ class TextToVideoSDPipelineOutput(BaseOutput): Output class for text-to-video pipelines. Args: - frames (`List[np.ndarray]` or `torch.FloatTensor`) + frames (`List[np.ndarray]` or `torch.Tensor`) List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as a `torch` tensor. The length of the list denotes the video length (the number of frames). """ - frames: Union[List[np.ndarray], torch.FloatTensor] + frames: Union[List[np.ndarray], torch.Tensor] @torch.no_grad() @@ -589,20 +589,20 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - frames: Union[List[np.ndarray], torch.FloatTensor] = None, - control_frames: Union[List[np.ndarray], torch.FloatTensor] = None, + frames: Union[List[np.ndarray], torch.Tensor] = None, + control_frames: Union[List[np.ndarray], torch.Tensor] = None, strength: float = 0.8, num_inference_steps: int = 50, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 0.8, @@ -624,8 +624,8 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - frames (`List[np.ndarray]` or `torch.FloatTensor`): The input images to be used as the starting point for the image generation process. - control_frames (`List[np.ndarray]` or `torch.FloatTensor`): The ControlNet input images condition to provide guidance to the `unet` for generation. + frames (`List[np.ndarray]` or `torch.Tensor`): The input images to be used as the starting point for the image generation process. + control_frames (`List[np.ndarray]` or `torch.Tensor`): The ControlNet input images condition to provide guidance to the `unet` for generation. strength ('float'): SDEdit strength. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -646,14 +646,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -665,7 +665,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/run_onnx_controlnet.py b/examples/community/run_onnx_controlnet.py index ed9b23318414..af2672c17e59 100644 --- a/examples/community/run_onnx_controlnet.py +++ b/examples/community/run_onnx_controlnet.py @@ -507,18 +507,18 @@ def __call__( fp16: bool = True, prompt: Union[str, List[str]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, control_image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -531,12 +531,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 0.8, @@ -551,14 +551,14 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The initial image will be used as the starting point for the image generation process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in init, images must be passed as a list such that each element of the list can be correctly @@ -588,14 +588,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -607,7 +607,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/run_tensorrt_controlnet.py b/examples/community/run_tensorrt_controlnet.py index aece5484e304..873195fa31ba 100644 --- a/examples/community/run_tensorrt_controlnet.py +++ b/examples/community/run_tensorrt_controlnet.py @@ -611,18 +611,18 @@ def __call__( fp16: bool = True, prompt: Union[str, List[str]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, control_image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, @@ -635,12 +635,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 0.8, @@ -655,14 +655,14 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The initial image will be used as the starting point for the image generation process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in init, images must be passed as a list such that each element of the list can be correctly @@ -692,14 +692,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -711,7 +711,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/scheduling_ufogen.py b/examples/community/scheduling_ufogen.py index 3c03b3e8dfda..5213f13ffb43 100644 --- a/examples/community/scheduling_ufogen.py +++ b/examples/community/scheduling_ufogen.py @@ -34,16 +34,16 @@ class UFOGenSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -98,11 +98,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -240,19 +240,19 @@ def __init__( self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -340,7 +340,7 @@ def set_timesteps( self.timesteps = torch.from_numpy(timesteps).to(device) # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -375,9 +375,9 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[UFOGenSchedulerOutput, Tuple]: @@ -386,11 +386,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -461,10 +461,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) timesteps = timesteps.to(original_samples.device) @@ -483,9 +483,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype) timesteps = timesteps.to(sample.device) diff --git a/examples/community/sd_text2img_k_diffusion.py b/examples/community/sd_text2img_k_diffusion.py index 3299a7605257..9f83973aba91 100755 --- a/examples/community/sd_text2img_k_diffusion.py +++ b/examples/community/sd_text2img_k_diffusion.py @@ -286,10 +286,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -323,7 +323,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -335,7 +335,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/seed_resize_stable_diffusion.py b/examples/community/seed_resize_stable_diffusion.py index e9eba994b526..ae2d8a53b298 100644 --- a/examples/community/seed_resize_stable_diffusion.py +++ b/examples/community/seed_resize_stable_diffusion.py @@ -81,12 +81,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, - text_embeddings: Optional[torch.FloatTensor] = None, + text_embeddings: Optional[torch.Tensor] = None, **kwargs, ): r""" @@ -119,7 +119,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -131,7 +131,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/speech_to_image_diffusion.py b/examples/community/speech_to_image_diffusion.py index 3537ef89e1a1..9cb5a2a8c7f9 100644 --- a/examples/community/speech_to_image_diffusion.py +++ b/examples/community/speech_to_image_diffusion.py @@ -76,10 +76,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): diff --git a/examples/community/stable_diffusion_comparison.py b/examples/community/stable_diffusion_comparison.py index dab5705b3370..2b510a64f854 100644 --- a/examples/community/stable_diffusion_comparison.py +++ b/examples/community/stable_diffusion_comparison.py @@ -96,10 +96,10 @@ def text2img_sd1_1( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -133,10 +133,10 @@ def text2img_sd1_2( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -170,10 +170,10 @@ def text2img_sd1_3( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -207,10 +207,10 @@ def text2img_sd1_4( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -244,10 +244,10 @@ def _call_( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -276,7 +276,7 @@ def _call_( generator (`torch.Generator`, optional): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, optional): + latents (`torch.Tensor`, optional): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/examples/community/stable_diffusion_controlnet_img2img.py b/examples/community/stable_diffusion_controlnet_img2img.py index 74674e65f0ef..c7c88d6fdcc7 100644 --- a/examples/community/stable_diffusion_controlnet_img2img.py +++ b/examples/community/stable_diffusion_controlnet_img2img.py @@ -189,8 +189,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -207,10 +207,10 @@ def _encode_prompt( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -600,7 +600,7 @@ def __call__( prompt: Union[str, List[str]] = None, image: Union[torch.Tensor, PIL.Image.Image] = None, controlnet_conditioning_image: Union[ - torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image] + torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image] ] = None, strength: float = 0.8, height: Optional[int] = None, @@ -611,12 +611,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, @@ -633,9 +633,9 @@ def __call__( image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will be masked out with `mask_image` and repainted according to `prompt`. - controlnet_conditioning_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`): + controlnet_conditioning_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. strength (`float`, *optional*): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -667,14 +667,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -686,7 +686,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/stable_diffusion_controlnet_inpaint.py b/examples/community/stable_diffusion_controlnet_inpaint.py index a13497dddcaf..b473ffe79933 100644 --- a/examples/community/stable_diffusion_controlnet_inpaint.py +++ b/examples/community/stable_diffusion_controlnet_inpaint.py @@ -288,8 +288,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -306,10 +306,10 @@ def _encode_prompt( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -744,7 +744,7 @@ def __call__( image: Union[torch.Tensor, PIL.Image.Image] = None, mask_image: Union[torch.Tensor, PIL.Image.Image] = None, controlnet_conditioning_image: Union[ - torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image] + torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image] ] = None, height: Optional[int] = None, width: Optional[int] = None, @@ -754,12 +754,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, @@ -779,9 +779,9 @@ def __call__( repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - controlnet_conditioning_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`): + controlnet_conditioning_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -807,14 +807,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -826,7 +826,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py index 14c4e4aa6d4e..8928f34239e3 100644 --- a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py +++ b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py @@ -273,8 +273,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -291,10 +291,10 @@ def _encode_prompt( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -731,7 +731,7 @@ def __call__( image: Union[torch.Tensor, PIL.Image.Image] = None, mask_image: Union[torch.Tensor, PIL.Image.Image] = None, controlnet_conditioning_image: Union[ - torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image] + torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image] ] = None, strength: float = 0.8, height: Optional[int] = None, @@ -742,12 +742,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: float = 1.0, @@ -767,9 +767,9 @@ def __call__( repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - controlnet_conditioning_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`): + controlnet_conditioning_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. strength (`float`, *optional*): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -801,14 +801,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -820,7 +820,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/stable_diffusion_controlnet_reference.py b/examples/community/stable_diffusion_controlnet_reference.py index 16f7f589b70b..e4d8e12f85a9 100644 --- a/examples/community/stable_diffusion_controlnet_reference.py +++ b/examples/community/stable_diffusion_controlnet_reference.py @@ -100,14 +100,14 @@ def __call__( self, prompt: Union[str, List[str]] = None, image: Union[ - torch.FloatTensor, + torch.Tensor, PIL.Image.Image, np.ndarray, - List[torch.FloatTensor], + List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray], ] = None, - ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + ref_image: Union[torch.Tensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -116,12 +116,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, @@ -139,17 +139,17 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in init, images must be passed as a list such that each element of the list can be correctly batched for input to a single controlnet. - ref_image (`torch.FloatTensor`, `PIL.Image.Image`): + ref_image (`torch.Tensor`, `PIL.Image.Image`): The Reference Control input condition. Reference Control uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to Reference Control as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to Reference Control as is. `PIL.Image.Image` can also be accepted as an image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -176,14 +176,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -195,7 +195,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. @@ -374,10 +374,10 @@ def __call__( def hacked_basic_transformer_inner_forward( self, - hidden_states: torch.FloatTensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, timestep: Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, @@ -492,12 +492,12 @@ def hacked_mid_forward(self, *args, **kwargs): def hack_CrossAttnDownBlock2D_forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 @@ -588,14 +588,14 @@ def hacked_DownBlock2D_forward(self, hidden_states, temb=None, *args, **kwargs): def hacked_CrossAttnUpBlock2D_forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 # TODO(Patrick, William) - attention mask is not used diff --git a/examples/community/stable_diffusion_ipex.py b/examples/community/stable_diffusion_ipex.py index dd648fd8c708..92588ba8a2e8 100644 --- a/examples/community/stable_diffusion_ipex.py +++ b/examples/community/stable_diffusion_ipex.py @@ -311,8 +311,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -330,10 +330,10 @@ def _encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -567,12 +567,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): @@ -608,14 +608,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -627,7 +627,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/stable_diffusion_mega.py b/examples/community/stable_diffusion_mega.py index e53afb703e24..95b4b03e4de1 100644 --- a/examples/community/stable_diffusion_mega.py +++ b/examples/community/stable_diffusion_mega.py @@ -99,8 +99,8 @@ def components(self) -> Dict[str, Any]: def inpaint( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -110,7 +110,7 @@ def inpaint( generator: Optional[torch.Generator] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline @@ -134,7 +134,7 @@ def inpaint( def img2img( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -144,7 +144,7 @@ def img2img( generator: Optional[torch.Generator] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -177,10 +177,10 @@ def text2img( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): # For more information on how this function https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline diff --git a/examples/community/stable_diffusion_reference.py b/examples/community/stable_diffusion_reference.py index c2dd184c2fa4..2352b92c30e9 100644 --- a/examples/community/stable_diffusion_reference.py +++ b/examples/community/stable_diffusion_reference.py @@ -268,10 +268,10 @@ def check_inputs( width: int, callback_steps: Optional[int], negative_prompt: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[torch.Tensor] = None, - ip_adapter_image_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image_embeds: Optional[torch.Tensor] = None, callback_on_step_end_tensor_inputs: Optional[List[str]] = None, ) -> None: """ @@ -283,10 +283,10 @@ def check_inputs( width (int): The width of the input image. callback_steps (Optional[int]): The number of steps to perform the callback on. negative_prompt (Optional[str]): The negative prompt text. - prompt_embeds (Optional[torch.FloatTensor]): The prompt embeddings. - negative_prompt_embeds (Optional[torch.FloatTensor]): The negative prompt embeddings. + prompt_embeds (Optional[torch.Tensor]): The prompt embeddings. + negative_prompt_embeds (Optional[torch.Tensor]): The negative prompt embeddings. ip_adapter_image (Optional[torch.Tensor]): The input adapter image. - ip_adapter_image_embeds (Optional[torch.FloatTensor]): The input adapter image embeddings. + ip_adapter_image_embeds (Optional[torch.Tensor]): The input adapter image embeddings. callback_on_step_end_tensor_inputs (Optional[List[str]]): The list of tensor inputs to perform the callback on. Raises: @@ -357,11 +357,11 @@ def _encode_prompt( num_images_per_prompt: int, do_classifier_free_guidance: bool, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Encodes the prompt into embeddings. @@ -371,13 +371,13 @@ def _encode_prompt( num_images_per_prompt (int): The number of images per prompt. do_classifier_free_guidance (bool): Whether to use classifier-free guidance. negative_prompt (Optional[Union[str, List[str]]], optional): The negative prompt text or a list of negative prompt texts. Defaults to None. - prompt_embeds (Optional[torch.FloatTensor], optional): The prompt embeddings. Defaults to None. - negative_prompt_embeds (Optional[torch.FloatTensor], optional): The negative prompt embeddings. Defaults to None. + prompt_embeds (Optional[torch.Tensor], optional): The prompt embeddings. Defaults to None. + negative_prompt_embeds (Optional[torch.Tensor], optional): The negative prompt embeddings. Defaults to None. lora_scale (Optional[float], optional): The LoRA scale. Defaults to None. **kwargs: Additional keyword arguments. Returns: - torch.FloatTensor: The encoded prompt embeddings. + torch.Tensor: The encoded prompt embeddings. """ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) @@ -407,11 +407,11 @@ def encode_prompt( num_images_per_prompt: int, do_classifier_free_guidance: bool, negative_prompt: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Encodes the prompt into text encoder hidden states. @@ -428,10 +428,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -813,7 +813,7 @@ def run_safety_checker( def __call__( self, prompt: Union[str, List[str]] = None, - ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + ref_image: Union[torch.Tensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -822,12 +822,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -844,9 +844,9 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - ref_image (`torch.FloatTensor`, `PIL.Image.Image`): + ref_image (`torch.Tensor`, `PIL.Image.Image`): The Reference Control input condition. Reference Control uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to Reference Control as is. `PIL.Image.Image` can + the type is specified as `torch.Tensor`, it is passed to Reference Control as is. `PIL.Image.Image` can also be accepted as an image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -873,14 +873,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -892,7 +892,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. @@ -1017,10 +1017,10 @@ def __call__( def hacked_basic_transformer_inner_forward( self, - hidden_states: torch.FloatTensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, timestep: Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, @@ -1135,12 +1135,12 @@ def hacked_mid_forward(self, *args, **kwargs): def hack_CrossAttnDownBlock2D_forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 @@ -1191,10 +1191,10 @@ def hack_CrossAttnDownBlock2D_forward( def hacked_DownBlock2D_forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, **kwargs: Any, - ) -> Tuple[torch.FloatTensor, ...]: + ) -> Tuple[torch.Tensor, ...]: eps = 1e-6 output_states = () @@ -1236,15 +1236,15 @@ def hacked_DownBlock2D_forward( def hacked_CrossAttnUpBlock2D_forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: eps = 1e-6 # TODO(Patrick, William) - attention mask is not used for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)): @@ -1292,12 +1292,12 @@ def hacked_CrossAttnUpBlock2D_forward( def hacked_UpBlock2D_forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, **kwargs: Any, - ) -> torch.FloatTensor: + ) -> torch.Tensor: eps = 1e-6 for i, resnet in enumerate(self.resnets): # pop res hidden states diff --git a/examples/community/stable_diffusion_repaint.py b/examples/community/stable_diffusion_repaint.py index 02bef293bba8..2addc5a62dbb 100644 --- a/examples/community/stable_diffusion_repaint.py +++ b/examples/community/stable_diffusion_repaint.py @@ -285,8 +285,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -303,10 +303,10 @@ def _encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -606,8 +606,8 @@ def prepare_mask_latents( def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -618,12 +618,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -671,14 +671,14 @@ def __call__( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -690,7 +690,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/stable_diffusion_tensorrt_img2img.py b/examples/community/stable_diffusion_tensorrt_img2img.py index 90ec138398cf..7264a60506fe 100755 --- a/examples/community/stable_diffusion_tensorrt_img2img.py +++ b/examples/community/stable_diffusion_tensorrt_img2img.py @@ -962,7 +962,7 @@ def __loadResources(self, image_height, image_width, batch_size): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 0.8, num_inference_steps: int = 50, guidance_scale: float = 7.5, diff --git a/examples/community/stable_diffusion_tensorrt_inpaint.py b/examples/community/stable_diffusion_tensorrt_inpaint.py index 9ace697cc7a6..b2d61a3dab93 100755 --- a/examples/community/stable_diffusion_tensorrt_inpaint.py +++ b/examples/community/stable_diffusion_tensorrt_inpaint.py @@ -962,8 +962,8 @@ def __loadResources(self, image_height, image_width, batch_size): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 1.0, num_inference_steps: int = 50, guidance_scale: float = 7.5, diff --git a/examples/community/stable_diffusion_xl_reference.py b/examples/community/stable_diffusion_xl_reference.py index 7282582baf91..107afc1f8b7a 100644 --- a/examples/community/stable_diffusion_xl_reference.py +++ b/examples/community/stable_diffusion_xl_reference.py @@ -190,7 +190,7 @@ def __call__( self, prompt: Union[str, List[str]] = None, prompt_2: Optional[Union[str, List[str]]] = None, - ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + ref_image: Union[torch.Tensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -201,14 +201,14 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -335,10 +335,10 @@ def __call__( def hacked_basic_transformer_inner_forward( self, - hidden_states: torch.FloatTensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, timestep: Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, @@ -453,12 +453,12 @@ def hacked_mid_forward(self, *args, **kwargs): def hack_CrossAttnDownBlock2D_forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 @@ -549,14 +549,14 @@ def hacked_DownBlock2D_forward(self, hidden_states, temb=None, *args, **kwargs): def hacked_CrossAttnUpBlock2D_forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, ): eps = 1e-6 # TODO(Patrick, William) - attention mask is not used diff --git a/examples/community/stable_unclip.py b/examples/community/stable_unclip.py index 6acca20d6a78..f13c4e0a490b 100644 --- a/examples/community/stable_unclip.py +++ b/examples/community/stable_unclip.py @@ -191,7 +191,7 @@ def __call__( num_images_per_prompt: int = 1, prior_num_inference_steps: int = 25, generator: Optional[torch.Generator] = None, - prior_latents: Optional[torch.FloatTensor] = None, + prior_latents: Optional[torch.Tensor] = None, text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, text_attention_mask: Optional[torch.Tensor] = None, prior_guidance_scale: float = 4.0, diff --git a/examples/community/text_inpainting.py b/examples/community/text_inpainting.py index ea4da966bb71..c4378ab96f28 100644 --- a/examples/community/text_inpainting.py +++ b/examples/community/text_inpainting.py @@ -125,7 +125,7 @@ def __init__( def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], text: str, height: int = 512, width: int = 512, @@ -135,10 +135,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -177,7 +177,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -189,7 +189,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/community/tiled_upscaling.py b/examples/community/tiled_upscaling.py index bf290042e822..313b5fc6f76a 100644 --- a/examples/community/tiled_upscaling.py +++ b/examples/community/tiled_upscaling.py @@ -193,8 +193,8 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + latents: Optional[torch.Tensor] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, tile_size: int = 128, tile_border: int = 32, @@ -206,7 +206,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`): + image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.Tensor`): `Image`, or tensor representing an image batch which will be upscaled. * num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -228,7 +228,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/examples/community/unclip_image_interpolation.py b/examples/community/unclip_image_interpolation.py index e3bb44e5030b..210bd61ecd1d 100644 --- a/examples/community/unclip_image_interpolation.py +++ b/examples/community/unclip_image_interpolation.py @@ -207,14 +207,14 @@ def _encode_image(self, image, device, num_images_per_prompt, image_embeddings: @torch.no_grad() def __call__( self, - image: Optional[Union[List[PIL.Image.Image], torch.FloatTensor]] = None, + image: Optional[Union[List[PIL.Image.Image], torch.Tensor]] = None, steps: int = 5, decoder_num_inference_steps: int = 25, super_res_num_inference_steps: int = 7, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, image_embeddings: Optional[torch.Tensor] = None, - decoder_latents: Optional[torch.FloatTensor] = None, - super_res_latents: Optional[torch.FloatTensor] = None, + decoder_latents: Optional[torch.Tensor] = None, + super_res_latents: Optional[torch.Tensor] = None, decoder_guidance_scale: float = 8.0, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -223,7 +223,7 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image (`List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`List[PIL.Image.Image]` or `torch.Tensor`): The images to use for the image interpolation. Only accepts a list of two PIL Images or If you provide a tensor, it needs to comply with the configuration of [this](https://huggingface.co/fusing/karlo-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json) @@ -242,9 +242,9 @@ def __call__( image_embeddings (`torch.Tensor`, *optional*): Pre-defined image embeddings that can be derived from the image encoder. Pre-defined image embeddings can be passed for tasks like image interpolations. `image` can the be left to `None`. - decoder_latents (`torch.FloatTensor` of shape (batch size, channels, height, width), *optional*): + decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. - super_res_latents (`torch.FloatTensor` of shape (batch size, channels, super res height, super res width), *optional*): + super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. decoder_guidance_scale (`float`, *optional*, defaults to 4.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). @@ -272,19 +272,19 @@ def __call__( raise AssertionError( f"Expected 'image' List to contain PIL.Image.Image, but passed 'image' contents are {type(image[0])} and {type(image[1])}" ) - elif isinstance(image, torch.FloatTensor): + elif isinstance(image, torch.Tensor): if image.shape[0] != 2: raise AssertionError( - f"Expected 'image' to be torch.FloatTensor of shape 2 in 0th dimension, but passed 'image' size is {image.shape[0]}" + f"Expected 'image' to be torch.Tensor of shape 2 in 0th dimension, but passed 'image' size is {image.shape[0]}" ) elif isinstance(image_embeddings, torch.Tensor): if image_embeddings.shape[0] != 2: raise AssertionError( - f"Expected 'image_embeddings' to be torch.FloatTensor of shape 2 in 0th dimension, but passed 'image_embeddings' shape is {image_embeddings.shape[0]}" + f"Expected 'image_embeddings' to be torch.Tensor of shape 2 in 0th dimension, but passed 'image_embeddings' shape is {image_embeddings.shape[0]}" ) else: raise AssertionError( - f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or Torch.FloatTensor respectively. Received {type(image)} and {type(image_embeddings)} repsectively" + f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or torch.Tensor respectively. Received {type(image)} and {type(image_embeddings)} repsectively" ) original_image_embeddings = self._encode_image( diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py index 241e661536d3..c866ce2ae904 100644 --- a/examples/community/wildcard_stable_diffusion.py +++ b/examples/community/wildcard_stable_diffusion.py @@ -166,10 +166,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = [], @@ -206,7 +206,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -218,7 +218,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py b/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py index 1e88cb67ee71..e7d934dd07a8 100644 --- a/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py @@ -336,7 +336,7 @@ def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32): data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/examples/consistency_distillation/train_lcm_distill_sd_wds.py b/examples/consistency_distillation/train_lcm_distill_sd_wds.py index 5dcad9f6cc39..7fbcb5d6fb91 100644 --- a/examples/consistency_distillation/train_lcm_distill_sd_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_sd_wds.py @@ -314,7 +314,7 @@ def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32): data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py b/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py index a7deca72a86f..c8b91e7abdd6 100644 --- a/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py +++ b/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py @@ -406,7 +406,7 @@ def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32): data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/examples/research_projects/consistency_training/train_cm_ct_unconditional.py b/examples/research_projects/consistency_training/train_cm_ct_unconditional.py index 947de230b6b2..b7a1e2a545f8 100644 --- a/examples/research_projects/consistency_training/train_cm_ct_unconditional.py +++ b/examples/research_projects/consistency_training/train_cm_ct_unconditional.py @@ -126,7 +126,7 @@ def get_karras_sigmas( return sigmas -def get_discretized_lognormal_weights(noise_levels: torch.FloatTensor, p_mean: float = -1.1, p_std: float = 2.0): +def get_discretized_lognormal_weights(noise_levels: torch.Tensor, p_mean: float = -1.1, p_std: float = 2.0): """ Calculates the unnormalized weights for a 1D array of noise level sigma_i based on the discretized lognormal" " distribution used in the iCT paper (given in Equation 10). @@ -137,14 +137,14 @@ def get_discretized_lognormal_weights(noise_levels: torch.FloatTensor, p_mean: f return weights -def get_loss_weighting_schedule(noise_levels: torch.FloatTensor): +def get_loss_weighting_schedule(noise_levels: torch.Tensor): """ Calculates the loss weighting schedule lambda given a set of noise levels. """ return 1.0 / (noise_levels[1:] - noise_levels[:-1]) -def add_noise(original_samples: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.FloatTensor): +def add_noise(original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor): # Make sure timesteps (Karras sigmas) have the same device and dtype as original_samples sigmas = timesteps.to(device=original_samples.device, dtype=original_samples.dtype) while len(sigmas.shape) < len(original_samples.shape): diff --git a/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb b/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb index 6997c74bcad1..bde093802a5d 100644 --- a/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb +++ b/examples/research_projects/geodiff/geodiff_molecule_conformation.ipynb @@ -737,11 +737,11 @@ "class MoleculeGNNOutput(BaseOutput):\n", " \"\"\"\n", " Args:\n", - " sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):\n", + " sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):\n", " Hidden states output. Output of last layer of model.\n", " \"\"\"\n", "\n", - " sample: torch.FloatTensor\n", + " sample: torch.Tensor\n", "\n", "\n", "class MultiLayerPerceptron(nn.Module):\n", @@ -1354,7 +1354,7 @@ " r\"\"\"\n", " Args:\n", " sample: packed torch geometric object\n", - " timestep (`torch.FloatTensor` or `float` or `int): TODO verify type and shape (batch) timesteps\n", + " timestep (`torch.Tensor` or `float` or `int): TODO verify type and shape (batch) timesteps\n", " return_dict (`bool`, *optional*, defaults to `True`):\n", " Whether or not to return a [`~models.molecule_gnn.MoleculeGNNOutput`] instead of a plain tuple.\n", " Returns:\n", @@ -1404,7 +1404,7 @@ " if not return_dict:\n", " return (-eps_pos,)\n", "\n", - " return MoleculeGNNOutput(sample=torch.FloatTensor(-eps_pos).to(pos.device))" + " return MoleculeGNNOutput(sample=torch.Tensor(-eps_pos).to(pos.device))" ], "metadata": { "id": "MCeZA1qQXzoK" diff --git a/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py b/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py index dcbc2704b833..61b1cbef195e 100644 --- a/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py +++ b/examples/research_projects/promptdiffusion/pipeline_prompt_diffusion.py @@ -279,8 +279,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -312,8 +312,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -333,10 +333,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -852,7 +852,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -906,9 +906,9 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -928,10 +928,10 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for @@ -963,14 +963,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. @@ -981,7 +981,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py b/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py index 4e9015960157..46cabd863dfa 100644 --- a/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py +++ b/examples/research_projects/promptdiffusion/promptdiffusioncontrolnet.py @@ -181,11 +181,11 @@ def __init__( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, - controlnet_cond: torch.FloatTensor, - controlnet_query_cond: torch.FloatTensor, + controlnet_cond: torch.Tensor, + controlnet_query_cond: torch.Tensor, conditioning_scale: float = 1.0, class_labels: Optional[torch.Tensor] = None, timestep_cond: Optional[torch.Tensor] = None, @@ -194,20 +194,20 @@ def forward( cross_attention_kwargs: Optional[Dict[str, Any]] = None, guess_mode: bool = False, return_dict: bool = True, - ) -> Union[ControlNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]: + ) -> Union[ControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]: """ The [`~PromptDiffusionControlNetModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor. timestep (`Union[torch.Tensor, float, int]`): The number of timesteps to denoise an input. encoder_hidden_states (`torch.Tensor`): The encoder hidden states. - controlnet_cond (`torch.FloatTensor`): + controlnet_cond (`torch.Tensor`): The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. - controlnet_query_cond (`torch.FloatTensor`): + controlnet_query_cond (`torch.Tensor`): The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. conditioning_scale (`float`, defaults to `1.0`): The scale factor for ControlNet outputs. diff --git a/examples/research_projects/rdm/pipeline_rdm.py b/examples/research_projects/rdm/pipeline_rdm.py index e0c4847c7e39..201acb95aabd 100644 --- a/examples/research_projects/rdm/pipeline_rdm.py +++ b/examples/research_projects/rdm/pipeline_rdm.py @@ -163,11 +163,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, knn: Optional[int] = 10, **kwargs, @@ -199,11 +199,11 @@ def __call__( generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -213,7 +213,7 @@ def __call__( Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/examples/research_projects/rdm/retriever.py b/examples/research_projects/rdm/retriever.py index 16518ed1bc42..6be9785a21f3 100644 --- a/examples/research_projects/rdm/retriever.py +++ b/examples/research_projects/rdm/retriever.py @@ -20,7 +20,7 @@ def normalize_images(images: List[Image.Image]): return images -def preprocess_images(images: List[np.array], feature_extractor: CLIPFeatureExtractor) -> torch.FloatTensor: +def preprocess_images(images: List[np.array], feature_extractor: CLIPFeatureExtractor) -> torch.Tensor: """ Preprocesses a list of images into a batch of tensors. @@ -29,7 +29,7 @@ def preprocess_images(images: List[np.array], feature_extractor: CLIPFeatureExtr A list of images to preprocess. Returns: - :obj:`torch.FloatTensor`: A batch of tensors. + :obj:`torch.Tensor`: A batch of tensors. """ images = [np.array(image) for image in images] images = [(image + 1.0) / 2.0 for image in images] diff --git a/scripts/convert_music_spectrogram_to_diffusers.py b/scripts/convert_music_spectrogram_to_diffusers.py index 41ee8b914774..1ac9ceddb57e 100644 --- a/scripts/convert_music_spectrogram_to_diffusers.py +++ b/scripts/convert_music_spectrogram_to_diffusers.py @@ -17,109 +17,99 @@ def load_notes_encoder(weights, model): - model.token_embedder.weight = nn.Parameter(torch.FloatTensor(weights["token_embedder"]["embedding"])) - model.position_encoding.weight = nn.Parameter( - torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False - ) + model.token_embedder.weight = nn.Parameter(torch.Tensor(weights["token_embedder"]["embedding"])) + model.position_encoding.weight = nn.Parameter(torch.Tensor(weights["Embed_0"]["embedding"]), requires_grad=False) for lyr_num, lyr in enumerate(model.encoders): ly_weight = weights[f"layers_{lyr_num}"] - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) + lyr.layer[0].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_attention_layer_norm"]["scale"])) attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wo"]["kernel"].T)) - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + model.layer_norm.weight = nn.Parameter(torch.Tensor(weights["encoder_norm"]["scale"])) return model def load_continuous_encoder(weights, model): - model.input_proj.weight = nn.Parameter(torch.FloatTensor(weights["input_proj"]["kernel"].T)) + model.input_proj.weight = nn.Parameter(torch.Tensor(weights["input_proj"]["kernel"].T)) - model.position_encoding.weight = nn.Parameter( - torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False - ) + model.position_encoding.weight = nn.Parameter(torch.Tensor(weights["Embed_0"]["embedding"]), requires_grad=False) for lyr_num, lyr in enumerate(model.encoders): ly_weight = weights[f"layers_{lyr_num}"] attention_weights = ly_weight["attention"] - lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) - lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) - lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) - lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_attention_layer_norm"]["scale"]) - ) + lyr.layer[0].SelfAttention.q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].SelfAttention.k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].SelfAttention.v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].SelfAttention.o.weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_attention_layer_norm"]["scale"])) - lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) - lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) - lyr.layer[1].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[1].DenseReluDense.wi_0.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wi_1.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) + lyr.layer[1].DenseReluDense.wo.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[1].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_mlp_layer_norm"]["scale"])) - model.layer_norm.weight = nn.Parameter(torch.FloatTensor(weights["encoder_norm"]["scale"])) + model.layer_norm.weight = nn.Parameter(torch.Tensor(weights["encoder_norm"]["scale"])) return model def load_decoder(weights, model): - model.conditioning_emb[0].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense0"]["kernel"].T)) - model.conditioning_emb[2].weight = nn.Parameter(torch.FloatTensor(weights["time_emb_dense1"]["kernel"].T)) + model.conditioning_emb[0].weight = nn.Parameter(torch.Tensor(weights["time_emb_dense0"]["kernel"].T)) + model.conditioning_emb[2].weight = nn.Parameter(torch.Tensor(weights["time_emb_dense1"]["kernel"].T)) - model.position_encoding.weight = nn.Parameter( - torch.FloatTensor(weights["Embed_0"]["embedding"]), requires_grad=False - ) + model.position_encoding.weight = nn.Parameter(torch.Tensor(weights["Embed_0"]["embedding"]), requires_grad=False) model.continuous_inputs_projection.weight = nn.Parameter( - torch.FloatTensor(weights["continuous_inputs_projection"]["kernel"].T) + torch.Tensor(weights["continuous_inputs_projection"]["kernel"].T) ) for lyr_num, lyr in enumerate(model.decoders): ly_weight = weights[f"layers_{lyr_num}"] lyr.layer[0].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) + torch.Tensor(ly_weight["pre_self_attention_layer_norm"]["scale"]) ) lyr.layer[0].FiLMLayer.scale_bias.weight = nn.Parameter( - torch.FloatTensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T) + torch.Tensor(ly_weight["FiLMLayer_0"]["DenseGeneral_0"]["kernel"].T) ) attention_weights = ly_weight["self_attention"] - lyr.layer[0].attention.to_q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) - lyr.layer[0].attention.to_k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[0].attention.to_v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) - lyr.layer[0].attention.to_out[0].weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[0].attention.to_q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T)) + lyr.layer[0].attention.to_k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T)) + lyr.layer[0].attention.to_v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T)) + lyr.layer[0].attention.to_out[0].weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T)) attention_weights = ly_weight["MultiHeadDotProductAttention_0"] - lyr.layer[1].attention.to_q.weight = nn.Parameter(torch.FloatTensor(attention_weights["query"]["kernel"].T)) - lyr.layer[1].attention.to_k.weight = nn.Parameter(torch.FloatTensor(attention_weights["key"]["kernel"].T)) - lyr.layer[1].attention.to_v.weight = nn.Parameter(torch.FloatTensor(attention_weights["value"]["kernel"].T)) - lyr.layer[1].attention.to_out[0].weight = nn.Parameter(torch.FloatTensor(attention_weights["out"]["kernel"].T)) + lyr.layer[1].attention.to_q.weight = nn.Parameter(torch.Tensor(attention_weights["query"]["kernel"].T)) + lyr.layer[1].attention.to_k.weight = nn.Parameter(torch.Tensor(attention_weights["key"]["kernel"].T)) + lyr.layer[1].attention.to_v.weight = nn.Parameter(torch.Tensor(attention_weights["value"]["kernel"].T)) + lyr.layer[1].attention.to_out[0].weight = nn.Parameter(torch.Tensor(attention_weights["out"]["kernel"].T)) lyr.layer[1].layer_norm.weight = nn.Parameter( - torch.FloatTensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) + torch.Tensor(ly_weight["pre_cross_attention_layer_norm"]["scale"]) ) - lyr.layer[2].layer_norm.weight = nn.Parameter(torch.FloatTensor(ly_weight["pre_mlp_layer_norm"]["scale"])) + lyr.layer[2].layer_norm.weight = nn.Parameter(torch.Tensor(ly_weight["pre_mlp_layer_norm"]["scale"])) lyr.layer[2].film.scale_bias.weight = nn.Parameter( - torch.FloatTensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) + torch.Tensor(ly_weight["FiLMLayer_1"]["DenseGeneral_0"]["kernel"].T) ) - lyr.layer[2].DenseReluDense.wi_0.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) - lyr.layer[2].DenseReluDense.wi_1.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) - lyr.layer[2].DenseReluDense.wo.weight = nn.Parameter(torch.FloatTensor(ly_weight["mlp"]["wo"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wi_0.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_0"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wi_1.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wi_1"]["kernel"].T)) + lyr.layer[2].DenseReluDense.wo.weight = nn.Parameter(torch.Tensor(ly_weight["mlp"]["wo"]["kernel"].T)) - model.decoder_norm.weight = nn.Parameter(torch.FloatTensor(weights["decoder_norm"]["scale"])) + model.decoder_norm.weight = nn.Parameter(torch.Tensor(weights["decoder_norm"]["scale"])) - model.spec_out.weight = nn.Parameter(torch.FloatTensor(weights["spec_out_dense"]["kernel"].T)) + model.spec_out.weight = nn.Parameter(torch.Tensor(weights["spec_out_dense"]["kernel"].T)) return model diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 0f4481570829..ce504eda3d2a 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -29,10 +29,10 @@ PipelineImageInput = Union[ PIL.Image.Image, np.ndarray, - torch.FloatTensor, + torch.Tensor, List[PIL.Image.Image], List[np.ndarray], - List[torch.FloatTensor], + List[torch.Tensor], ] PipelineDepthInput = PipelineImageInput @@ -110,7 +110,7 @@ def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.nd return images @staticmethod - def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: + def numpy_to_pt(images: np.ndarray) -> torch.Tensor: """ Convert a NumPy image to a PyTorch tensor. """ @@ -121,7 +121,7 @@ def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: return images @staticmethod - def pt_to_numpy(images: torch.FloatTensor) -> np.ndarray: + def pt_to_numpy(images: torch.Tensor) -> np.ndarray: """ Convert a PyTorch tensor to a NumPy image. """ @@ -561,15 +561,15 @@ def preprocess( def postprocess( self, - image: torch.FloatTensor, + image: torch.Tensor, output_type: str = "pil", do_denormalize: Optional[List[bool]] = None, - ) -> Union[PIL.Image.Image, np.ndarray, torch.FloatTensor]: + ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: """ Postprocess the image output from tensor to `output_type`. Args: - image (`torch.FloatTensor`): + image (`torch.Tensor`): The image input, should be a pytorch tensor with shape `B x C x H x W`. output_type (`str`, *optional*, defaults to `pil`): The output type of the image, can be one of `pil`, `np`, `pt`, `latent`. @@ -578,7 +578,7 @@ def postprocess( `VaeImageProcessor` config. Returns: - `PIL.Image.Image`, `np.ndarray` or `torch.FloatTensor`: + `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`: The postprocessed image. """ if not isinstance(image, torch.Tensor): @@ -738,15 +738,15 @@ def numpy_to_depth(self, images: np.ndarray) -> List[PIL.Image.Image]: def postprocess( self, - image: torch.FloatTensor, + image: torch.Tensor, output_type: str = "pil", do_denormalize: Optional[List[bool]] = None, - ) -> Union[PIL.Image.Image, np.ndarray, torch.FloatTensor]: + ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: """ Postprocess the image output from tensor to `output_type`. Args: - image (`torch.FloatTensor`): + image (`torch.Tensor`): The image input, should be a pytorch tensor with shape `B x C x H x W`. output_type (`str`, *optional*, defaults to `pil`): The output type of the image, can be one of `pil`, `np`, `pt`, `latent`. @@ -755,7 +755,7 @@ def postprocess( `VaeImageProcessor` config. Returns: - `PIL.Image.Image`, `np.ndarray` or `torch.FloatTensor`: + `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`: The postprocessed image. """ if not isinstance(image, torch.Tensor): @@ -793,8 +793,8 @@ def postprocess( def preprocess( self, - rgb: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - depth: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], + rgb: Union[torch.Tensor, PIL.Image.Image, np.ndarray], + depth: Union[torch.Tensor, PIL.Image.Image, np.ndarray], height: Optional[int] = None, width: Optional[int] = None, target_res: Optional[int] = None, @@ -933,13 +933,13 @@ def __init__( ) @staticmethod - def downsample(mask: torch.FloatTensor, batch_size: int, num_queries: int, value_embed_dim: int): + def downsample(mask: torch.Tensor, batch_size: int, num_queries: int, value_embed_dim: int): """ Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. If the aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued. Args: - mask (`torch.FloatTensor`): + mask (`torch.Tensor`): The input mask tensor generated with `IPAdapterMaskProcessor.preprocess()`. batch_size (`int`): The batch size. @@ -949,7 +949,7 @@ def downsample(mask: torch.FloatTensor, batch_size: int, num_queries: int, value The dimensionality of the value embeddings. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The downsampled mask tensor. """ diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 50866e3a7a8c..3d4fccb20779 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -282,15 +282,15 @@ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0): def forward( self, - hidden_states: torch.FloatTensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, timestep: Optional[torch.LongTensor] = None, cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -477,10 +477,10 @@ def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs): def forward( self, - hidden_states: torch.FloatTensor, + hidden_states: torch.Tensor, num_frames: int, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_hidden_states: Optional[torch.Tensor] = None, + ) -> torch.Tensor: # Notice that normalization is always applied before the real computation in the following blocks. # 0. Self-Attention batch_size = hidden_states.shape[0] diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index ea1c987e95c6..cbb07eafa37f 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -503,9 +503,9 @@ def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProce def forward( self, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, **cross_attention_kwargs, ) -> torch.Tensor: r""" @@ -751,10 +751,10 @@ class AttnProcessor: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, ) -> torch.Tensor: @@ -863,9 +863,9 @@ def __init__( def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: batch_size, sequence_length, _ = hidden_states.shape attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) @@ -928,9 +928,9 @@ class AttnAddedKVProcessor: def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, *args, **kwargs, ) -> torch.Tensor: @@ -1001,9 +1001,9 @@ def __init__(self): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, *args, **kwargs, ) -> torch.Tensor: @@ -1080,9 +1080,9 @@ def __init__(self, attention_op: Optional[Callable] = None): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: residual = hidden_states hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) @@ -1151,13 +1151,13 @@ def __init__(self, attention_op: Optional[Callable] = None): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1243,13 +1243,13 @@ def __init__(self): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1349,13 +1349,13 @@ def __init__(self): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1448,13 +1448,13 @@ def __init__(self): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1581,10 +1581,10 @@ def __init__( def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: batch_size, sequence_length, _ = ( hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape ) @@ -1692,10 +1692,10 @@ def __init__( def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: batch_size, sequence_length, _ = hidden_states.shape attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) if self.train_q_out: @@ -1773,10 +1773,10 @@ def __init__(self, slice_size: int): def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: residual = hidden_states input_ndim = hidden_states.ndim @@ -1860,11 +1860,11 @@ def __init__(self, slice_size): def __call__( self, attn: "Attention", - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: residual = hidden_states if attn.spatial_norm is not None: @@ -1957,7 +1957,7 @@ def __init__( self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) - def forward(self, f: torch.FloatTensor, zq: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, f: torch.Tensor, zq: torch.Tensor) -> torch.Tensor: f_size = f.shape[-2:] zq = F.interpolate(zq, size=f_size, mode="nearest") norm_f = self.norm_layer(f) @@ -2003,7 +2003,7 @@ def __init__( self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) - def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, **kwargs) -> torch.FloatTensor: + def __call__(self, attn: Attention, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: self_cls_name = self.__class__.__name__ deprecate( self_cls_name, @@ -2064,7 +2064,7 @@ def __init__( self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) - def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, **kwargs) -> torch.FloatTensor: + def __call__(self, attn: Attention, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: self_cls_name = self.__class__.__name__ deprecate( self_cls_name, @@ -2143,7 +2143,7 @@ def __init__( self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) - def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, **kwargs) -> torch.FloatTensor: + def __call__(self, attn: Attention, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: self_cls_name = self.__class__.__name__ deprecate( self_cls_name, @@ -2202,7 +2202,7 @@ def __init__( self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) - def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, **kwargs) -> torch.FloatTensor: + def __call__(self, attn: Attention, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: self_cls_name = self.__class__.__name__ deprecate( self_cls_name, @@ -2264,12 +2264,12 @@ def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, - ip_adapter_masks: Optional[torch.FloatTensor] = None, + ip_adapter_masks: Optional[torch.Tensor] = None, ): residual = hidden_states @@ -2467,12 +2467,12 @@ def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale def __call__( self, attn: Attention, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None, scale: float = 1.0, - ip_adapter_masks: Optional[torch.FloatTensor] = None, + ip_adapter_masks: Optional[torch.Tensor] = None, ): residual = hidden_states diff --git a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py index fc2041d2e930..85b4195077df 100644 --- a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py @@ -112,9 +112,7 @@ def __init__( self.register_to_config(force_upcast=False) @apply_forward_hook - def encode( - self, x: torch.FloatTensor, return_dict: bool = True - ) -> Union[AutoencoderKLOutput, Tuple[torch.FloatTensor]]: + def encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[AutoencoderKLOutput, Tuple[torch.Tensor]]: h = self.encoder(x) moments = self.quant_conv(h) posterior = DiagonalGaussianDistribution(moments) @@ -126,11 +124,11 @@ def encode( def _decode( self, - z: torch.FloatTensor, - image: Optional[torch.FloatTensor] = None, - mask: Optional[torch.FloatTensor] = None, + z: torch.Tensor, + image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, return_dict: bool = True, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: z = self.post_quant_conv(z) dec = self.decoder(z, image, mask) @@ -142,12 +140,12 @@ def _decode( @apply_forward_hook def decode( self, - z: torch.FloatTensor, + z: torch.Tensor, generator: Optional[torch.Generator] = None, - image: Optional[torch.FloatTensor] = None, - mask: Optional[torch.FloatTensor] = None, + image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, return_dict: bool = True, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: decoded = self._decode(z, image, mask).sample if not return_dict: @@ -157,16 +155,16 @@ def decode( def forward( self, - sample: torch.FloatTensor, - mask: Optional[torch.FloatTensor] = None, + sample: torch.Tensor, + mask: Optional[torch.Tensor] = None, sample_posterior: bool = False, return_dict: bool = True, generator: Optional[torch.Generator] = None, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: r""" Args: - sample (`torch.FloatTensor`): Input sample. - mask (`torch.FloatTensor`, *optional*, defaults to `None`): Optional inpainting mask. + sample (`torch.Tensor`): Input sample. + mask (`torch.Tensor`, *optional*, defaults to `None`): Optional inpainting mask. sample_posterior (`bool`, *optional*, defaults to `False`): Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py index 6755cd714692..567d78d1fefe 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -237,13 +237,13 @@ def set_default_attn_processor(self): @apply_forward_hook def encode( - self, x: torch.FloatTensor, return_dict: bool = True + self, x: torch.Tensor, return_dict: bool = True ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: """ Encode a batch of images into latents. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. @@ -268,7 +268,7 @@ def encode( return AutoencoderKLOutput(latent_dist=posterior) - def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size): return self.tiled_decode(z, return_dict=return_dict) @@ -281,14 +281,12 @@ def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[Decod return DecoderOutput(sample=dec) @apply_forward_hook - def decode( - self, z: torch.FloatTensor, return_dict: bool = True, generator=None - ) -> Union[DecoderOutput, torch.FloatTensor]: + def decode(self, z: torch.Tensor, return_dict: bool = True, generator=None) -> Union[DecoderOutput, torch.Tensor]: """ Decode a batch of images. Args: - z (`torch.FloatTensor`): Input batch of latent vectors. + z (`torch.Tensor`): Input batch of latent vectors. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. @@ -321,7 +319,7 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch. b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) return b - def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput: + def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: r"""Encode a batch of images using a tiled encoder. When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -331,7 +329,7 @@ def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> Autoen output, but they should be much less noticeable. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. @@ -375,12 +373,12 @@ def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> Autoen return AutoencoderKLOutput(latent_dist=posterior) - def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: r""" Decode a batch of images using a tiled decoder. Args: - z (`torch.FloatTensor`): Input batch of latent vectors. + z (`torch.Tensor`): Input batch of latent vectors. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. @@ -425,14 +423,14 @@ def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[ def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, sample_posterior: bool = False, return_dict: bool = True, generator: Optional[torch.Generator] = None, - ) -> Union[DecoderOutput, torch.FloatTensor]: + ) -> Union[DecoderOutput, torch.Tensor]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. sample_posterior (`bool`, *optional*, defaults to `False`): Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py index b12226fa4bac..67540cb7dc7f 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py @@ -86,10 +86,10 @@ def __init__( def forward( self, - sample: torch.FloatTensor, - image_only_indicator: torch.FloatTensor, + sample: torch.Tensor, + image_only_indicator: torch.Tensor, num_frames: int = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r"""The forward method of the `Decoder` class.""" sample = self.conv_in(sample) @@ -315,13 +315,13 @@ def set_default_attn_processor(self): @apply_forward_hook def encode( - self, x: torch.FloatTensor, return_dict: bool = True + self, x: torch.Tensor, return_dict: bool = True ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: """ Encode a batch of images into latents. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. @@ -341,15 +341,15 @@ def encode( @apply_forward_hook def decode( self, - z: torch.FloatTensor, + z: torch.Tensor, num_frames: int, return_dict: bool = True, - ) -> Union[DecoderOutput, torch.FloatTensor]: + ) -> Union[DecoderOutput, torch.Tensor]: """ Decode a batch of images. Args: - z (`torch.FloatTensor`): Input batch of latent vectors. + z (`torch.Tensor`): Input batch of latent vectors. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. @@ -370,15 +370,15 @@ def decode( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, sample_posterior: bool = False, return_dict: bool = True, generator: Optional[torch.Generator] = None, num_frames: int = 1, - ) -> Union[DecoderOutput, torch.FloatTensor]: + ) -> Union[DecoderOutput, torch.Tensor]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. sample_posterior (`bool`, *optional*, defaults to `False`): Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/autoencoders/autoencoder_tiny.py b/src/diffusers/models/autoencoders/autoencoder_tiny.py index a7047acdfd74..39b885b45222 100644 --- a/src/diffusers/models/autoencoders/autoencoder_tiny.py +++ b/src/diffusers/models/autoencoders/autoencoder_tiny.py @@ -157,11 +157,11 @@ def _set_gradient_checkpointing(self, module, value: bool = False) -> None: if isinstance(module, (EncoderTiny, DecoderTiny)): module.gradient_checkpointing = value - def scale_latents(self, x: torch.FloatTensor) -> torch.FloatTensor: + def scale_latents(self, x: torch.Tensor) -> torch.Tensor: """raw latents -> [0, 1]""" return x.div(2 * self.latent_magnitude).add(self.latent_shift).clamp(0, 1) - def unscale_latents(self, x: torch.FloatTensor) -> torch.FloatTensor: + def unscale_latents(self, x: torch.Tensor) -> torch.Tensor: """[0, 1] -> raw latents""" return x.sub(self.latent_shift).mul(2 * self.latent_magnitude) @@ -194,7 +194,7 @@ def disable_tiling(self) -> None: """ self.enable_tiling(False) - def _tiled_encode(self, x: torch.FloatTensor) -> torch.FloatTensor: + def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor: r"""Encode a batch of images using a tiled encoder. When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -202,10 +202,10 @@ def _tiled_encode(self, x: torch.FloatTensor) -> torch.FloatTensor: tiles overlap and are blended together to form a smooth output. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. Returns: - `torch.FloatTensor`: Encoded batch of images. + `torch.Tensor`: Encoded batch of images. """ # scale of encoder output relative to input sf = self.spatial_scale_factor @@ -242,7 +242,7 @@ def _tiled_encode(self, x: torch.FloatTensor) -> torch.FloatTensor: tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out) return out - def _tiled_decode(self, x: torch.FloatTensor) -> torch.FloatTensor: + def _tiled_decode(self, x: torch.Tensor) -> torch.Tensor: r"""Encode a batch of images using a tiled encoder. When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -250,10 +250,10 @@ def _tiled_decode(self, x: torch.FloatTensor) -> torch.FloatTensor: tiles overlap and are blended together to form a smooth output. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. Returns: - `torch.FloatTensor`: Encoded batch of images. + `torch.Tensor`: Encoded batch of images. """ # scale of decoder output relative to input sf = self.spatial_scale_factor @@ -290,9 +290,7 @@ def _tiled_decode(self, x: torch.FloatTensor) -> torch.FloatTensor: return out @apply_forward_hook - def encode( - self, x: torch.FloatTensor, return_dict: bool = True - ) -> Union[AutoencoderTinyOutput, Tuple[torch.FloatTensor]]: + def encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[AutoencoderTinyOutput, Tuple[torch.Tensor]]: if self.use_slicing and x.shape[0] > 1: output = [ self._tiled_encode(x_slice) if self.use_tiling else self.encoder(x_slice) for x_slice in x.split(1) @@ -308,8 +306,8 @@ def encode( @apply_forward_hook def decode( - self, x: torch.FloatTensor, generator: Optional[torch.Generator] = None, return_dict: bool = True - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + self, x: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: if self.use_slicing and x.shape[0] > 1: output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.split(1)] output = torch.cat(output) @@ -323,12 +321,12 @@ def decode( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. """ diff --git a/src/diffusers/models/autoencoders/consistency_decoder_vae.py b/src/diffusers/models/autoencoders/consistency_decoder_vae.py index 7287cbd43f63..212c46537706 100644 --- a/src/diffusers/models/autoencoders/consistency_decoder_vae.py +++ b/src/diffusers/models/autoencoders/consistency_decoder_vae.py @@ -276,13 +276,13 @@ def set_default_attn_processor(self): @apply_forward_hook def encode( - self, x: torch.FloatTensor, return_dict: bool = True + self, x: torch.Tensor, return_dict: bool = True ) -> Union[ConsistencyDecoderVAEOutput, Tuple[DiagonalGaussianDistribution]]: """ Encode a batch of images into latents. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain tuple. @@ -312,22 +312,22 @@ def encode( @apply_forward_hook def decode( self, - z: torch.FloatTensor, + z: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, num_inference_steps: int = 2, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: """ Decodes the input latent vector `z` using the consistency decoder VAE model. Args: - z (torch.FloatTensor): The input latent vector. + z (torch.Tensor): The input latent vector. generator (Optional[torch.Generator]): The random number generator. Default is None. return_dict (bool): Whether to return the output as a dictionary. Default is True. num_inference_steps (int): The number of inference steps. Default is 2. Returns: - Union[DecoderOutput, Tuple[torch.FloatTensor]]: The decoded output. + Union[DecoderOutput, Tuple[torch.Tensor]]: The decoded output. """ z = (z * self.config.scaling_factor - self.means) / self.stds @@ -370,9 +370,7 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch. b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) return b - def tiled_encode( - self, x: torch.FloatTensor, return_dict: bool = True - ) -> Union[ConsistencyDecoderVAEOutput, Tuple]: + def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[ConsistencyDecoderVAEOutput, Tuple]: r"""Encode a batch of images using a tiled encoder. When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -382,7 +380,7 @@ def tiled_encode( output, but they should be much less noticeable. Args: - x (`torch.FloatTensor`): Input batch of images. + x (`torch.Tensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain tuple. @@ -429,14 +427,14 @@ def tiled_encode( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, sample_posterior: bool = False, return_dict: bool = True, generator: Optional[torch.Generator] = None, - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. sample_posterior (`bool`, *optional*, defaults to `False`): Whether to sample from the posterior. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/autoencoders/vae.py b/src/diffusers/models/autoencoders/vae.py index 75503dc6a4ad..333842905bc3 100644 --- a/src/diffusers/models/autoencoders/vae.py +++ b/src/diffusers/models/autoencoders/vae.py @@ -36,11 +36,11 @@ class DecoderOutput(BaseOutput): Output of decoding method. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): The decoded output sample from the last layer of the model. """ - sample: torch.FloatTensor + sample: torch.Tensor class Encoder(nn.Module): @@ -136,7 +136,7 @@ def __init__( self.gradient_checkpointing = False - def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, sample: torch.Tensor) -> torch.Tensor: r"""The forward method of the `Encoder` class.""" sample = self.conv_in(sample) @@ -282,9 +282,9 @@ def __init__( def forward( self, - sample: torch.FloatTensor, - latent_embeds: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + sample: torch.Tensor, + latent_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: r"""The forward method of the `Decoder` class.""" sample = self.conv_in(sample) @@ -367,7 +367,7 @@ def __init__( self.out_channels = out_channels self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1) - def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: r"""The forward method of the `UpSample` class.""" x = torch.relu(x) x = self.deconv(x) @@ -416,7 +416,7 @@ def __init__( self.layers = nn.Sequential(*layers) - def forward(self, x: torch.FloatTensor, mask=None) -> torch.FloatTensor: + def forward(self, x: torch.Tensor, mask=None) -> torch.Tensor: r"""The forward method of the `MaskConditionEncoder` class.""" out = {} for l in range(len(self.layers)): @@ -533,11 +533,11 @@ def __init__( def forward( self, - z: torch.FloatTensor, - image: Optional[torch.FloatTensor] = None, - mask: Optional[torch.FloatTensor] = None, - latent_embeds: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + z: torch.Tensor, + image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, + latent_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: r"""The forward method of the `MaskConditionDecoder` class.""" sample = z sample = self.conv_in(sample) @@ -711,7 +711,7 @@ def unmap_to_all(self, inds: torch.LongTensor) -> torch.LongTensor: back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds) return back.reshape(ishape) - def forward(self, z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor, Tuple]: + def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, Tuple]: # reshape z -> (batch, height, width, channel) and flatten z = z.permute(0, 2, 3, 1).contiguous() z_flattened = z.view(-1, self.vq_embed_dim) @@ -730,7 +730,7 @@ def forward(self, z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatT loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2) # preserve gradients - z_q: torch.FloatTensor = z + (z_q - z).detach() + z_q: torch.Tensor = z + (z_q - z).detach() # reshape back to match original input shape z_q = z_q.permute(0, 3, 1, 2).contiguous() @@ -745,7 +745,7 @@ def forward(self, z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatT return z_q, loss, (perplexity, min_encodings, min_encoding_indices) - def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.FloatTensor: + def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.Tensor: # shape specifying (batch, height, width, channel) if self.remap is not None: indices = indices.reshape(shape[0], -1) # add batch axis @@ -753,7 +753,7 @@ def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) indices = indices.reshape(-1) # flatten again # get quantized latent vectors - z_q: torch.FloatTensor = self.embedding(indices) + z_q: torch.Tensor = self.embedding(indices) if shape is not None: z_q = z_q.view(shape) @@ -776,7 +776,7 @@ def __init__(self, parameters: torch.Tensor, deterministic: bool = False): self.mean, device=self.parameters.device, dtype=self.parameters.dtype ) - def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor: + def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor: # make sure sample is on the same device as the parameters and has same dtype sample = randn_tensor( self.mean.shape, @@ -873,7 +873,7 @@ def __init__( self.layers = nn.Sequential(*layers) self.gradient_checkpointing = False - def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: r"""The forward method of the `EncoderTiny` class.""" if self.training and self.gradient_checkpointing: @@ -956,7 +956,7 @@ def __init__( self.layers = nn.Sequential(*layers) self.gradient_checkpointing = False - def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: r"""The forward method of the `DecoderTiny` class.""" # Clamp. x = torch.tanh(x / 3) * 3 diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 9dc5292887a8..2618a8b15f82 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -665,10 +665,10 @@ def _set_gradient_checkpointing(self, module, value: bool = False) -> None: def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, - controlnet_cond: torch.FloatTensor, + controlnet_cond: torch.Tensor, conditioning_scale: float = 1.0, class_labels: Optional[torch.Tensor] = None, timestep_cond: Optional[torch.Tensor] = None, @@ -677,18 +677,18 @@ def forward( cross_attention_kwargs: Optional[Dict[str, Any]] = None, guess_mode: bool = False, return_dict: bool = True, - ) -> Union[ControlNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]: + ) -> Union[ControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]: """ The [`ControlNetModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor. timestep (`Union[torch.Tensor, float, int]`): The number of timesteps to denoise an input. encoder_hidden_states (`torch.Tensor`): The encoder hidden states. - controlnet_cond (`torch.FloatTensor`): + controlnet_cond (`torch.Tensor`): The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. conditioning_scale (`float`, defaults to `1.0`): The scale factor for ControlNet outputs. diff --git a/src/diffusers/models/controlnet_xs.py b/src/diffusers/models/controlnet_xs.py index a4f9e61f37c7..7c3ade26f1ac 100644 --- a/src/diffusers/models/controlnet_xs.py +++ b/src/diffusers/models/controlnet_xs.py @@ -17,7 +17,7 @@ import torch import torch.utils.checkpoint -from torch import FloatTensor, nn +from torch import Tensor, nn from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput, is_torch_version, logging @@ -54,12 +54,12 @@ class ControlNetXSOutput(BaseOutput): The output of [`UNetControlNetXSModel`]. Args: - sample (`FloatTensor` of shape `(batch_size, num_channels, height, width)`): + sample (`Tensor` of shape `(batch_size, num_channels, height, width)`): The output of the `UNetControlNetXSModel`. Unlike `ControlNetOutput` this is NOT to be added to the base model output, but is already the final output. """ - sample: FloatTensor = None + sample: Tensor = None class DownBlockControlNetXSAdapter(nn.Module): @@ -1001,7 +1001,7 @@ def unfuse_qkv_projections(self): def forward( self, - sample: FloatTensor, + sample: Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, controlnet_cond: Optional[torch.Tensor] = None, @@ -1018,13 +1018,13 @@ def forward( The [`ControlNetXSModel`] forward method. Args: - sample (`FloatTensor`): + sample (`Tensor`): The noisy input tensor. timestep (`Union[torch.Tensor, float, int]`): The number of timesteps to denoise an input. encoder_hidden_states (`torch.Tensor`): The encoder hidden states. - controlnet_cond (`FloatTensor`): + controlnet_cond (`Tensor`): The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. conditioning_scale (`float`, defaults to `1.0`): How much the control model affects the base model outputs. @@ -1402,16 +1402,16 @@ def freeze_base_params(self) -> None: def forward( self, - hidden_states_base: FloatTensor, - temb: FloatTensor, - encoder_hidden_states: Optional[FloatTensor] = None, - hidden_states_ctrl: Optional[FloatTensor] = None, + hidden_states_base: Tensor, + temb: Tensor, + encoder_hidden_states: Optional[Tensor] = None, + hidden_states_ctrl: Optional[Tensor] = None, conditioning_scale: Optional[float] = 1.0, - attention_mask: Optional[FloatTensor] = None, + attention_mask: Optional[Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[FloatTensor] = None, + encoder_attention_mask: Optional[Tensor] = None, apply_control: bool = True, - ) -> Tuple[FloatTensor, FloatTensor, Tuple[FloatTensor, ...], Tuple[FloatTensor, ...]]: + ) -> Tuple[Tensor, Tensor, Tuple[Tensor, ...], Tuple[Tensor, ...]]: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1626,16 +1626,16 @@ def freeze_base_params(self) -> None: def forward( self, - hidden_states_base: FloatTensor, - temb: FloatTensor, - encoder_hidden_states: FloatTensor, - hidden_states_ctrl: Optional[FloatTensor] = None, + hidden_states_base: Tensor, + temb: Tensor, + encoder_hidden_states: Tensor, + hidden_states_ctrl: Optional[Tensor] = None, conditioning_scale: Optional[float] = 1.0, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - attention_mask: Optional[FloatTensor] = None, - encoder_attention_mask: Optional[FloatTensor] = None, + attention_mask: Optional[Tensor] = None, + encoder_attention_mask: Optional[Tensor] = None, apply_control: bool = True, - ) -> Tuple[FloatTensor, FloatTensor]: + ) -> Tuple[Tensor, Tensor]: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1807,18 +1807,18 @@ def freeze_base_params(self) -> None: def forward( self, - hidden_states: FloatTensor, - res_hidden_states_tuple_base: Tuple[FloatTensor, ...], - res_hidden_states_tuple_ctrl: Tuple[FloatTensor, ...], - temb: FloatTensor, - encoder_hidden_states: Optional[FloatTensor] = None, + hidden_states: Tensor, + res_hidden_states_tuple_base: Tuple[Tensor, ...], + res_hidden_states_tuple_ctrl: Tuple[Tensor, ...], + temb: Tensor, + encoder_hidden_states: Optional[Tensor] = None, conditioning_scale: Optional[float] = 1.0, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - attention_mask: Optional[FloatTensor] = None, + attention_mask: Optional[Tensor] = None, upsample_size: Optional[int] = None, - encoder_attention_mask: Optional[FloatTensor] = None, + encoder_attention_mask: Optional[Tensor] = None, apply_control: bool = True, - ) -> FloatTensor: + ) -> Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") diff --git a/src/diffusers/models/downsampling.py b/src/diffusers/models/downsampling.py index 6d556e1e67ac..4e384e731c74 100644 --- a/src/diffusers/models/downsampling.py +++ b/src/diffusers/models/downsampling.py @@ -129,7 +129,7 @@ def __init__( else: self.conv = conv - def forward(self, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -180,24 +180,24 @@ def __init__( def _downsample_2d( self, - hidden_states: torch.FloatTensor, - weight: Optional[torch.FloatTensor] = None, - kernel: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + weight: Optional[torch.Tensor] = None, + kernel: Optional[torch.Tensor] = None, factor: int = 2, gain: float = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """Fused `Conv2d()` followed by `downsample_2d()`. Padding is performed only once at the beginning, not between the operations. The fused op is considerably more efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of arbitrary order. Args: - hidden_states (`torch.FloatTensor`): + hidden_states (`torch.Tensor`): Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - weight (`torch.FloatTensor`, *optional*): + weight (`torch.Tensor`, *optional*): Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`. - kernel (`torch.FloatTensor`, *optional*): + kernel (`torch.Tensor`, *optional*): FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which corresponds to average pooling. factor (`int`, *optional*, default to `2`): @@ -206,7 +206,7 @@ def _downsample_2d( Scaling factor for signal magnitude. Returns: - output (`torch.FloatTensor`): + output (`torch.Tensor`): Tensor of the shape `[N, C, H // factor, W // factor]` or `[N, H // factor, W // factor, C]`, and same datatype as `x`. """ @@ -244,7 +244,7 @@ def _downsample_2d( return output - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if self.use_conv: downsample_input = self._downsample_2d(hidden_states, weight=self.Conv2d_0.weight, kernel=self.fir_kernel) hidden_states = downsample_input + self.Conv2d_0.bias.reshape(1, -1, 1, 1) @@ -286,11 +286,11 @@ def forward(self, inputs: torch.Tensor) -> torch.Tensor: def downsample_2d( - hidden_states: torch.FloatTensor, - kernel: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + kernel: Optional[torch.Tensor] = None, factor: int = 2, gain: float = 1, -) -> torch.FloatTensor: +) -> torch.Tensor: r"""Downsample2D a batch of 2D images with the given filter. Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and downsamples each image with the given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the @@ -298,9 +298,9 @@ def downsample_2d( shape is a multiple of the downsampling factor. Args: - hidden_states (`torch.FloatTensor`) + hidden_states (`torch.Tensor`) Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - kernel (`torch.FloatTensor`, *optional*): + kernel (`torch.Tensor`, *optional*): FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which corresponds to average pooling. factor (`int`, *optional*, default to `2`): @@ -309,7 +309,7 @@ def downsample_2d( Scaling factor for signal magnitude. Returns: - output (`torch.FloatTensor`): + output (`torch.Tensor`): Tensor of the shape `[N, C, H // factor, W // factor]` """ diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index ced520bb8204..d13f8a06cf63 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -424,7 +424,7 @@ def __init__( self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim) self.text_proj = nn.Linear(text_embed_dim, cross_attention_dim) - def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor): + def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor): batch_size = text_embeds.shape[0] # image @@ -450,7 +450,7 @@ def __init__( self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim) self.norm = nn.LayerNorm(cross_attention_dim) - def forward(self, image_embeds: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor): batch_size = image_embeds.shape[0] # image @@ -468,7 +468,7 @@ def __init__(self, image_embed_dim=1024, cross_attention_dim=1024): self.ff = FeedForward(image_embed_dim, cross_attention_dim, mult=1, activation_fn="gelu") self.norm = nn.LayerNorm(cross_attention_dim) - def forward(self, image_embeds: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor): return self.norm(self.ff(image_embeds)) @@ -482,7 +482,7 @@ def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_t self.ff = FeedForward(image_embed_dim, cross_attention_dim * num_tokens, mult=mult, activation_fn="gelu") self.norm = nn.LayerNorm(cross_attention_dim) - def forward(self, image_embeds: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor): x = self.ff(image_embeds) x = x.reshape(-1, self.num_tokens, self.cross_attention_dim) return self.norm(x) @@ -530,7 +530,7 @@ def __init__(self, text_embed_dim: int = 768, image_embed_dim: int = 768, time_e self.text_norm = nn.LayerNorm(time_embed_dim) self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) - def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor): + def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor): # text time_text_embeds = self.text_proj(text_embeds) time_text_embeds = self.text_norm(time_text_embeds) @@ -547,7 +547,7 @@ def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536): self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) self.image_norm = nn.LayerNorm(time_embed_dim) - def forward(self, image_embeds: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor): # image time_image_embeds = self.image_proj(image_embeds) time_image_embeds = self.image_norm(time_image_embeds) @@ -577,7 +577,7 @@ def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536): nn.Conv2d(256, 4, 3, padding=1), ) - def forward(self, image_embeds: torch.FloatTensor, hint: torch.FloatTensor): + def forward(self, image_embeds: torch.Tensor, hint: torch.Tensor): # image time_image_embeds = self.image_proj(image_embeds) time_image_embeds = self.image_norm(time_image_embeds) @@ -1007,7 +1007,7 @@ def __init__(self, IPAdapterImageProjectionLayers: Union[List[nn.Module], Tuple[ super().__init__() self.image_projection_layers = nn.ModuleList(IPAdapterImageProjectionLayers) - def forward(self, image_embeds: List[torch.FloatTensor]): + def forward(self, image_embeds: List[torch.Tensor]): projected_image_embeds = [] # currently, we accept `image_embeds` as diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index adda53a11481..00b55cd9c9d6 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -58,7 +58,7 @@ class ResnetBlockCondNorm2D(nn.Module): non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use. time_embedding_norm (`str`, *optional*, default to `"ada_group"` ): The normalization layer for time embedding `temb`. Currently only support "ada_group" or "spatial". - kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see + kernel (`torch.Tensor`, optional, default to None): FIR filter, see [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`]. output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output. use_in_shortcut (`bool`, *optional*, default to `True`): @@ -146,7 +146,7 @@ def __init__( bias=conv_shortcut_bias, ) - def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -204,7 +204,7 @@ class ResnetBlock2D(nn.Module): time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config. By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a stronger conditioning with scale and shift. - kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see + kernel (`torch.Tensor`, optional, default to None): FIR filter, see [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`]. output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output. use_in_shortcut (`bool`, *optional*, default to `True`): @@ -232,7 +232,7 @@ def __init__( non_linearity: str = "swish", skip_time_act: bool = False, time_embedding_norm: str = "default", # default, scale_shift, - kernel: Optional[torch.FloatTensor] = None, + kernel: Optional[torch.Tensor] = None, output_scale_factor: float = 1.0, use_in_shortcut: Optional[bool] = None, up: bool = False, @@ -317,7 +317,7 @@ def __init__( bias=conv_shortcut_bias, ) - def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -605,7 +605,7 @@ def __init__( padding=0, ) - def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor) -> torch.Tensor: hidden_states = input_tensor hidden_states = self.norm1(hidden_states) @@ -685,8 +685,8 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, ): num_frames = image_only_indicator.shape[-1] diff --git a/src/diffusers/models/transformers/dual_transformer_2d.py b/src/diffusers/models/transformers/dual_transformer_2d.py index e2f1b8538ca0..edc8cbf78382 100644 --- a/src/diffusers/models/transformers/dual_transformer_2d.py +++ b/src/diffusers/models/transformers/dual_transformer_2d.py @@ -106,14 +106,13 @@ def forward( """ Args: hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. - When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input - hidden_states. + When continuous, `torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states. encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to self-attention. timestep ( `torch.long`, *optional*): Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Optional attention mask to be applied in Attention. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under diff --git a/src/diffusers/models/transformers/prior_transformer.py b/src/diffusers/models/transformers/prior_transformer.py index 990eabe2c37a..8dbcfc64e09c 100644 --- a/src/diffusers/models/transformers/prior_transformer.py +++ b/src/diffusers/models/transformers/prior_transformer.py @@ -26,11 +26,11 @@ class PriorTransformerOutput(BaseOutput): The output of [`PriorTransformer`]. Args: - predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`): + predicted_image_embedding (`torch.Tensor` of shape `(batch_size, embedding_dim)`): The predicted CLIP image embedding conditioned on the CLIP text embedding input. """ - predicted_image_embedding: torch.FloatTensor + predicted_image_embedding: torch.Tensor class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin): @@ -246,8 +246,8 @@ def forward( self, hidden_states, timestep: Union[torch.Tensor, float, int], - proj_embedding: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + proj_embedding: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, attention_mask: Optional[torch.BoolTensor] = None, return_dict: bool = True, ): @@ -255,13 +255,13 @@ def forward( The [`PriorTransformer`] forward method. Args: - hidden_states (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`): + hidden_states (`torch.Tensor` of shape `(batch_size, embedding_dim)`): The currently predicted image embeddings. timestep (`torch.LongTensor`): Current denoising step. - proj_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`): + proj_embedding (`torch.Tensor` of shape `(batch_size, embedding_dim)`): Projected embedding vector the denoising process is conditioned on. - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_embeddings, embedding_dim)`): + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, num_embeddings, embedding_dim)`): Hidden states of the text embeddings the denoising process is conditioned on. attention_mask (`torch.BoolTensor` of shape `(batch_size, num_embeddings)`): Text mask for the text embeddings. diff --git a/src/diffusers/models/transformers/t5_film_transformer.py b/src/diffusers/models/transformers/t5_film_transformer.py index bff98db021cf..1dea37a25910 100644 --- a/src/diffusers/models/transformers/t5_film_transformer.py +++ b/src/diffusers/models/transformers/t5_film_transformer.py @@ -86,7 +86,7 @@ def __init__( self.post_dropout = nn.Dropout(p=dropout_rate) self.spec_out = nn.Linear(d_model, input_dims, bias=False) - def encoder_decoder_mask(self, query_input: torch.FloatTensor, key_input: torch.FloatTensor) -> torch.FloatTensor: + def encoder_decoder_mask(self, query_input: torch.Tensor, key_input: torch.Tensor) -> torch.Tensor: mask = torch.mul(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) return mask.unsqueeze(-3) @@ -195,13 +195,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - conditioning_emb: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + conditioning_emb: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, encoder_decoder_position_bias=None, - ) -> Tuple[torch.FloatTensor]: + ) -> Tuple[torch.Tensor]: hidden_states = self.layer[0]( hidden_states, conditioning_emb=conditioning_emb, @@ -249,10 +249,10 @@ def __init__(self, d_model: int, d_kv: int, num_heads: int, dropout_rate: float) def forward( self, - hidden_states: torch.FloatTensor, - conditioning_emb: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + conditioning_emb: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: # pre_self_attention_layer_norm normed_hidden_states = self.layer_norm(hidden_states) @@ -292,10 +292,10 @@ def __init__(self, d_model: int, d_kv: int, num_heads: int, dropout_rate: float, def forward( self, - hidden_states: torch.FloatTensor, - key_value_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.attention( normed_hidden_states, @@ -328,9 +328,7 @@ def __init__(self, d_model: int, d_ff: int, dropout_rate: float, layer_norm_epsi self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) - def forward( - self, hidden_states: torch.FloatTensor, conditioning_emb: Optional[torch.FloatTensor] = None - ) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, conditioning_emb: Optional[torch.Tensor] = None) -> torch.Tensor: forwarded_states = self.layer_norm(hidden_states) if conditioning_emb is not None: forwarded_states = self.film(forwarded_states, conditioning_emb) @@ -361,7 +359,7 @@ def __init__(self, d_model: int, d_ff: int, dropout_rate: float): self.dropout = nn.Dropout(dropout_rate) self.act = NewGELUActivation() - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_gelu = self.act(self.wi_0(hidden_states)) hidden_linear = self.wi_1(hidden_states) hidden_states = hidden_gelu * hidden_linear @@ -390,7 +388,7 @@ def __init__(self, hidden_size: int, eps: float = 1e-6): self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for @@ -431,7 +429,7 @@ def __init__(self, in_features: int, out_features: int): super().__init__() self.scale_bias = nn.Linear(in_features, out_features * 2, bias=False) - def forward(self, x: torch.FloatTensor, conditioning_emb: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor, conditioning_emb: torch.Tensor) -> torch.Tensor: emb = self.scale_bias(conditioning_emb) scale, shift = torch.chunk(emb, 2, -1) x = x * (1 + scale) + shift diff --git a/src/diffusers/models/transformers/transformer_2d.py b/src/diffusers/models/transformers/transformer_2d.py index 6a2695b9e436..ef9e0de0b662 100644 --- a/src/diffusers/models/transformers/transformer_2d.py +++ b/src/diffusers/models/transformers/transformer_2d.py @@ -35,12 +35,12 @@ class Transformer2DModelOutput(BaseOutput): The output of [`Transformer2DModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete): + sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete): The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability distributions for the unnoised latent pixels. """ - sample: torch.FloatTensor + sample: torch.Tensor class Transformer2DModel(ModelMixin, ConfigMixin): @@ -346,9 +346,9 @@ def forward( The [`Transformer2DModel`] forward method. Args: - hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous): + hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous): Input `hidden_states`. - encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*): + encoder_hidden_states ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to self-attention. timestep ( `torch.LongTensor`, *optional*): diff --git a/src/diffusers/models/transformers/transformer_temporal.py b/src/diffusers/models/transformers/transformer_temporal.py index c2d490f3d046..2e1bb041a207 100644 --- a/src/diffusers/models/transformers/transformer_temporal.py +++ b/src/diffusers/models/transformers/transformer_temporal.py @@ -31,11 +31,11 @@ class TransformerTemporalModelOutput(BaseOutput): The output of [`TransformerTemporalModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size x num_frames, num_channels, height, width)`): + sample (`torch.Tensor` of shape `(batch_size x num_frames, num_channels, height, width)`): The hidden states output conditioned on `encoder_hidden_states` input. """ - sample: torch.FloatTensor + sample: torch.Tensor class TransformerTemporalModel(ModelMixin, ConfigMixin): @@ -120,7 +120,7 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, + hidden_states: torch.Tensor, encoder_hidden_states: Optional[torch.LongTensor] = None, timestep: Optional[torch.LongTensor] = None, class_labels: torch.LongTensor = None, @@ -132,7 +132,7 @@ def forward( The [`TransformerTemporal`] forward method. Args: - hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous): + hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous): Input hidden_states. encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to @@ -283,7 +283,7 @@ def forward( ): """ Args: - hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`): + hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states. num_frames (`int`): The number of frames to be processed per batch. This is used to reshape the hidden states. diff --git a/src/diffusers/models/unets/unet_1d.py b/src/diffusers/models/unets/unet_1d.py index 59d70f67c959..d1538cdc61d9 100644 --- a/src/diffusers/models/unets/unet_1d.py +++ b/src/diffusers/models/unets/unet_1d.py @@ -31,11 +31,11 @@ class UNet1DOutput(BaseOutput): The output of [`UNet1DModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, sample_size)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, sample_size)`): The hidden states output from the last layer of the model. """ - sample: torch.FloatTensor + sample: torch.Tensor class UNet1DModel(ModelMixin, ConfigMixin): @@ -194,7 +194,7 @@ def __init__( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], return_dict: bool = True, ) -> Union[UNet1DOutput, Tuple]: @@ -202,9 +202,9 @@ def forward( The [`UNet1DModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch_size, num_channels, sample_size)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple. diff --git a/src/diffusers/models/unets/unet_1d_blocks.py b/src/diffusers/models/unets/unet_1d_blocks.py index e3163cd1d5b3..3c7c1cbecee9 100644 --- a/src/diffusers/models/unets/unet_1d_blocks.py +++ b/src/diffusers/models/unets/unet_1d_blocks.py @@ -66,7 +66,7 @@ def __init__( if add_downsample: self.downsample = Downsample1D(out_channels, use_conv=True, padding=1) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: output_states = () hidden_states = self.resnets[0](hidden_states, temb) @@ -128,10 +128,10 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Optional[Tuple[torch.FloatTensor, ...]] = None, - temb: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + res_hidden_states_tuple: Optional[Tuple[torch.Tensor, ...]] = None, + temb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if res_hidden_states_tuple is not None: res_hidden_states = res_hidden_states_tuple[-1] hidden_states = torch.cat((hidden_states, res_hidden_states), dim=1) @@ -161,7 +161,7 @@ def __init__(self, in_channels: int, out_channels: int, embed_dim: int): self.res2 = ResidualTemporalBlock1D(in_channels // 2, in_channels // 4, embed_dim=embed_dim) self.down2 = Downsample1D(out_channels // 4, use_conv=True) - def forward(self, x: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, x: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: x = self.res1(x, temb) x = self.down1(x) x = self.res2(x, temb) @@ -209,7 +209,7 @@ def __init__( if self.upsample and self.downsample: raise ValueError("Block cannot downsample and upsample") - def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: torch.Tensor) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) for resnet in self.resnets[1:]: hidden_states = resnet(hidden_states, temb) @@ -230,7 +230,7 @@ def __init__(self, num_groups_out: int, out_channels: int, embed_dim: int, act_f self.final_conv1d_act = get_activation(act_fn) self.final_conv1d_2 = nn.Conv1d(embed_dim, out_channels, 1) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.final_conv1d_1(hidden_states) hidden_states = rearrange_dims(hidden_states) hidden_states = self.final_conv1d_gn(hidden_states) @@ -251,7 +251,7 @@ def __init__(self, fc_dim: int, embed_dim: int, act_fn: str = "mish"): ] ) - def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view(hidden_states.shape[0], -1) hidden_states = torch.cat((hidden_states, temb), dim=-1) for layer in self.final_block: @@ -288,7 +288,7 @@ def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"): self.pad = kernel_1d.shape[0] // 2 - 1 self.register_buffer("kernel", kernel_1d) - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode) weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]]) indices = torch.arange(hidden_states.shape[1], device=hidden_states.device) @@ -305,7 +305,7 @@ def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"): self.pad = kernel_1d.shape[0] // 2 - 1 self.register_buffer("kernel", kernel_1d) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode) weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]]) indices = torch.arange(hidden_states.shape[1], device=hidden_states.device) @@ -335,7 +335,7 @@ def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor: new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3) return new_projection - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: residual = hidden_states batch, channel_dim, seq = hidden_states.shape @@ -390,7 +390,7 @@ def __init__(self, in_channels: int, mid_channels: int, out_channels: int, is_la self.group_norm_2 = nn.GroupNorm(1, out_channels) self.gelu_2 = nn.GELU() - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states hidden_states = self.conv_1(hidden_states) @@ -435,7 +435,7 @@ def __init__(self, mid_channels: int, in_channels: int, out_channels: Optional[i self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.down(hidden_states) for attn, resnet in zip(self.attentions, self.resnets): hidden_states = resnet(hidden_states) @@ -466,7 +466,7 @@ def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[i self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.down(hidden_states) for resnet, attn in zip(self.resnets, self.attentions): @@ -490,7 +490,7 @@ def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[i self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.down(hidden_states) for resnet in self.resnets: @@ -512,7 +512,7 @@ def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[i self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = torch.cat([hidden_states, temb], dim=1) for resnet in self.resnets: hidden_states = resnet(hidden_states) @@ -542,10 +542,10 @@ def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[i def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: res_hidden_states = res_hidden_states_tuple[-1] hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) @@ -574,10 +574,10 @@ def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[i def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: res_hidden_states = res_hidden_states_tuple[-1] hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) @@ -604,10 +604,10 @@ def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[i def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: res_hidden_states = res_hidden_states_tuple[-1] hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) diff --git a/src/diffusers/models/unets/unet_2d.py b/src/diffusers/models/unets/unet_2d.py index 5efb63822205..0f36afe3f931 100644 --- a/src/diffusers/models/unets/unet_2d.py +++ b/src/diffusers/models/unets/unet_2d.py @@ -30,11 +30,11 @@ class UNet2DOutput(BaseOutput): The output of [`UNet2DModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): The hidden states output from the last layer of the model. """ - sample: torch.FloatTensor + sample: torch.Tensor class UNet2DModel(ModelMixin, ConfigMixin): @@ -242,7 +242,7 @@ def __init__( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], class_labels: Optional[torch.Tensor] = None, return_dict: bool = True, @@ -251,10 +251,10 @@ def forward( The [`UNet2DModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - class_labels (`torch.FloatTensor`, *optional*, defaults to `None`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple. diff --git a/src/diffusers/models/unets/unet_2d_blocks.py b/src/diffusers/models/unets/unet_2d_blocks.py index ef75fad25e44..93a0a82cdcff 100644 --- a/src/diffusers/models/unets/unet_2d_blocks.py +++ b/src/diffusers/models/unets/unet_2d_blocks.py @@ -561,7 +561,7 @@ class AutoencoderTinyBlock(nn.Module): ` The activation function to use. Supported values are `"swish"`, `"mish"`, `"gelu"`, and `"relu"`. Returns: - `torch.FloatTensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to + `torch.Tensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to `out_channels`. """ @@ -582,7 +582,7 @@ def __init__(self, in_channels: int, out_channels: int, act_fn: str): ) self.fuse = nn.ReLU() - def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: return self.fuse(self.conv(x) + self.skip(x)) @@ -612,8 +612,8 @@ class UNetMidBlock2D(nn.Module): output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor. Returns: - `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size, - in_channels, height, width)`. + `torch.Tensor`: The output of the last residual block, which is a tensor of shape `(batch_size, in_channels, + height, width)`. """ @@ -731,7 +731,7 @@ def __init__( self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): if attn is not None: @@ -846,13 +846,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -986,13 +986,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1118,11 +1118,11 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1240,14 +1240,14 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - additional_residuals: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + encoder_attention_mask: Optional[torch.Tensor] = None, + additional_residuals: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1362,8 +1362,8 @@ def __init__( self.gradient_checkpointing = False def forward( - self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, *args, **kwargs - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None, *args, **kwargs + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1465,7 +1465,7 @@ def __init__( else: self.downsamplers = None - def forward(self, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1567,7 +1567,7 @@ def __init__( else: self.downsamplers = None - def forward(self, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1666,12 +1666,12 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - skip_sample: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + skip_sample: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...], torch.Tensor]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1757,12 +1757,12 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - skip_sample: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + skip_sample: Optional[torch.Tensor] = None, *args, **kwargs, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...], torch.Tensor]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1850,8 +1850,8 @@ def __init__( self.gradient_checkpointing = False def forward( - self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, *args, **kwargs - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None, *args, **kwargs + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1986,13 +1986,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -2097,8 +2097,8 @@ def __init__( self.gradient_checkpointing = False def forward( - self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, *args, **kwargs - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None, *args, **kwargs + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -2201,13 +2201,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -2358,13 +2358,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -2481,15 +2481,15 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -2616,13 +2616,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -2741,7 +2741,7 @@ def __init__( self.resolution_idx = resolution_idx - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: for resnet in self.resnets: hidden_states = resnet(hidden_states, temb=temb) @@ -2839,7 +2839,7 @@ def __init__( self.resolution_idx = resolution_idx - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: for resnet, attn in zip(self.resnets, self.attentions): hidden_states = resnet(hidden_states, temb=temb) hidden_states = attn(hidden_states, temb=temb) @@ -2947,13 +2947,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, skip_sample=None, *args, **kwargs, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -3059,13 +3059,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, skip_sample=None, *args, **kwargs, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> Tuple[torch.Tensor, torch.Tensor]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -3166,13 +3166,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -3310,15 +3310,15 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -3428,13 +3428,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -3558,15 +3558,15 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: res_hidden_states_tuple = res_hidden_states_tuple[-1] if res_hidden_states_tuple is not None: hidden_states = torch.cat([hidden_states, res_hidden_states_tuple], dim=1) @@ -3684,23 +3684,23 @@ def __init__( cross_attention_norm=cross_attention_norm, ) - def _to_3d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor: + def _to_3d(self, hidden_states: torch.Tensor, height: int, weight: int) -> torch.Tensor: return hidden_states.permute(0, 2, 3, 1).reshape(hidden_states.shape[0], height * weight, -1) - def _to_4d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor: + def _to_4d(self, hidden_states: torch.Tensor, height: int, weight: int) -> torch.Tensor: return hidden_states.permute(0, 2, 1).reshape(hidden_states.shape[0], -1, height, weight) def forward( self, - hidden_states: torch.FloatTensor, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, # TODO: mark emb as non-optional (self.norm2 requires it). # requires assessing impact of change to positional param interface. - emb: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + emb: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py index 16928f67fcdb..d07100b10edd 100644 --- a/src/diffusers/models/unets/unet_2d_condition.py +++ b/src/diffusers/models/unets/unet_2d_condition.py @@ -60,11 +60,11 @@ class UNet2DConditionOutput(BaseOutput): The output of [`UNet2DConditionModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. """ - sample: torch.FloatTensor = None + sample: torch.Tensor = None class UNet2DConditionModel( @@ -1042,7 +1042,7 @@ def process_encoder_hidden_states( def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -1060,10 +1060,10 @@ def forward( The [`UNet2DConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. class_labels (`torch.Tensor`, *optional*, defaults to `None`): Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. diff --git a/src/diffusers/models/unets/unet_3d_blocks.py b/src/diffusers/models/unets/unet_3d_blocks.py index 75827258f64c..35a732bdb9ec 100644 --- a/src/diffusers/models/unets/unet_3d_blocks.py +++ b/src/diffusers/models/unets/unet_3d_blocks.py @@ -411,13 +411,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) hidden_states = self.temp_convs[0](hidden_states, num_frames=num_frames) for attn, temp_attn, resnet, temp_conv in zip( @@ -544,13 +544,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, cross_attention_kwargs: Dict[str, Any] = None, - ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: # TODO(Patrick, William) - attention mask is not used output_states = () @@ -651,10 +651,10 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, num_frames: int = 1, - ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () for resnet, temp_conv in zip(self.resnets, self.temp_convs): @@ -769,15 +769,15 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, cross_attention_kwargs: Dict[str, Any] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: is_freeu_enabled = ( getattr(self, "s1", None) and getattr(self, "s2", None) @@ -891,12 +891,12 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, num_frames: int = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: is_freeu_enabled = ( getattr(self, "s1", None) and getattr(self, "s2", None) @@ -1008,12 +1008,12 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, num_frames: int = 1, *args, **kwargs, - ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1174,14 +1174,14 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - additional_residuals: Optional[torch.FloatTensor] = None, + additional_residuals: Optional[torch.Tensor] = None, ): if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: @@ -1357,16 +1357,16 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1518,14 +1518,14 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size=None, num_frames: int = 1, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1699,14 +1699,14 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, num_frames: int = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -1811,8 +1811,8 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - image_only_indicator: torch.FloatTensor, + hidden_states: torch.Tensor, + image_only_indicator: torch.Tensor, ): hidden_states = self.resnets[0]( hidden_states, @@ -1862,9 +1862,9 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - image_only_indicator: torch.FloatTensor, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + image_only_indicator: torch.Tensor, + ) -> torch.Tensor: for resnet in self.resnets: hidden_states = resnet( hidden_states, @@ -1935,11 +1935,11 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: hidden_states = self.resnets[0]( hidden_states, temb, @@ -2031,10 +2031,10 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () for resnet in self.resnets: if self.training and self.gradient_checkpointing: @@ -2141,11 +2141,11 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () blocks = list(zip(self.resnets, self.attentions)) @@ -2240,11 +2240,11 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] @@ -2349,12 +2349,12 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, image_only_indicator: Optional[torch.Tensor] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: for resnet, attn in zip(self.resnets, self.attentions): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] diff --git a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py index 6c353c425911..b4879fe9639c 100644 --- a/src/diffusers/models/unets/unet_3d_condition.py +++ b/src/diffusers/models/unets/unet_3d_condition.py @@ -55,11 +55,11 @@ class UNet3DConditionOutput(BaseOutput): The output of [`UNet3DConditionModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`): The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. """ - sample: torch.FloatTensor + sample: torch.Tensor class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): @@ -560,7 +560,7 @@ def unload_lora(self): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -570,15 +570,15 @@ def forward( down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, mid_block_additional_residual: Optional[torch.Tensor] = None, return_dict: bool = True, - ) -> Union[UNet3DConditionOutput, Tuple[torch.FloatTensor]]: + ) -> Union[UNet3DConditionOutput, Tuple[torch.Tensor]]: r""" The [`UNet3DConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, num_channels, num_frames, height, width`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. class_labels (`torch.Tensor`, *optional*, defaults to `None`): Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py index 0a5f71ed0029..dbfb4f80259d 100644 --- a/src/diffusers/models/unets/unet_i2vgen_xl.py +++ b/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -81,8 +81,8 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - ) -> torch.FloatTensor: + hidden_states: torch.Tensor, + ) -> torch.Tensor: norm_hidden_states = self.norm1(hidden_states) attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None) hidden_states = attn_output + hidden_states @@ -514,7 +514,7 @@ def unfuse_qkv_projections(self): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], fps: torch.Tensor, image_latents: torch.Tensor, @@ -523,19 +523,19 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, return_dict: bool = True, - ) -> Union[UNet3DConditionOutput, Tuple[torch.FloatTensor]]: + ) -> Union[UNet3DConditionOutput, Tuple[torch.Tensor]]: r""" The [`I2VGenXLUNet`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. fps (`torch.Tensor`): Frames per second for the video being generated. Used as a "micro-condition". - image_latents (`torch.FloatTensor`): Image encodings from the VAE. - image_embeddings (`torch.FloatTensor`): + image_latents (`torch.Tensor`): Image encodings from the VAE. + image_embeddings (`torch.Tensor`): Projection embeddings of the conditioning image computed with a vision encoder. - encoder_hidden_states (`torch.FloatTensor`): + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under diff --git a/src/diffusers/models/unets/unet_kandinsky3.py b/src/diffusers/models/unets/unet_kandinsky3.py index b981c8e17eb9..ff8ce25fd223 100644 --- a/src/diffusers/models/unets/unet_kandinsky3.py +++ b/src/diffusers/models/unets/unet_kandinsky3.py @@ -31,7 +31,7 @@ @dataclass class Kandinsky3UNetOutput(BaseOutput): - sample: torch.FloatTensor = None + sample: torch.Tensor = None class Kandinsky3EncoderProj(nn.Module): diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py index 81cc4b1f7ad4..a092daa662f7 100644 --- a/src/diffusers/models/unets/unet_motion_model.py +++ b/src/diffusers/models/unets/unet_motion_model.py @@ -786,7 +786,7 @@ def unfuse_qkv_projections(self): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, timestep_cond: Optional[torch.Tensor] = None, @@ -801,10 +801,10 @@ def forward( The [`UNetMotionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`): Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed diff --git a/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/src/diffusers/models/unets/unet_spatio_temporal_condition.py index 0f89df8c6bff..5613e3618d02 100644 --- a/src/diffusers/models/unets/unet_spatio_temporal_condition.py +++ b/src/diffusers/models/unets/unet_spatio_temporal_condition.py @@ -22,11 +22,11 @@ class UNetSpatioTemporalConditionOutput(BaseOutput): The output of [`UNetSpatioTemporalConditionModel`]. Args: - sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`): + sample (`torch.Tensor` of shape `(batch_size, num_frames, num_channels, height, width)`): The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. """ - sample: torch.FloatTensor = None + sample: torch.Tensor = None class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): @@ -356,7 +356,7 @@ def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, added_time_ids: torch.Tensor, @@ -366,12 +366,12 @@ def forward( The [`UNetSpatioTemporalConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`. - added_time_ids: (`torch.FloatTensor`): + added_time_ids: (`torch.Tensor`): The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal embeddings and added to the time embeddings. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/models/unets/unet_stable_cascade.py b/src/diffusers/models/unets/unet_stable_cascade.py index 72967411f721..75a3dbc8edee 100644 --- a/src/diffusers/models/unets/unet_stable_cascade.py +++ b/src/diffusers/models/unets/unet_stable_cascade.py @@ -131,7 +131,7 @@ def forward(self, x): @dataclass class StableCascadeUNetOutput(BaseOutput): - sample: torch.FloatTensor = None + sample: torch.Tensor = None class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalModelMixin): diff --git a/src/diffusers/models/upsampling.py b/src/diffusers/models/upsampling.py index af6e15db308b..572844d2de0a 100644 --- a/src/diffusers/models/upsampling.py +++ b/src/diffusers/models/upsampling.py @@ -138,9 +138,7 @@ def __init__( else: self.Conv2d_0 = conv - def forward( - self, hidden_states: torch.FloatTensor, output_size: Optional[int] = None, *args, **kwargs - ) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, output_size: Optional[int] = None, *args, **kwargs) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -217,12 +215,12 @@ def __init__( def _upsample_2d( self, - hidden_states: torch.FloatTensor, - weight: Optional[torch.FloatTensor] = None, - kernel: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + weight: Optional[torch.Tensor] = None, + kernel: Optional[torch.Tensor] = None, factor: int = 2, gain: float = 1, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """Fused `upsample_2d()` followed by `Conv2d()`. Padding is performed only once at the beginning, not between the operations. The fused op is considerably more @@ -230,19 +228,19 @@ def _upsample_2d( arbitrary order. Args: - hidden_states (`torch.FloatTensor`): + hidden_states (`torch.Tensor`): Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - weight (`torch.FloatTensor`, *optional*): + weight (`torch.Tensor`, *optional*): Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`. - kernel (`torch.FloatTensor`, *optional*): + kernel (`torch.Tensor`, *optional*): FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling. factor (`int`, *optional*): Integer upsampling factor (default: 2). gain (`float`, *optional*): Scaling factor for signal magnitude (default: 1.0). Returns: - output (`torch.FloatTensor`): + output (`torch.Tensor`): Tensor of the shape `[N, C, H * factor, W * factor]` or `[N, H * factor, W * factor, C]`, and same datatype as `hidden_states`. """ @@ -310,7 +308,7 @@ def _upsample_2d( return output - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if self.use_conv: height = self._upsample_2d(hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel) height = height + self.Conv2d_0.bias.reshape(1, -1, 1, 1) @@ -401,11 +399,11 @@ def upfirdn2d_native( def upsample_2d( - hidden_states: torch.FloatTensor, - kernel: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + kernel: Optional[torch.Tensor] = None, factor: int = 2, gain: float = 1, -) -> torch.FloatTensor: +) -> torch.Tensor: r"""Upsample2D a batch of 2D images with the given filter. Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the specified @@ -413,9 +411,9 @@ def upsample_2d( a: multiple of the upsampling factor. Args: - hidden_states (`torch.FloatTensor`): + hidden_states (`torch.Tensor`): Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - kernel (`torch.FloatTensor`, *optional*): + kernel (`torch.Tensor`, *optional*): FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling. factor (`int`, *optional*, default to `2`): @@ -424,7 +422,7 @@ def upsample_2d( Scaling factor for signal magnitude (default: 1.0). Returns: - output (`torch.FloatTensor`): + output (`torch.Tensor`): Tensor of the shape `[N, C, H * factor, W * factor]` """ assert isinstance(factor, int) and factor >= 1 diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index e5184446cee7..2e38d5b6711a 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -30,11 +30,11 @@ class VQEncoderOutput(BaseOutput): Output of VQModel encoding method. Args: - latents (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + latents (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): The encoded output sample from the last layer of the model. """ - latents: torch.FloatTensor + latents: torch.Tensor class VQModel(ModelMixin, ConfigMixin): @@ -127,7 +127,7 @@ def __init__( ) @apply_forward_hook - def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: + def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput: h = self.encoder(x) h = self.quant_conv(h) @@ -138,8 +138,8 @@ def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOut @apply_forward_hook def decode( - self, h: torch.FloatTensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None - ) -> Union[DecoderOutput, torch.FloatTensor]: + self, h: torch.Tensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None + ) -> Union[DecoderOutput, torch.Tensor]: # also go through quantization layer if not force_not_quantize: quant, _, _ = self.quantize(h) @@ -156,13 +156,13 @@ def decode( return DecoderOutput(sample=dec) def forward( - self, sample: torch.FloatTensor, return_dict: bool = True - ) -> Union[DecoderOutput, Tuple[torch.FloatTensor, ...]]: + self, sample: torch.Tensor, return_dict: bool = True + ) -> Union[DecoderOutput, Tuple[torch.Tensor, ...]]: r""" The [`VQModel`] forward method. Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`models.vq_model.VQEncoderOutput`] instead of a plain tuple. diff --git a/src/diffusers/pipelines/amused/pipeline_amused.py b/src/diffusers/pipelines/amused/pipeline_amused.py index 994455ff29db..a8c24b0aeecc 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused.py +++ b/src/diffusers/pipelines/amused/pipeline_amused.py @@ -88,7 +88,7 @@ def __call__( negative_encoder_hidden_states: Optional[torch.Tensor] = None, output_type="pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, micro_conditioning_aesthetic_score: int = 6, @@ -122,16 +122,16 @@ def __call__( latents (`torch.IntTensor`, *optional*): Pre-generated tokens representing latent vectors in `self.vqvae`, to be used as inputs for image gneration. If not provided, the starting latents will be completely masked. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. A single vector from the pooled and projected final hidden states. - encoder_hidden_states (`torch.FloatTensor`, *optional*): + encoder_hidden_states (`torch.Tensor`, *optional*): Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - negative_encoder_hidden_states (`torch.FloatTensor`, *optional*): + negative_encoder_hidden_states (`torch.Tensor`, *optional*): Analogous to `encoder_hidden_states` for the positive prompt. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. @@ -140,7 +140,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/amused/pipeline_amused_img2img.py b/src/diffusers/pipelines/amused/pipeline_amused_img2img.py index 1218e7a44c4d..c74275b414d4 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused_img2img.py +++ b/src/diffusers/pipelines/amused/pipeline_amused_img2img.py @@ -102,7 +102,7 @@ def __call__( negative_encoder_hidden_states: Optional[torch.Tensor] = None, output_type="pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, micro_conditioning_aesthetic_score: int = 6, @@ -115,7 +115,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a @@ -141,16 +141,16 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. A single vector from the pooled and projected final hidden states. - encoder_hidden_states (`torch.FloatTensor`, *optional*): + encoder_hidden_states (`torch.Tensor`, *optional*): Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - negative_encoder_hidden_states (`torch.FloatTensor`, *optional*): + negative_encoder_hidden_states (`torch.Tensor`, *optional*): Analogous to `encoder_hidden_states` for the positive prompt. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. @@ -159,7 +159,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py b/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py index ab0a55cdd388..24801e0ef977 100644 --- a/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py +++ b/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py @@ -119,7 +119,7 @@ def __call__( negative_encoder_hidden_states: Optional[torch.Tensor] = None, output_type="pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, micro_conditioning_aesthetic_score: int = 6, @@ -132,13 +132,13 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. - mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one @@ -165,16 +165,16 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. A single vector from the pooled and projected final hidden states. - encoder_hidden_states (`torch.FloatTensor`, *optional*): + encoder_hidden_states (`torch.Tensor`, *optional*): Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - negative_encoder_hidden_states (`torch.FloatTensor`, *optional*): + negative_encoder_hidden_states (`torch.Tensor`, *optional*): Analogous to `encoder_hidden_states` for the positive prompt. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. @@ -183,7 +183,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index 3765db938cd5..dd31ba4ff350 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -169,8 +169,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -190,10 +190,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -584,11 +584,11 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -625,27 +625,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or - `np.array`. + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py index 56146b6d05ca..a73d3cc8ba2d 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py @@ -334,10 +334,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -363,17 +363,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -806,7 +806,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -819,7 +819,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -888,13 +888,13 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -971,27 +971,27 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 67fa8b12a34b..a2137fea255b 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -274,8 +274,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -295,10 +295,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -772,11 +772,11 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -821,27 +821,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or - `np.array`. + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple. cross_attention_kwargs (`dict`, *optional*): diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 78b730ea916c..105ca40f773f 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -103,8 +103,8 @@ def _encode_prompt( num_waveforms_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -122,10 +122,10 @@ def _encode_prompt( The prompt or prompts not to guide the audio generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -360,11 +360,11 @@ def __call__( num_waveforms_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, output_type: Optional[str] = "np", @@ -394,21 +394,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py index 948caf97d27b..71310834351e 100644 --- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py @@ -64,7 +64,7 @@ class AudioLDM2ProjectionModelOutput(BaseOutput): """ Args: Class for AudioLDM2 projection layer's outputs. - hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states obtained by linearly projecting the hidden-states for each of the text encoders and subsequently concatenating them together. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -75,7 +75,7 @@ class AudioLDM2ProjectionModelOutput(BaseOutput): - 0 for tokens that are **masked**. """ - hidden_states: torch.FloatTensor + hidden_states: torch.Tensor attention_mask: Optional[torch.LongTensor] = None @@ -125,8 +125,8 @@ def __init__( def forward( self, - hidden_states: Optional[torch.FloatTensor] = None, - hidden_states_1: Optional[torch.FloatTensor] = None, + hidden_states: Optional[torch.Tensor] = None, + hidden_states_1: Optional[torch.Tensor] = None, attention_mask: Optional[torch.LongTensor] = None, attention_mask_1: Optional[torch.LongTensor] = None, ): @@ -680,7 +680,7 @@ def _set_gradient_checkpointing(self, module, value=False): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -696,10 +696,10 @@ def forward( The [`AudioLDM2UNet2DConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. encoder_attention_mask (`torch.Tensor`): A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If @@ -710,7 +710,7 @@ def forward( tuple. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. - encoder_hidden_states_1 (`torch.FloatTensor`, *optional*): + encoder_hidden_states_1 (`torch.Tensor`, *optional*): A second set of encoder hidden states with shape `(batch, sequence_length_2, feature_dim_2)`. Can be used to condition the model on a different set of embeddings to `encoder_hidden_states`. encoder_attention_mask_1 (`torch.Tensor`, *optional*): @@ -1091,14 +1091,14 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states_1: Optional[torch.FloatTensor] = None, - encoder_attention_mask_1: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states_1: Optional[torch.Tensor] = None, + encoder_attention_mask_1: Optional[torch.Tensor] = None, ): output_states = () num_layers = len(self.resnets) @@ -1270,15 +1270,15 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states_1: Optional[torch.FloatTensor] = None, - encoder_attention_mask_1: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states_1: Optional[torch.Tensor] = None, + encoder_attention_mask_1: Optional[torch.Tensor] = None, + ) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) num_attention_per_layer = len(self.attentions) // (len(self.resnets) - 1) @@ -1437,16 +1437,16 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states_1: Optional[torch.FloatTensor] = None, - encoder_attention_mask_1: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states_1: Optional[torch.Tensor] = None, + encoder_attention_mask_1: Optional[torch.Tensor] = None, ): num_layers = len(self.resnets) num_attention_per_layer = len(self.attentions) // num_layers diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py index a498831877c9..49440830d0c0 100644 --- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py @@ -273,7 +273,7 @@ def generate_language_model( Generates a sequence of hidden-states from the language model, conditioned on the embedding inputs. Parameters: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): The sequence used as a prompt for the generation. max_new_tokens (`int`): Number of new tokens to generate. @@ -282,7 +282,7 @@ def generate_language_model( function of the model. Return: - `inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + `inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): The sequence of generated hidden-states. """ max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens @@ -311,10 +311,10 @@ def encode_prompt( do_classifier_free_guidance, transcription=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - generated_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_generated_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + generated_prompt_embeds: Optional[torch.Tensor] = None, + negative_generated_prompt_embeds: Optional[torch.Tensor] = None, attention_mask: Optional[torch.LongTensor] = None, negative_attention_mask: Optional[torch.LongTensor] = None, max_new_tokens: Optional[int] = None, @@ -337,18 +337,18 @@ def encode_prompt( The prompt or prompts not to guide the audio generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-computed text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be computed from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-computed negative text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from `negative_prompt` input argument. - generated_prompt_embeds (`torch.FloatTensor`, *optional*): + generated_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_generated_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_generated_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from `negative_prompt` input argument. @@ -361,11 +361,11 @@ def encode_prompt( max_new_tokens (`int`, *optional*, defaults to None): The number of new tokens to generate with the GPT2 language model. Returns: - prompt_embeds (`torch.FloatTensor`): + prompt_embeds (`torch.Tensor`): Text embeddings from the Flan T5 model. attention_mask (`torch.LongTensor`): Attention mask to be applied to the `prompt_embeds`. - generated_prompt_embeds (`torch.FloatTensor`): + generated_prompt_embeds (`torch.Tensor`): Text embeddings generated from the GPT2 langauge model. Example: @@ -821,16 +821,16 @@ def __call__( num_waveforms_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - generated_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_generated_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + generated_prompt_embeds: Optional[torch.Tensor] = None, + negative_generated_prompt_embeds: Optional[torch.Tensor] = None, attention_mask: Optional[torch.LongTensor] = None, negative_attention_mask: Optional[torch.LongTensor] = None, max_new_tokens: Optional[int] = None, return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, output_type: Optional[str] = "np", @@ -865,21 +865,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for spectrogram generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - generated_prompt_embeds (`torch.FloatTensor`, *optional*): + generated_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_generated_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_generated_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from `negative_prompt` input argument. @@ -897,7 +897,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py b/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py index d71a1481034a..d92a07669059 100644 --- a/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py +++ b/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py @@ -298,7 +298,7 @@ def preprocess( return encoded_outputs # Follows diffusers.VaeImageProcessor.postprocess - def postprocess(self, sample: torch.FloatTensor, output_type: str = "pil"): + def postprocess(self, sample: torch.Tensor, output_type: str = "pil"): if output_type not in ["pt", "np", "pil"]: raise ValueError( f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']" diff --git a/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py b/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py index c8869ad9db23..1be4761a9987 100644 --- a/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py +++ b/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py @@ -117,7 +117,7 @@ def __init__(self, config: Blip2VisionConfig): self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: batch_size = pixel_values.shape[0] target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] @@ -376,7 +376,7 @@ def __init__(self, config: Blip2VisionConfig): @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig) def forward( self, - pixel_values: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -524,15 +524,15 @@ def forward( return_dict=None, ): r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): + encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. - past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + past_key_values (`tuple(tuple(torch.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key diff --git a/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py b/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py index c6772fc88807..d29dddf64b01 100644 --- a/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py +++ b/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py @@ -186,7 +186,7 @@ def forward( ctx_begin_pos: list, input_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: if ctx_embeddings is None: ctx_len = 0 diff --git a/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py b/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py index ba43b2e53d2d..ff23247b5f81 100644 --- a/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py +++ b/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py @@ -191,7 +191,7 @@ def __call__( reference_image: PIL.Image.Image, source_subject_category: List[str], target_subject_category: List[str], - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 7.5, height: int = 512, width: int = 512, @@ -215,7 +215,7 @@ def __call__( The source subject category. target_subject_category (`List[str]`): The target subject category. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by random sampling. diff --git a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py index befac79c63f3..b0c11362ffe1 100644 --- a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py +++ b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py @@ -105,7 +105,7 @@ def prepare_latents(self, batch_size, num_channels, height, width, dtype, device return latents # Follows diffusers.VaeImageProcessor.postprocess - def postprocess_image(self, sample: torch.FloatTensor, output_type: str = "pil"): + def postprocess_image(self, sample: torch.Tensor, output_type: str = "pil"): if output_type not in ["pt", "np", "pil"]: raise ValueError( f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']" @@ -173,10 +173,10 @@ def __call__( num_inference_steps: int = 1, timesteps: List[int] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -195,7 +195,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -205,7 +205,7 @@ def __call__( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/controlnet/multicontrolnet.py b/src/diffusers/pipelines/controlnet/multicontrolnet.py index 7d284f2d26d3..98e9eec94a91 100644 --- a/src/diffusers/pipelines/controlnet/multicontrolnet.py +++ b/src/diffusers/pipelines/controlnet/multicontrolnet.py @@ -31,7 +31,7 @@ def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetMod def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, controlnet_cond: List[torch.tensor], diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 0bf5f1fc8e21..e64dcdc55457 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -261,8 +261,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -294,8 +294,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -315,10 +315,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -846,7 +846,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -859,7 +859,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -913,11 +913,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -936,14 +936,14 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single ControlNet, each will be paired with each prompt in the `prompt` list. This also applies to multiple ControlNets, where a list of image lists can be passed to batch for each prompt and each ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -975,18 +975,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -998,7 +998,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py index b983a3f8d4fc..86e0ddef663e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py @@ -240,7 +240,7 @@ def __call__( condtioning_image: PIL.Image.Image, source_subject_category: List[str], target_subject_category: List[str], - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 7.5, height: int = 512, width: int = 512, @@ -266,7 +266,7 @@ def __call__( The source subject category. target_subject_category (`List[str]`): The target subject category. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by random sampling. diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 022f30d819d8..2e44efa78b73 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -239,8 +239,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -272,8 +272,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -293,10 +293,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -904,11 +904,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -927,18 +927,18 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The initial image to be used as the starting point for the image generation process. Can also accept image latents as `image`, and if passing latents directly they are not encoded again. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -966,18 +966,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 47dbb26eb3e4..cdc34819d59e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -364,8 +364,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -397,8 +397,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -418,10 +418,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1121,11 +1121,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1144,14 +1144,14 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, NumPy array or tensor representing an image batch to be used as the starting point. For both NumPy array and PyTorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a NumPy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image latents as `image`, but if passing latents directly it is not encoded again. - mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, + mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, NumPy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a @@ -1159,14 +1159,14 @@ def __call__( color channel (L) instead of 3, so the expected shape for PyTorch tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for NumPy array, it would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, - `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, + `List[List[torch.Tensor]]`, or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -1201,18 +1201,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index b9c4e3c0032c..3cfdefa9d44d 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -261,10 +261,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -290,17 +290,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1157,13 +1157,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1250,23 +1250,23 @@ def __call__( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1278,7 +1278,7 @@ def __call__( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py index 2307b856ad63..73bb8be89eb1 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -233,10 +233,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -262,17 +262,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -876,7 +876,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -889,7 +889,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -948,13 +948,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -982,14 +982,14 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. Anything below 512 pixels won't work well for [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) @@ -1025,25 +1025,25 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled text embeddings are generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index dfd3cc239b36..dbd406d928d5 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -287,10 +287,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -316,17 +316,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1082,13 +1082,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1119,18 +1119,18 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The initial image will be used as the starting point for the image generation process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. - control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If - the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can - also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If - height and/or width are passed, `image` is resized according to them. If multiple ControlNets are - specified in init, images must be passed as a list such that each element of the list can be correctly - batched for input to a single controlnet. + the type is specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also + be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized according to them. If multiple ControlNets are specified in + init, images must be passed as a list such that each element of the list can be correctly batched for + input to a single controlnet. height (`int`, *optional*, defaults to the size of control_image): The height in pixels of the generated image. Anything below 512 pixels won't work well for [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) @@ -1169,26 +1169,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index 622bac8c5f97..50cd24e4fa18 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -188,8 +188,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -221,8 +221,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -242,10 +242,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -638,9 +638,9 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -657,14 +657,14 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -686,14 +686,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 3ab535a054bd..e572412f6e91 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -213,10 +213,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -242,17 +242,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -721,11 +721,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -751,14 +751,14 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: - `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.Tensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is - specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be - accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height - and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in - `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted + as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or + width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, + images must be passed as a list such that each element of the list can be correctly batched for input + to a single ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. Anything below 512 pixels won't work well for [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) @@ -787,20 +787,20 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled text embeddings are generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input argument. diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py index 5bd396b20fad..1d438bcf877d 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -164,8 +164,8 @@ def encode_prompt( num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -184,10 +184,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -549,11 +549,11 @@ def __call__( width: Optional[int] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -593,10 +593,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -607,7 +607,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py index 50e2cda25a48..c5d9eed3ca9c 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -188,8 +188,8 @@ def encode_prompt( num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -208,10 +208,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -421,7 +421,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -665,11 +665,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -681,7 +681,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.7): @@ -714,10 +714,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -728,7 +728,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py index 89eb97a087f8..cb7e9ef6f347 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -340,8 +340,8 @@ def encode_prompt( num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -360,10 +360,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -576,7 +576,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -607,7 +607,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`original_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`original_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -735,7 +735,7 @@ def prepare_intermediate_images( @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor], + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], original_image: Union[ PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] ] = None, @@ -748,11 +748,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 250, @@ -762,10 +762,10 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - original_image (`torch.FloatTensor` or `PIL.Image.Image`): + original_image (`torch.Tensor` or `PIL.Image.Image`): The original image that `image` was varied from. strength (`float`, *optional*, defaults to 0.8): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -800,10 +800,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -814,7 +814,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py index aabe1107fc9e..cb592aa5675e 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -192,8 +192,8 @@ def encode_prompt( num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -212,10 +212,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -428,7 +428,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -459,7 +459,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`mask_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -760,11 +760,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -776,7 +776,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. mask_image (`PIL.Image.Image`): @@ -814,10 +814,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -828,7 +828,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py index 1798e0dec7eb..aa70eb7b40fe 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -342,8 +342,8 @@ def encode_prompt( num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -362,10 +362,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -579,7 +579,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -610,7 +610,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`original_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`original_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -643,7 +643,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`mask_image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -823,7 +823,7 @@ def prepare_intermediate_images( @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor], + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], original_image: Union[ PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] ] = None, @@ -839,11 +839,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 0, @@ -853,10 +853,10 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - original_image (`torch.FloatTensor` or `PIL.Image.Image`): + original_image (`torch.Tensor` or `PIL.Image.Image`): The original image that `image` was varied from. mask_image (`PIL.Image.Image`): `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be @@ -896,10 +896,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -910,7 +910,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py index 36ed34cba98e..fd38a8724308 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -298,8 +298,8 @@ def encode_prompt( num_images_per_prompt: int = 1, device: Optional[torch.device] = None, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clean_caption: bool = False, ): r""" @@ -318,10 +318,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -537,7 +537,7 @@ def check_inputs( and not isinstance(check_image_type, np.ndarray) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + "`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" f" {type(check_image_type)}" ) @@ -608,7 +608,7 @@ def __call__( prompt: Union[str, List[str]] = None, height: int = None, width: int = None, - image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor] = None, + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor] = None, num_inference_steps: int = 50, timesteps: List[int] = None, guidance_scale: float = 4.0, @@ -616,11 +616,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 250, @@ -637,7 +637,7 @@ def __call__( The height in pixels of the generated image. width (`int`, *optional*, defaults to None): The width in pixels of the generated image. - image (`PIL.Image.Image`, `np.ndarray`, `torch.FloatTensor`): + image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`): The image to be upscaled. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -663,10 +663,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -677,7 +677,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py b/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py index f73ef15d7de7..f69f905b56c5 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py @@ -13,27 +13,27 @@ class TransformationModelOutput(ModelOutput): Base class for text model's outputs that also contains a pooling of the last hidden states. Args: - text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + text_embeds (`torch.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): The text embeddings obtained by applying the projection layer to the pooler_output. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + last_hidden_state (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one + for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - projection_state: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None + projection_state: Optional[torch.Tensor] = None + last_hidden_state: torch.Tensor = None + hidden_states: Optional[Tuple[torch.Tensor]] = None + attentions: Optional[Tuple[torch.Tensor]] = None class RobertaSeriesConfig(XLMRobertaConfig): diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py index d1be4bbe6379..11d81b13ea1b 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py @@ -278,8 +278,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -310,8 +310,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -331,10 +331,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -637,7 +637,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -694,9 +694,9 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -738,14 +738,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py index 53d67a1b2d33..145579da0cb7 100644 --- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -318,8 +318,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -350,8 +350,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -371,10 +371,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -721,7 +721,7 @@ def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32 data type of the generated embeddings Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + `torch.Tensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` """ assert len(w.shape) == 1 w = w * 1000.0 @@ -774,8 +774,8 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -791,7 +791,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a @@ -824,10 +824,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. diff --git a/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py b/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py index c03a3d8fc3b2..101d315dfe59 100644 --- a/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py +++ b/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py @@ -112,9 +112,9 @@ def __call__( The call function to the pipeline for generation. Args: - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): The original image to inpaint on. - mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + mask_image (`torch.Tensor` or `PIL.Image.Image`): The mask_image where 0.0 define which part of the original image to inpaint. num_inference_steps (`int`, *optional*, defaults to 1000): The number of denoising steps. More denoising steps usually lead to a higher quality image at the diff --git a/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index 475da0b6d188..b8ac8e1416bf 100644 --- a/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -134,7 +134,7 @@ def __call__( num_inference_steps: int = 100, return_dict: bool = True, output_type: str = "np", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ) -> Union[AudioPipelineOutput, Tuple]: if (callback_steps is None) or ( @@ -161,7 +161,7 @@ def __call__( The output format of the generated audio. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py index 0581effef2fe..edcc0fed6818 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py @@ -255,8 +255,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -288,8 +288,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -309,10 +309,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -638,10 +638,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -652,7 +652,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can also accept image latents as `image`, but if passing latents directly it is not encoded again. strength (`float`, *optional*, defaults to 0.8): @@ -678,10 +678,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -691,7 +691,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py index c7dff9eeefca..cd29665fdb53 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py @@ -48,7 +48,7 @@ def preprocess_image(image, batch_size): def preprocess_mask(mask, batch_size, scale_factor=8): - if not isinstance(mask, torch.FloatTensor): + if not isinstance(mask, torch.Tensor): mask = mask.convert("L") w, h = mask.size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 @@ -225,8 +225,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -258,8 +258,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -279,10 +279,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -557,8 +557,8 @@ def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, device, def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -567,11 +567,11 @@ def __call__( add_predicted_noise: Optional[bool] = False, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -583,10 +583,10 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. This is the image whose masked region will be inpainted. - mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + mask_image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If mask is a tensor, the @@ -620,10 +620,10 @@ def __call__( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -635,7 +635,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. @@ -693,7 +693,7 @@ def __call__( prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) # 4. Preprocess image and mask - if not isinstance(image, torch.FloatTensor): + if not isinstance(image, torch.Tensor): image = preprocess_image(image, batch_size) mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor) diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py index f44a1ca74ee4..901816391dcc 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py @@ -163,8 +163,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -196,8 +196,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -217,10 +217,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -620,12 +620,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -657,14 +657,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -674,7 +674,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py index 9421531d273e..74ecaa995b7d 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py @@ -154,8 +154,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -187,8 +187,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -208,10 +208,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -492,12 +492,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, debug: bool = False, @@ -537,14 +537,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -554,7 +554,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py index 5f744578810b..d1f2c5de9728 100644 --- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py @@ -60,14 +60,14 @@ class Pix2PixInversionPipelineOutput(BaseOutput, TextualInversionLoaderMixin): Output class for Stable Diffusion pipelines. Args: - latents (`torch.FloatTensor`) + latents (`torch.Tensor`) inverted latents tensor images (`List[PIL.Image.Image]` or `np.ndarray`) List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. """ - latents: torch.FloatTensor + latents: torch.Tensor images: Union[List[PIL.Image.Image], np.ndarray] @@ -377,8 +377,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -410,8 +410,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -431,10 +431,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -707,7 +707,7 @@ def construct_direction(self, embs_source: torch.Tensor, embs_target: torch.Tens return (embs_target.mean(0) - embs_source.mean(0)).unsqueeze(0) @torch.no_grad() - def get_embeds(self, prompt: List[str], batch_size: int = 16) -> torch.FloatTensor: + def get_embeds(self, prompt: List[str], batch_size: int = 16) -> torch.Tensor: num_prompts = len(prompt) embeds = [] for i in range(0, num_prompts, batch_size): @@ -827,13 +827,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, cross_attention_guidance_amount: float = 0.1, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -876,14 +876,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -897,7 +897,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. @@ -1112,12 +1112,12 @@ def invert( num_inference_steps: int = 50, guidance_scale: float = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, cross_attention_guidance_amount: float = 0.1, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, lambda_auto_corr: float = 20.0, @@ -1132,7 +1132,7 @@ def invert( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch which will be used for conditioning. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. num_inference_steps (`int`, *optional*, defaults to 50): @@ -1147,11 +1147,11 @@ def invert( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. cross_attention_guidance_amount (`float`, defaults to 0.1): @@ -1164,7 +1164,7 @@ def invert( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py index c84caa1fad88..9e172ec2dca6 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py @@ -1048,7 +1048,7 @@ def unload_lora(self): def forward( self, - sample: torch.FloatTensor, + sample: torch.Tensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, class_labels: Optional[torch.Tensor] = None, @@ -1066,10 +1066,10 @@ def forward( The [`UNetFlatConditionModel`] forward method. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The noisy input tensor with the following shape `(batch, channel, height, width)`. - timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. - encoder_hidden_states (`torch.FloatTensor`): + timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. class_labels (`torch.Tensor`, *optional*, defaults to `None`): Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. @@ -1590,8 +1590,8 @@ def __init__( self.gradient_checkpointing = False def forward( - self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () for resnet in self.resnets: @@ -1719,14 +1719,14 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - additional_residuals: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + encoder_attention_mask: Optional[torch.Tensor] = None, + additional_residuals: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]: output_states = () blocks = list(zip(self.resnets, self.attentions)) @@ -1837,13 +1837,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, upsample_size: Optional[int] = None, *args, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) @@ -1994,15 +1994,15 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + res_hidden_states_tuple: Tuple[torch.Tensor, ...], + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, upsample_size: Optional[int] = None, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -2104,8 +2104,8 @@ class UNetMidBlockFlat(nn.Module): output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor. Returns: - `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size, - in_channels, height, width)`. + `torch.Tensor`: The output of the last residual block, which is a tensor of shape `(batch_size, in_channels, + height, width)`. """ @@ -2223,7 +2223,7 @@ def __init__( self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): if attn is not None: @@ -2339,13 +2339,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: if cross_attention_kwargs is not None: if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") @@ -2480,13 +2480,13 @@ def __init__( def forward( self, - hidden_states: torch.FloatTensor, - temb: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + encoder_attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} if cross_attention_kwargs.get("scale", None) is not None: logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.") diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py index 4455d20df213..c8dc18e2e8ac 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py @@ -81,7 +81,7 @@ def __init__( @torch.no_grad() def image_variation( self, - image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -90,10 +90,10 @@ def image_variation( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -123,7 +123,7 @@ def image_variation( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -134,7 +134,7 @@ def image_variation( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -202,10 +202,10 @@ def text_to_image( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -235,7 +235,7 @@ def text_to_image( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -246,7 +246,7 @@ def text_to_image( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -311,10 +311,10 @@ def dual_guided( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -344,7 +344,7 @@ def dual_guided( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -355,7 +355,7 @@ def dual_guided( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index b1117044cf18..2212651fbb5b 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -395,10 +395,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -429,7 +429,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -439,7 +439,7 @@ def __call__( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index 59aa370ec2f6..62d3e83a4790 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -197,7 +197,7 @@ def check_inputs(self, image, height, width, callback_steps): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -247,10 +247,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -281,7 +281,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -292,7 +292,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index 0c76e5837b99..de4c2ac9b7f4 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -333,10 +333,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, **kwargs, ): @@ -367,7 +367,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -378,7 +378,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py b/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py index 0c55d04e67de..8dee000df05f 100644 --- a/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py +++ b/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py @@ -169,10 +169,10 @@ def __call__( truncation_rate: float = 1.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ) -> Union[ImagePipelineOutput, Tuple]: """ @@ -196,7 +196,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor` of shape (batch), *optional*): + latents (`torch.Tensor` of shape (batch), *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Must be valid embedding indices.If not provided, a latents tensor will be generated of completely masked latent pixels. @@ -206,7 +206,7 @@ def __call__( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -301,7 +301,7 @@ def __call__( return ImagePipelineOutput(images=image) - def truncate(self, log_p_x_0: torch.FloatTensor, truncation_rate: float) -> torch.FloatTensor: + def truncate(self, log_p_x_0: torch.Tensor, truncation_rate: float) -> torch.Tensor: """ Truncates `log_p_x_0` such that for each column vector, the total cumulative probability is `truncation_rate` The lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to diff --git a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py index a6b9499f5542..9e6a17e2cd1e 100644 --- a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py +++ b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py @@ -175,8 +175,8 @@ def encode_prompt( device, num_videos_per_prompt, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, clip_skip: Optional[int] = None, ): r""" @@ -195,10 +195,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -455,7 +455,7 @@ def check_inputs( and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -534,9 +534,9 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, decode_chunk_size: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -548,7 +548,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`): Image or images to guide image generation. If you provide a tensor, it needs to be compatible with [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -580,14 +580,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 34b5a47c2541..b2041e101564 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -233,8 +233,8 @@ def _encode_prompt( def __call__( self, prompt: Union[str, List[str]], - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], negative_prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, @@ -242,9 +242,9 @@ def __call__( guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -254,9 +254,9 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored @@ -279,7 +279,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -288,7 +288,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py index cbe66a63f4c8..fe9909770376 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py @@ -226,9 +226,9 @@ def __call__( prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -268,7 +268,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -277,7 +277,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -436,7 +436,7 @@ def set_progress_bar_config(self, **kwargs): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], negative_prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 100, guidance_scale: float = 4.0, @@ -447,9 +447,9 @@ def __call__( prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -459,7 +459,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -499,7 +499,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -508,7 +508,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -677,8 +677,8 @@ def set_progress_bar_config(self, **kwargs): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - mask_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], negative_prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 100, guidance_scale: float = 4.0, @@ -688,9 +688,9 @@ def __call__( prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -700,7 +700,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -739,7 +739,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -748,7 +748,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 4d091e7d7a7c..ef5241fee5d2 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -266,10 +266,10 @@ def _encode_prompt( # add_noise method to overwrite the one in schedule because it use a different beta schedule for adding noise vs sampling def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32) alphas = 1.0 - betas alphas_cumprod = torch.cumprod(alphas, dim=0) @@ -295,9 +295,9 @@ def add_noise( def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - image_embeds: torch.FloatTensor, - negative_image_embeds: torch.FloatTensor, + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + image_embeds: torch.Tensor, + negative_image_embeds: torch.Tensor, negative_prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, @@ -307,7 +307,7 @@ def __call__( num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -317,12 +317,12 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`): + image (`torch.Tensor`, `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored @@ -356,7 +356,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index d8d9e96e6fcd..778b6e314c0d 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -398,10 +398,10 @@ def _encode_prompt( def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - image_embeds: torch.FloatTensor, - negative_image_embeds: torch.FloatTensor, + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image, np.ndarray], + image_embeds: torch.Tensor, + negative_image_embeds: torch.Tensor, negative_prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, @@ -409,9 +409,9 @@ def __call__( guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -421,10 +421,10 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image` or `np.ndarray`): + image (`torch.Tensor`, `PIL.Image.Image` or `np.ndarray`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. - mask_image (`PIL.Image.Image`,`torch.FloatTensor` or `np.ndarray`): + mask_image (`PIL.Image.Image`,`torch.Tensor` or `np.ndarray`): `Image`, or a tensor representing an image batch, to mask `image`. White pixels in the mask will be repainted, while black pixels will be preserved. You can pass a pytorch tensor as mask only if the image you passed is a pytorch tensor, and it should contain one color channel (L) instead of 3, so the @@ -432,9 +432,9 @@ def __call__( image or numpy array, mask should also be a either PIL image or numpy array. If it is a PIL image, it will be converted to a single channel (luminance) before use. If it is a nummpy array, the expected shape is `(H, W)`. - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored @@ -457,7 +457,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -466,7 +466,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 7d9be4570f6e..b5152d71cb6b 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -115,14 +115,14 @@ class KandinskyPriorPipelineOutput(BaseOutput): Output class for KandinskyPriorPipeline. Args: - image_embeds (`torch.FloatTensor`) + image_embeds (`torch.Tensor`) clip image embeddings for text prompt negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`) clip image embeddings for unconditional tokens """ - image_embeds: Union[torch.FloatTensor, np.ndarray] - negative_image_embeds: Union[torch.FloatTensor, np.ndarray] + image_embeds: Union[torch.Tensor, np.ndarray] + negative_image_embeds: Union[torch.Tensor, np.ndarray] class KandinskyPriorPipeline(DiffusionPipeline): @@ -173,12 +173,12 @@ def __init__( @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) def interpolate( self, - images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]], weights: List[float], num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, negative_prior_prompt: Optional[str] = None, negative_prompt: str = "", guidance_scale: float = 4.0, @@ -188,7 +188,7 @@ def interpolate( Function invoked when using the prior pipeline for interpolation. Args: - images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): list of weights for each condition in `images_and_prompts` @@ -200,7 +200,7 @@ def interpolate( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -403,7 +403,7 @@ def __call__( num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 4.0, output_type: Optional[str] = "pt", return_dict: bool = True, @@ -425,7 +425,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py index 4b977af0d6f7..471db61556f5 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py @@ -123,15 +123,15 @@ def num_timesteps(self): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], height: int = 512, width: int = 512, num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -142,9 +142,9 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. @@ -164,7 +164,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py index 06d94d2cb79f..9db767681b04 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py @@ -213,9 +213,9 @@ def __call__( prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -259,7 +259,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -442,7 +442,7 @@ def set_progress_bar_config(self, **kwargs): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], negative_prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 100, guidance_scale: float = 4.0, @@ -453,9 +453,9 @@ def __call__( prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -469,7 +469,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -509,7 +509,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -518,7 +518,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -681,8 +681,8 @@ def set_progress_bar_config(self, **kwargs): def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - mask_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + mask_image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], negative_prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 100, guidance_scale: float = 4.0, @@ -692,7 +692,7 @@ def __call__( prior_guidance_scale: float = 4.0, prior_num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -707,7 +707,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -746,7 +746,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py index de87dd3c345d..0130c3951b38 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -151,18 +151,18 @@ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): @torch.no_grad() def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - hint: torch.FloatTensor, + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], + hint: torch.Tensor, height: int = 512, width: int = 512, num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -172,11 +172,11 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - hint (`torch.FloatTensor`): + hint (`torch.Tensor`): The controlnet condition. - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored @@ -199,7 +199,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -208,7 +208,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py index c3ac7bcf6099..12be1534c642 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py @@ -206,10 +206,10 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt @torch.no_grad() def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - hint: torch.FloatTensor, + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], + hint: torch.Tensor, height: int = 512, width: int = 512, num_inference_steps: int = 100, @@ -218,7 +218,7 @@ def __call__( num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, output_type: Optional[str] = "pil", - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, return_dict: bool = True, ): @@ -226,9 +226,9 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -238,9 +238,9 @@ def __call__( denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - hint (`torch.FloatTensor`): + hint (`torch.Tensor`): The controlnet condition. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. @@ -265,7 +265,7 @@ def __call__( (`np.array`) or `"pt"` (`torch.Tensor`). callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py index 3fdae934adfe..899273a1a736 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -190,9 +190,9 @@ def num_timesteps(self): @torch.no_grad() def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], height: int = 512, width: int = 512, num_inference_steps: int = 100, @@ -210,9 +210,9 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded again. @@ -222,7 +222,7 @@ def __call__( denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py index 2fb8731f8a7f..b5ba7a0011a1 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -294,17 +294,17 @@ def num_timesteps(self): @torch.no_grad() def __call__( self, - image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeds: Union[torch.Tensor, List[torch.Tensor]], + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image, np.ndarray], + negative_image_embeds: Union[torch.Tensor, List[torch.Tensor]], height: int = 512, width: int = 512, num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -315,7 +315,7 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for text prompt, that will be used to condition the image generation. image (`PIL.Image.Image`): `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will @@ -325,7 +325,7 @@ def __call__( black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. - negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. @@ -345,7 +345,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py index 1455e20ab5d9..f2134b22b40b 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -132,12 +132,12 @@ def __init__( @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) def interpolate( self, - images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]], weights: List[float], num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, negative_prior_prompt: Optional[str] = None, negative_prompt: str = "", guidance_scale: float = 4.0, @@ -147,7 +147,7 @@ def interpolate( Function invoked when using the prior pipeline for interpolation. Args: - images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): list of weights for each condition in `images_and_prompts` @@ -159,7 +159,7 @@ def interpolate( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -376,7 +376,7 @@ def __call__( num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 4.0, output_type: Optional[str] = "pt", # pt only return_dict: bool = True, @@ -400,7 +400,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py index 26442735e80d..ec6509bb3cb5 100644 --- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -156,12 +156,12 @@ def get_timesteps(self, num_inference_steps, strength, device): @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) def interpolate( self, - images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + images_and_prompts: List[Union[str, PIL.Image.Image, torch.Tensor]], weights: List[float], num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, negative_prior_prompt: Optional[str] = None, negative_prompt: str = "", guidance_scale: float = 4.0, @@ -171,7 +171,7 @@ def interpolate( Function invoked when using the prior pipeline for interpolation. Args: - images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`): list of prompts and images to guide the image generation. weights: (`List[float]`): list of weights for each condition in `images_and_prompts` @@ -183,7 +183,7 @@ def interpolate( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -418,7 +418,7 @@ def __call__( Conceptually, indicates how much to transform the reference `emb`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. - emb (`torch.FloatTensor`): + emb (`torch.Tensor`): The image embedding. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py index 85d6418d07cf..d7ff59e001e7 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py @@ -87,11 +87,11 @@ def encode_prompt( num_images_per_prompt=1, device=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, _cut_context=False, - attention_mask: Optional[torch.FloatTensor] = None, - negative_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + negative_attention_mask: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -109,16 +109,16 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. - negative_attention_mask (`torch.FloatTensor`, *optional*): + negative_attention_mask (`torch.Tensor`, *optional*): Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. """ if prompt is not None and negative_prompt is not None: @@ -334,10 +334,10 @@ def __call__( height: Optional[int] = 1024, width: Optional[int] = 1024, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - negative_attention_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + negative_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, latents=None, @@ -380,16 +380,16 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. - negative_attention_mask (`torch.FloatTensor`, *optional*): + negative_attention_mask (`torch.Tensor`, *optional*): Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between @@ -398,7 +398,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py index 16a57b6b8c3f..df46756a17ef 100644 --- a/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py +++ b/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py @@ -112,11 +112,11 @@ def encode_prompt( num_images_per_prompt=1, device=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, _cut_context=False, - attention_mask: Optional[torch.FloatTensor] = None, - negative_attention_mask: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + negative_attention_mask: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -134,16 +134,16 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. - negative_attention_mask (`torch.FloatTensor`, *optional*): + negative_attention_mask (`torch.Tensor`, *optional*): Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. """ if prompt is not None and negative_prompt is not None: @@ -403,17 +403,17 @@ def num_timesteps(self): def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, + image: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]] = None, strength: float = 0.3, num_inference_steps: int = 25, guidance_scale: float = 3.0, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - negative_attention_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + negative_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -427,7 +427,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): @@ -454,16 +454,16 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - attention_mask (`torch.FloatTensor`, *optional*): + attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. - negative_attention_mask (`torch.FloatTensor`, *optional*): + negative_attention_mask (`torch.Tensor`, *optional*): Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index a84263eaeeb6..11e3781aaf7c 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -237,8 +237,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -258,10 +258,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -565,7 +565,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -578,7 +578,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -628,7 +628,7 @@ def check_inputs( prompt: Union[str, List[str]], strength: float, callback_steps: int, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image=None, ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, @@ -709,10 +709,10 @@ def __call__( guidance_scale: float = 8.5, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -754,16 +754,16 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py index 1bbb1bd91bb8..7f3495258402 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -221,8 +221,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -242,10 +242,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -512,7 +512,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -525,7 +525,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -565,7 +565,7 @@ def check_inputs( height: int, width: int, callback_steps: int, - prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image=None, ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, @@ -646,10 +646,10 @@ def __call__( guidance_scale: float = 8.5, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -691,16 +691,16 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py index f39cbc839652..f6f3531a8835 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -74,7 +74,7 @@ def __call__( guidance_scale: Optional[float] = 1.0, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, **kwargs, @@ -98,7 +98,7 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -465,17 +465,17 @@ def __init__(self, config: LDMBertConfig): def forward( self, - hidden_states: torch.FloatTensor, - attention_mask: torch.FloatTensor, - layer_head_mask: torch.FloatTensor, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_head_mask: torch.Tensor, output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """ Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` - attention_mask (`torch.FloatTensor`): attention mask of size + hidden_states (`torch.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.Tensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + layer_head_mask (`torch.Tensor`): mask for attention heads in a given layer of size `(encoder_attention_heads,)`. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under @@ -587,7 +587,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -615,7 +615,7 @@ def forward( - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py index 619be13a8f36..fdcaa85e9fc2 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py @@ -502,8 +502,8 @@ def encode_prompt( enable_edit_guidance, negative_prompt=None, editing_prompt=None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - editing_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + editing_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -523,10 +523,10 @@ def encode_prompt( less than `1`). editing_prompt (`str` or `List[str]`, *optional*): Editing prompt(s) to be encoded. If not defined, one has to pass `editing_prompt_embeds` instead. - editing_prompt_embeds (`torch.FloatTensor`, *optional*): + editing_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -704,13 +704,13 @@ def __call__( return_dict: bool = True, editing_prompt: Optional[Union[str, List[str]]] = None, editing_prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, reverse_editing_direction: Optional[Union[bool, List[bool]]] = False, edit_guidance_scale: Optional[Union[float, List[float]]] = 5, edit_warmup_steps: Optional[Union[int, List[int]]] = 0, edit_cooldown_steps: Optional[Union[int, List[int]]] = None, edit_threshold: Optional[Union[float, List[float]]] = 0.9, - user_mask: Optional[torch.FloatTensor] = None, + user_mask: Optional[torch.Tensor] = None, sem_guidance: Optional[List[torch.Tensor]] = None, use_cross_attn_mask: bool = False, use_intersect_mask: bool = True, @@ -748,7 +748,7 @@ def __call__( editing_prompt_embeds (`torch.Tensor>`, *optional*): Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should be specified via `reverse_editing_direction`. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`): @@ -765,7 +765,7 @@ def __call__( Masking threshold of guidance. Threshold should be proportional to the image region that is modified. 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). - user_mask (`torch.FloatTensor`, *optional*): + user_mask (`torch.Tensor`, *optional*): User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit masks do not meet user preferences. sem_guidance (`List[torch.Tensor]`, *optional*): diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py index 5ea7c2c14551..d2a62c01ee83 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py @@ -409,14 +409,14 @@ def encode_prompt( num_images_per_prompt: int = 1, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, enable_edit_guidance: bool = True, editing_prompt: Optional[str] = None, - editing_prompt_embeds: Optional[torch.FloatTensor] = None, - editing_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + editing_prompt_embeds: Optional[torch.Tensor] = None, + editing_pooled_prompt_embeds: Optional[torch.Tensor] = None, ) -> object: r""" Encodes the prompt into text encoder hidden states. @@ -432,11 +432,11 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -450,11 +450,11 @@ def encode_prompt( editing_prompt (`str` or `List[str]`, *optional*): Editing prompt(s) to be encoded. If not defined and 'enable_edit_guidance' is True, one has to pass `editing_prompt_embeds` instead. - editing_prompt_embeds (`torch.FloatTensor`, *optional*): + editing_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from `editing_prompt` input argument. - editing_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + editing_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated edit pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled editing_pooled_prompt_embeds will be generated from `editing_prompt` input argument. @@ -713,7 +713,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -726,7 +726,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -804,8 +804,8 @@ def __call__( denoising_end: Optional[float] = None, negative_prompt: Optional[Union[str, List[str]]] = None, negative_prompt_2: Optional[Union[str, List[str]]] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -824,7 +824,7 @@ def __call__( sem_guidance: Optional[List[torch.Tensor]] = None, use_cross_attn_mask: bool = False, use_intersect_mask: bool = False, - user_mask: Optional[torch.FloatTensor] = None, + user_mask: Optional[torch.Tensor] = None, attn_store_steps: Optional[List[int]] = [], store_averaged_over_steps: bool = True, clip_skip: Optional[int] = None, @@ -851,11 +851,11 @@ def __call__( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -869,7 +869,7 @@ def __call__( of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py index 2bd828f0df24..728635da6d4d 100644 --- a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py +++ b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py @@ -120,8 +120,8 @@ def _encode_prompt( num_waveforms_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -139,10 +139,10 @@ def _encode_prompt( The prompt or prompts not to guide the audio generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -427,11 +427,11 @@ def __call__( num_waveforms_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, output_type: Optional[str] = "np", @@ -465,21 +465,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index 263d507bbc75..b225fd71edf8 100644 --- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -266,7 +266,7 @@ def check_inputs(self, image, height, width, callback_steps): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -393,9 +393,9 @@ def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free @torch.no_grad() def __call__( self, - example_image: Union[torch.FloatTensor, PIL.Image.Image], - image: Union[torch.FloatTensor, PIL.Image.Image], - mask_image: Union[torch.FloatTensor, PIL.Image.Image], + example_image: Union[torch.Tensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image], height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -404,22 +404,22 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" The call function to the pipeline for generation. Args: - example_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): + example_image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): An example image to guide image generation. - image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): + image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): `Image` or tensor representing an image batch to be inpainted (parts of the image are masked out with `mask_image` and repainted according to `prompt`). - mask_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): + mask_image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): `Image` or tensor representing an image batch to mask `image`. White pixels in the mask are repainted, while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the @@ -445,7 +445,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -456,7 +456,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py index aceb95ae0451..34840bc38306 100644 --- a/src/diffusers/pipelines/pia/pipeline_pia.py +++ b/src/diffusers/pipelines/pia/pipeline_pia.py @@ -228,8 +228,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -249,10 +249,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -701,11 +701,11 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, motion_scale: int = 0, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -746,20 +746,20 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -770,8 +770,7 @@ def __call__( added. Must be between 0 and 8. Set between 0-2 to only increase the amount of motion. Set between 3-5 to create looping motion. Set between 6-8 to perform motion with image style transfer. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or - `np.array`. + The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py index c1bb5b4584ea..9794b89f7611 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py @@ -296,10 +296,10 @@ def encode_prompt( negative_prompt: str = "", num_images_per_prompt: int = 1, device: Optional[torch.device] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_attention_mask: Optional[torch.FloatTensor] = None, - negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, clean_caption: bool = False, max_sequence_length: int = 120, **kwargs, @@ -320,10 +320,10 @@ def encode_prompt( number of images that should be generated per prompt device: (`torch.device`, *optional*): torch device to place the resulting embeddings on - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the "" string. clean_caption (`bool`, defaults to `False`): @@ -694,14 +694,14 @@ def __call__( width: Optional[int] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_attention_mask: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, use_resolution_binning: bool = True, @@ -748,18 +748,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings. + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_attention_mask (`torch.FloatTensor`, *optional*): + negative_prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for negative text embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between @@ -768,7 +768,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py index 0389ac06f363..5e3cc668d6a8 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py @@ -222,10 +222,10 @@ def encode_prompt( negative_prompt: str = "", num_images_per_prompt: int = 1, device: Optional[torch.device] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_attention_mask: Optional[torch.FloatTensor] = None, - negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, clean_caption: bool = False, max_sequence_length: int = 120, **kwargs, @@ -246,10 +246,10 @@ def encode_prompt( number of images that should be generated per prompt device: (`torch.device`, *optional*): torch device to place the resulting embeddings on - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the "" string. clean_caption (`bool`, defaults to `False`): @@ -621,14 +621,14 @@ def __call__( width: Optional[int] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_attention_mask: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_attention_mask: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, clean_caption: bool = True, use_resolution_binning: bool = True, @@ -675,18 +675,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings. + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_attention_mask (`torch.FloatTensor`, *optional*): + negative_prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for negative text embeddings. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between @@ -695,7 +695,7 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py index fe83a860aeac..e068387b6162 100644 --- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py @@ -224,10 +224,10 @@ def __call__( num_images_per_prompt: int = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, editing_prompt: Optional[Union[str, List[str]]] = None, editing_prompt_embeddings: Optional[torch.Tensor] = None, @@ -268,7 +268,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -279,7 +279,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py index 1ef10e17cbd7..f87f28e06c4a 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -69,7 +69,7 @@ class ShapEPipelineOutput(BaseOutput): Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`]. Args: - images (`torch.FloatTensor`) + images (`torch.Tensor`) A list of images for 3D rendering. """ @@ -187,7 +187,7 @@ def __call__( num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 4.0, frame_size: int = 64, output_type: Optional[str] = "pil", # pil, np, latent, mesh @@ -207,7 +207,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 700ca5db6f07..7cc145e4c3e2 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -70,7 +70,7 @@ class ShapEPipelineOutput(BaseOutput): Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`]. Args: - images (`torch.FloatTensor`) + images (`torch.Tensor`) A list of images for 3D rendering. """ @@ -169,7 +169,7 @@ def __call__( num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, guidance_scale: float = 4.0, frame_size: int = 64, output_type: Optional[str] = "pil", # pil, np, latent, mesh @@ -179,7 +179,7 @@ def __call__( The call function to the pipeline for generation. Args: - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can also accept image latents as image, but if passing latents directly it is not encoded again. num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -190,7 +190,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py index 65ac21f22007..af80cf805afe 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py @@ -129,10 +129,10 @@ def encode_prompt( do_classifier_free_guidance, prompt=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, ): if prompt_embeds is None: # get prompt text embeddings @@ -285,18 +285,18 @@ def num_timesteps(self): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image_embeddings: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeddings: Union[torch.Tensor, List[torch.Tensor]], prompt: Union[str, List[str]] = None, num_inference_steps: int = 10, guidance_scale: float = 0.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -306,7 +306,7 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embedding (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embedding (`torch.Tensor` or `List[torch.Tensor]`): Image Embeddings either extracted from an image or generated by a Prior Model. prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. @@ -322,17 +322,17 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `decoder_guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + negative_prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input argument. @@ -341,7 +341,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py index 3b96ef1418de..6724b60cc424 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py @@ -162,13 +162,13 @@ def __call__( num_inference_steps: int = 12, decoder_guidance_scale: float = 0.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -187,17 +187,17 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + negative_prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -230,7 +230,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py index 55fb4c28f6dd..dc6c81e1a8b9 100644 --- a/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py +++ b/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py @@ -54,19 +54,19 @@ class StableCascadePriorPipelineOutput(BaseOutput): Output class for WuerstchenPriorPipeline. Args: - image_embeddings (`torch.FloatTensor` or `np.ndarray`) + image_embeddings (`torch.Tensor` or `np.ndarray`) Prior image embeddings for text prompt - prompt_embeds (`torch.FloatTensor`): + prompt_embeds (`torch.Tensor`): Text embeddings for the prompt. - negative_prompt_embeds (`torch.FloatTensor`): + negative_prompt_embeds (`torch.Tensor`): Text embeddings for the negative prompt. """ - image_embeddings: Union[torch.FloatTensor, np.ndarray] - prompt_embeds: Union[torch.FloatTensor, np.ndarray] - prompt_embeds_pooled: Union[torch.FloatTensor, np.ndarray] - negative_prompt_embeds: Union[torch.FloatTensor, np.ndarray] - negative_prompt_embeds_pooled: Union[torch.FloatTensor, np.ndarray] + image_embeddings: Union[torch.Tensor, np.ndarray] + prompt_embeds: Union[torch.Tensor, np.ndarray] + prompt_embeds_pooled: Union[torch.Tensor, np.ndarray] + negative_prompt_embeds: Union[torch.Tensor, np.ndarray] + negative_prompt_embeds_pooled: Union[torch.Tensor, np.ndarray] class StableCascadePriorPipeline(DiffusionPipeline): @@ -150,10 +150,10 @@ def encode_prompt( do_classifier_free_guidance, prompt=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, ): if prompt_embeds is None: # get prompt text embeddings @@ -374,14 +374,14 @@ def __call__( timesteps: List[float] = None, guidance_scale: float = 4.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, - image_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_pooled: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_pooled: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pt", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -409,21 +409,21 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `decoder_guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + negative_prompt_embeds_pooled (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input argument. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. If not provided, image embeddings will be generated from `image` input argument if existing. num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -431,7 +431,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py index 311347dccaa8..2e34dcb83c01 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py @@ -288,7 +288,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`): + image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.Tensor`): `Image`, or tensor representing an image batch which will be upscaled. * num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -329,7 +329,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py index bee6ea7b1132..2ec60fbd619c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py @@ -395,7 +395,7 @@ def __call__( [`schedulers.DDIMScheduler`], will be ignored for others. generator (`np.random.RandomState`, *optional*): A np.random.RandomState to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 3612039f09cb..e8ab72421d7e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -274,8 +274,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -306,8 +306,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -327,10 +327,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -691,7 +691,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -704,7 +704,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -765,11 +765,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -814,18 +814,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 1f822971568f..d29e519974b4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -156,8 +156,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -189,8 +189,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -210,10 +210,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -609,7 +609,7 @@ def __call__( self, prompt: Union[str, List[str]] = None, image: PipelineImageInput = None, - depth_map: Optional[torch.FloatTensor] = None, + depth_map: Optional[torch.Tensor] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -617,8 +617,8 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -633,10 +633,10 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be used as the starting point. Can accept image latents as `image` only if `depth_map` is not `None`. - depth_map (`torch.FloatTensor`, *optional*): + depth_map (`torch.Tensor`, *optional*): Depth prediction to be used as additional conditioning for the image generation process. If not defined, it automatically predicts the depth with `self.depth_estimator`. strength (`float`, *optional*, defaults to 0.8): @@ -662,10 +662,10 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index c300c7a2f3f4..93a8bd160318 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -207,7 +207,7 @@ def check_inputs(self, image, height, width, callback_steps): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -248,7 +248,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype @torch.no_grad() def __call__( self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], + image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor], height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -256,17 +256,17 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" The call function to the pipeline for generation. Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`): Image or images to guide image generation. If you provide a tensor, it needs to be compatible with [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -287,7 +287,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -298,7 +298,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index daded18ebd85..f2a5de81540d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -315,8 +315,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -348,8 +348,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -369,10 +369,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -784,7 +784,7 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -797,7 +797,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -854,10 +854,10 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -872,7 +872,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a @@ -909,14 +909,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index ab2a0eae7313..71dec964fdca 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -387,8 +387,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -420,8 +420,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -441,10 +441,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -931,7 +931,7 @@ def get_timesteps(self, num_inference_steps, strength, device): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -944,7 +944,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -992,7 +992,7 @@ def __call__( prompt: Union[str, List[str]] = None, image: PipelineImageInput = None, mask_image: PipelineImageInput = None, - masked_image_latents: torch.FloatTensor = None, + masked_image_latents: torch.Tensor = None, height: Optional[int] = None, width: Optional[int] = None, padding_mask_crop: Optional[int] = None, @@ -1005,11 +1005,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1024,14 +1024,14 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. - mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one @@ -1080,18 +1080,18 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 0bf5a92a4fcc..b2b2b14009db 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -168,11 +168,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -185,7 +185,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be repainted according to `prompt`. Can also accept image latents as `image`, but if passing latents directly it is not encoded again. num_inference_steps (`int`, *optional*, defaults to 100): @@ -210,14 +210,14 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): @@ -471,8 +471,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -490,10 +490,10 @@ def _encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 918dffe5199d..978e2dbb60ab 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -267,10 +267,10 @@ def __call__( guidance_scale: float = 9.0, negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -279,7 +279,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide image upscaling. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be upscaled. If it's a tensor, it can be either a latent output from a Stable Diffusion model or an image tensor in the range `[-1, 1]`. It is considered a `latent` if `image.shape[1]` is `4`; otherwise, it is considered to be an image representation and @@ -299,7 +299,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -310,7 +310,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 2d04cf41d9b5..3981c8a461fe 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -176,8 +176,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -209,8 +209,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -230,10 +230,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -542,12 +542,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: int = None, @@ -558,7 +558,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image` or tensor representing an image batch to be upscaled. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -577,14 +577,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -594,7 +594,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index 02ddc65c7111..1839c4511060 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -257,8 +257,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -290,8 +290,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -311,10 +311,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -588,7 +588,7 @@ def noise_image_embeddings( self, image_embeds: torch.Tensor, noise_level: int, - noise: Optional[torch.FloatTensor] = None, + noise: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, ): """ @@ -644,19 +644,19 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 0, # prior args prior_num_inference_steps: int = 25, prior_guidance_scale: float = 4.0, - prior_latents: Optional[torch.FloatTensor] = None, + prior_latents: Optional[torch.Tensor] = None, clip_skip: Optional[int] = None, ): """ @@ -686,14 +686,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -702,7 +702,7 @@ def __call__( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -718,7 +718,7 @@ def __call__( prior_guidance_scale (`float`, *optional*, defaults to 4.0): A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - prior_latents (`torch.FloatTensor`, *optional*): + prior_latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image embedding generation in the prior denoising process. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 134ec39effc5..13915b39f034 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -166,8 +166,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -254,8 +254,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -275,10 +275,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -537,7 +537,7 @@ def check_inputs( and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -569,7 +569,7 @@ def noise_image_embeddings( self, image_embeds: torch.Tensor, noise_level: int, - noise: Optional[torch.FloatTensor] = None, + noise: Optional[torch.Tensor] = None, generator: Optional[torch.Generator] = None, ): """ @@ -615,7 +615,7 @@ def noise_image_embeddings( @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, prompt: Union[str, List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, @@ -625,16 +625,16 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[torch.Generator] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, noise_level: int = 0, - image_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.Tensor] = None, clip_skip: Optional[int] = None, ): r""" @@ -644,7 +644,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, either `prompt_embeds` will be used or prompt is initialized to `""`. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image` or tensor representing an image batch. The image is encoded to its CLIP embedding which the `unet` is conditioned on. The image is _not_ encoded by the `vae` and then used as the latents in the denoising process like it is in the standard Stable Diffusion text-guided image variation process. @@ -669,14 +669,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -685,7 +685,7 @@ def __call__( Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -695,7 +695,7 @@ def __call__( noise_level (`int`, *optional*, defaults to `0`): The amount of noise to add to the image embeddings. A higher `noise_level` increases the variance in the final un-noised images. See [`StableUnCLIPPipeline.noise_image_embeddings`] for more details. - image_embeds (`torch.FloatTensor`, *optional*): + image_embeds (`torch.Tensor`, *optional*): Pre-generated CLIP embeddings to condition the `unet` on. These latents are not used in the denoising process. If you want to provide pre-generated latents, pass them to `__call__` as `latents`. clip_skip (`int`, *optional*): diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py index 3e6dec3e0bff..3a0e86409e4a 100644 --- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py @@ -100,7 +100,7 @@ def forward(self, clip_input, images): return images, has_nsfw_concepts @torch.no_grad() - def forward_onnx(self, clip_input: torch.FloatTensor, images: torch.FloatTensor): + def forward_onnx(self, clip_input: torch.Tensor, images: torch.Tensor): pooled_output = self.vision_model(clip_input)[1] # pooled_output image_embeds = self.visual_projection(pooled_output) diff --git a/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py index 2adcb0a8c0a1..347d573e1fb7 100644 --- a/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py @@ -254,8 +254,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -287,8 +287,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -308,10 +308,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -746,12 +746,12 @@ def __call__( num_images_per_prompt: int = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, max_iter_to_alter: int = 25, @@ -789,14 +789,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -806,7 +806,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py index e89d7f77e8b1..1323542e40b1 100644 --- a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py @@ -53,7 +53,7 @@ class DiffEditInversionPipelineOutput(BaseOutput): Output class for Stable Diffusion pipelines. Args: - latents (`torch.FloatTensor`) + latents (`torch.Tensor`) inverted latents tensor images (`List[PIL.Image.Image]` or `np.ndarray`) List of denoised PIL images of length `num_timesteps * batch_size` or numpy array of shape `(num_timesteps, @@ -61,7 +61,7 @@ class DiffEditInversionPipelineOutput(BaseOutput): diffusion pipeline. """ - latents: torch.FloatTensor + latents: torch.Tensor images: Union[List[PIL.Image.Image], np.ndarray] @@ -381,8 +381,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -414,8 +414,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -435,10 +435,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -831,15 +831,15 @@ def get_epsilon(self, model_output: torch.Tensor, sample: torch.Tensor, timestep @replace_example_docstring(EXAMPLE_DOC_STRING) def generate_mask( self, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, target_prompt: Optional[Union[str, List[str]]] = None, target_negative_prompt: Optional[Union[str, List[str]]] = None, - target_prompt_embeds: Optional[torch.FloatTensor] = None, - target_negative_prompt_embeds: Optional[torch.FloatTensor] = None, + target_prompt_embeds: Optional[torch.Tensor] = None, + target_negative_prompt_embeds: Optional[torch.Tensor] = None, source_prompt: Optional[Union[str, List[str]]] = None, source_negative_prompt: Optional[Union[str, List[str]]] = None, - source_prompt_embeds: Optional[torch.FloatTensor] = None, - source_negative_prompt_embeds: Optional[torch.FloatTensor] = None, + source_prompt_embeds: Optional[torch.Tensor] = None, + source_negative_prompt_embeds: Optional[torch.Tensor] = None, num_maps_per_mask: Optional[int] = 10, mask_encode_strength: Optional[float] = 0.5, mask_thresholding_ratio: Optional[float] = 3.0, @@ -861,10 +861,10 @@ def generate_mask( target_negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide what to not include in image generation. If not defined, you need to pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - target_prompt_embeds (`torch.FloatTensor`, *optional*): + target_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - target_negative_prompt_embeds (`torch.FloatTensor`, *optional*): + target_negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. source_prompt (`str` or `List[str]`, *optional*): @@ -873,11 +873,11 @@ def generate_mask( source_negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide semantic mask generation away from using DiffEdit. If not defined, you need to pass `source_negative_prompt_embeds` or `source_image` instead. - source_prompt_embeds (`torch.FloatTensor`, *optional*): + source_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings to guide the semantic mask generation. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from `source_prompt` input argument. - source_negative_prompt_embeds (`torch.FloatTensor`, *optional*): + source_negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings to negatively guide the semantic mask generation. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from `source_negative_prompt` input argument. @@ -1051,18 +1051,18 @@ def generate_mask( def invert( self, prompt: Optional[Union[str, List[str]]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.Tensor, PIL.Image.Image] = None, num_inference_steps: int = 50, inpaint_strength: float = 0.8, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, decode_latents: bool = False, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, lambda_auto_corr: float = 20.0, @@ -1095,10 +1095,10 @@ def invert( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. decode_latents (`bool`, *optional*, defaults to `False`): @@ -1111,7 +1111,7 @@ def invert( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -1289,8 +1289,8 @@ def invert( def __call__( self, prompt: Optional[Union[str, List[str]]] = None, - mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, - image_latents: Union[torch.FloatTensor, PIL.Image.Image] = None, + mask_image: Union[torch.Tensor, PIL.Image.Image] = None, + image_latents: Union[torch.Tensor, PIL.Image.Image] = None, inpaint_strength: Optional[float] = 0.8, num_inference_steps: int = 50, guidance_scale: float = 7.5, @@ -1298,12 +1298,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: int = None, @@ -1319,7 +1319,7 @@ def __call__( repainted, while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, 1, H, W)`. - image_latents (`PIL.Image.Image` or `torch.FloatTensor`): + image_latents (`PIL.Image.Image` or `torch.Tensor`): Partially noised image latents from the inversion process to be used as inputs for image generation. inpaint_strength (`float`, *optional*, defaults to 0.8): Indicates extent to inpaint the masked area. Must be between 0 and 1. When `inpaint_strength` is 1, the @@ -1343,14 +1343,14 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -1360,7 +1360,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py index 94043b7285c9..4b96e317dcc3 100644 --- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py +++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py @@ -180,8 +180,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -213,8 +213,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -234,10 +234,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -541,12 +541,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -592,14 +592,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -609,7 +609,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py index c20e940b4db6..963f29f21f2e 100644 --- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py +++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py @@ -238,8 +238,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -259,10 +259,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -705,12 +705,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, gligen_normalize_constant: float = 28.7, @@ -764,14 +764,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -781,7 +781,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py index e2096be7e894..067ae5f6f172 100755 --- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -154,8 +154,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -187,8 +187,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -208,10 +208,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -469,12 +469,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, use_karras_sigmas: Optional[bool] = False, noise_sampler_seed: Optional[int] = None, @@ -512,14 +512,14 @@ def __call__( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -531,7 +531,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py index 3cfda4064d13..f7ca0c7c4daa 100644 --- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py @@ -207,10 +207,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -236,17 +236,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -584,11 +584,11 @@ def __call__( negative_prompt_2: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, original_size: Optional[Tuple[int, int]] = None, @@ -642,21 +642,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. diff --git a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py index fcb6db4574f9..a100a38b04ea 100644 --- a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -254,8 +254,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -287,8 +287,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -308,10 +308,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -666,7 +666,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -679,7 +679,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -740,11 +740,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -789,19 +789,19 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py index 31af62b609a1..2b80d2856869 100644 --- a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -226,8 +226,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -259,8 +259,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -280,10 +280,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -677,7 +677,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -690,7 +690,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -790,11 +790,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -839,19 +839,19 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py index 63b8c6108ac4..cd59cf51869d 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -516,11 +516,11 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, sld_guidance_scale: Optional[float] = 1000, sld_warmup_steps: Optional[int] = 10, @@ -555,7 +555,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -568,7 +568,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py b/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py index 549747e97146..338e4c65c500 100644 --- a/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py @@ -85,7 +85,7 @@ def forward(self, clip_input, images): return images, has_nsfw_concepts @torch.no_grad() - def forward_onnx(self, clip_input: torch.FloatTensor, images: torch.FloatTensor): + def forward_onnx(self, clip_input: torch.Tensor, images: torch.Tensor): pooled_output = self.vision_model(clip_input)[1] # pooled_output image_embeds = self.visual_projection(pooled_output) diff --git a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py index cb29ce386f32..04c62efb46be 100644 --- a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py @@ -169,8 +169,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -202,8 +202,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -223,10 +223,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -570,14 +570,14 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -611,19 +611,19 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): @@ -633,7 +633,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index b7bc832ec3d3..52d0b07fb315 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -282,10 +282,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -311,17 +311,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -762,7 +762,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -775,7 +775,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -843,13 +843,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -924,26 +924,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index ada981d4dec9..b8698a008320 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -303,10 +303,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -332,17 +332,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -902,7 +902,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -915,7 +915,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -988,13 +988,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1022,7 +1022,7 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): The image(s) to modify with the pipeline. strength (`float`, *optional*, defaults to 0.3): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` @@ -1078,26 +1078,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 9361cc6a13af..38f5cec931f8 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -531,10 +531,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -560,17 +560,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1132,7 +1132,7 @@ def upcast_vae(self): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -1145,7 +1145,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -1207,7 +1207,7 @@ def __call__( prompt_2: Optional[Union[str, List[str]]] = None, image: PipelineImageInput = None, mask_image: PipelineImageInput = None, - masked_image_latents: torch.FloatTensor = None, + masked_image_latents: torch.Tensor = None, height: Optional[int] = None, width: Optional[int] = None, padding_mask_crop: Optional[int] = None, @@ -1223,13 +1223,13 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, cross_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -1329,22 +1329,22 @@ def __call__( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -1357,7 +1357,7 @@ def __call__( generator (`torch.Generator`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index d9380020b329..3cdef82303f7 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -222,10 +222,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, ): r""" @@ -250,17 +250,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -622,14 +622,14 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -647,7 +647,7 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + image (`torch.Tensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): The image(s) to modify with the pipeline. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -689,21 +689,21 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -715,7 +715,7 @@ def __call__( plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py index f457cdbdb1eb..70d06bb6320d 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py @@ -21,7 +21,7 @@ def __init__(self): self.encoder.set_watermark("bits", self.watermark) - def apply_watermark(self, images: torch.FloatTensor): + def apply_watermark(self, images: torch.Tensor): # can't encode images that are smaller than 256 if images.shape[-1] < 256: return images diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index b5b048d5cbe0..cd1fe028a0ef 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -149,12 +149,12 @@ class StableVideoDiffusionPipelineOutput(BaseOutput): Output class for Stable Video Diffusion pipeline. Args: - frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.FloatTensor`]): + frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.Tensor`]): List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size, num_frames, height, width, num_channels)`. """ - frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor] + frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor] class StableVideoDiffusionPipeline(DiffusionPipeline): @@ -207,7 +207,7 @@ def _encode_image( device: Union[str, torch.device], num_videos_per_prompt: int, do_classifier_free_guidance: bool, - ) -> torch.FloatTensor: + ) -> torch.Tensor: dtype = next(self.image_encoder.parameters()).dtype if not isinstance(image, torch.Tensor): @@ -300,7 +300,7 @@ def _get_add_time_ids( return add_time_ids - def decode_latents(self, latents: torch.FloatTensor, num_frames: int, decode_chunk_size: int = 14): + def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14): # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width] latents = latents.flatten(0, 1) @@ -336,7 +336,7 @@ def check_inputs(self, image, height, width): and not isinstance(image, list) ): raise ValueError( - "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" f" {type(image)}" ) @@ -353,7 +353,7 @@ def prepare_latents( dtype: torch.dtype, device: Union[str, torch.device], generator: torch.Generator, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, ): shape = ( batch_size, @@ -398,7 +398,7 @@ def num_timesteps(self): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], + image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor], height: int = 576, width: int = 1024, num_frames: Optional[int] = None, @@ -412,7 +412,7 @@ def __call__( decode_chunk_size: Optional[int] = None, num_videos_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], @@ -422,7 +422,7 @@ def __call__( The call function to the pipeline for generation. Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`): Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -461,7 +461,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -486,8 +486,8 @@ def __call__( Returns: [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is - returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) - is returned. + returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.Tensor`) is + returned. """ # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py index 5b8dbafdec06..ab4ab87d2aff 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -271,8 +271,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -304,8 +304,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -325,10 +325,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -635,7 +635,7 @@ def _default_height_width(self, height, width, image): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -648,7 +648,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -690,12 +690,12 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, adapter_conditioning_scale: Union[float, List[float]] = 1.0, @@ -708,9 +708,9 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. @@ -745,14 +745,14 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -764,7 +764,7 @@ def __call__( of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py index 6d742b4b7c98..2aa2415a47bb 100644 --- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -296,10 +296,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -325,17 +325,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -806,7 +806,7 @@ def _default_height_width(self, height, width, image): # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -819,7 +819,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -864,16 +864,16 @@ def __call__( num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, - ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -897,9 +897,9 @@ def __call__( prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is used in both text-encoders - image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the - type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit the output image. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. Anything below 512 pixels won't work well for @@ -948,26 +948,26 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. - ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not @@ -980,7 +980,7 @@ def __call__( instead of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index 6c33836e60da..9ea06bb0d081 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -137,8 +137,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -170,8 +170,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -191,10 +191,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -465,12 +465,12 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "np", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -505,25 +505,25 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"np"`): - The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`. + The output format of the generated video. Choose between `torch.Tensor` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index 3901946afe46..e95ed559defc 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -213,8 +213,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -246,8 +246,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -267,10 +267,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -563,19 +563,19 @@ def prepare_latents(self, video, timestep, batch_size, dtype, device, generator= def __call__( self, prompt: Union[str, List[str]] = None, - video: Union[List[np.ndarray], torch.FloatTensor] = None, + video: Union[List[np.ndarray], torch.Tensor] = None, strength: float = 0.6, num_inference_steps: int = 50, guidance_scale: float = 15.0, negative_prompt: Optional[Union[str, List[str]]] = None, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "np", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, @@ -586,7 +586,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - video (`List[np.ndarray]` or `torch.FloatTensor`): + video (`List[np.ndarray]` or `torch.Tensor`): `video` frames or tensor representing a video batch to be used as the starting point for the process. Can also accept video latents as `image`, if passing latents directly, it will not be encoded again. strength (`float`, *optional*, defaults to 0.8): @@ -610,25 +610,25 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. Latents should be of shape `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. output_type (`str`, *optional*, defaults to `"np"`): - The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`. + The output format of the generated video. Choose between `torch.Tensor` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index dddd65079625..5ba671359292 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -392,7 +392,7 @@ def backward_loop( `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -529,12 +529,12 @@ def __call__( num_videos_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, motion_field_strength_x: float = 12, motion_field_strength_y: float = 12, output_type: Optional[str] = "tensor", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: Optional[int] = 1, t0: int = 44, t1: int = 47, @@ -569,7 +569,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. @@ -581,7 +581,7 @@ def __call__( a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -795,8 +795,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -816,10 +816,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py index 2dbd928b916e..2679fd1ea5d5 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py @@ -581,10 +581,10 @@ def encode_prompt( do_classifier_free_guidance: bool = True, negative_prompt: Optional[str] = None, negative_prompt_2: Optional[str] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -610,17 +610,17 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -861,7 +861,7 @@ def backward_loop( `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. @@ -933,16 +933,16 @@ def __call__( eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, frame_ids: Optional[List[int]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, - latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + latents: Optional[torch.Tensor] = None, motion_field_strength_x: float = 12, motion_field_strength_y: float = 12, output_type: Optional[str] = "tensor", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, @@ -1002,21 +1002,21 @@ def __call__( frame_ids (`List[int]`, *optional*): Indexes of the frames that are being generated. This is used when generating longer videos chunk-by-chunk. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_pooled_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. @@ -1034,7 +1034,7 @@ def __call__( of a plain tuple. callback (`Callable`, *optional*): A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip.py b/src/diffusers/pipelines/unclip/pipeline_unclip.py index 72e5b311394e..25c6739d8720 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip.py @@ -217,9 +217,9 @@ def __call__( decoder_num_inference_steps: int = 25, super_res_num_inference_steps: int = 7, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - prior_latents: Optional[torch.FloatTensor] = None, - decoder_latents: Optional[torch.FloatTensor] = None, - super_res_latents: Optional[torch.FloatTensor] = None, + prior_latents: Optional[torch.Tensor] = None, + decoder_latents: Optional[torch.Tensor] = None, + super_res_latents: Optional[torch.Tensor] = None, text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, text_attention_mask: Optional[torch.Tensor] = None, prior_guidance_scale: float = 4.0, @@ -248,11 +248,11 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - prior_latents (`torch.FloatTensor` of shape (batch size, embeddings dimension), *optional*): + prior_latents (`torch.Tensor` of shape (batch size, embeddings dimension), *optional*): Pre-generated noisy latents to be used as inputs for the prior. - decoder_latents (`torch.FloatTensor` of shape (batch size, channels, height, width), *optional*): + decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. - super_res_latents (`torch.FloatTensor` of shape (batch size, channels, super res height, super res width), *optional*): + super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. prior_guidance_scale (`float`, *optional*, defaults to 4.0): A higher guidance scale value encourages the model to generate images closely linked to the text diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py index 6c646a7df362..2a0e7e90e4d2 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py @@ -199,13 +199,13 @@ def _encode_image(self, image, device, num_images_per_prompt, image_embeddings: @torch.no_grad() def __call__( self, - image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor]] = None, + image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor]] = None, num_images_per_prompt: int = 1, decoder_num_inference_steps: int = 25, super_res_num_inference_steps: int = 7, generator: Optional[torch.Generator] = None, - decoder_latents: Optional[torch.FloatTensor] = None, - super_res_latents: Optional[torch.FloatTensor] = None, + decoder_latents: Optional[torch.Tensor] = None, + super_res_latents: Optional[torch.Tensor] = None, image_embeddings: Optional[torch.Tensor] = None, decoder_guidance_scale: float = 8.0, output_type: Optional[str] = "pil", @@ -215,7 +215,7 @@ def __call__( The call function to the pipeline for generation. Args: - image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`): `Image` or tensor representing an image batch to be used as the starting point. If you provide a tensor, it needs to be compatible with the [`CLIPImageProcessor`] [configuration](https://huggingface.co/fusing/karlo-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). @@ -231,9 +231,9 @@ def __call__( generator (`torch.Generator`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - decoder_latents (`torch.FloatTensor` of shape (batch size, channels, height, width), *optional*): + decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. - super_res_latents (`torch.FloatTensor` of shape (batch size, channels, super res height, super res width), *optional*): + super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*): Pre-generated noisy latents to be used as inputs for the decoder. decoder_guidance_scale (`float`, *optional*, defaults to 4.0): A higher guidance scale value encourages the model to generate images closely linked to the text diff --git a/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py b/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py index bf0a4eb475c0..75e5d43678d5 100644 --- a/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py +++ b/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py @@ -220,7 +220,7 @@ def generate_beam( input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): Tokenizer indices of input sequence tokens in the vocabulary. One of `input_ids` and `input_embeds` must be supplied. - input_embeds (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): + input_embeds (`torch.Tensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): An embedded representation to directly pass to the transformer as a prefix for beam search. One of `input_ids` and `input_embeds` must be supplied. device: diff --git a/src/diffusers/pipelines/unidiffuser/modeling_uvit.py b/src/diffusers/pipelines/unidiffuser/modeling_uvit.py index 6579e272a3bf..abc51edf6df1 100644 --- a/src/diffusers/pipelines/unidiffuser/modeling_uvit.py +++ b/src/diffusers/pipelines/unidiffuser/modeling_uvit.py @@ -739,8 +739,7 @@ def forward( """ Args: hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. - When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input - hidden_states + When continuous, `torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to self-attention. @@ -1038,9 +1037,9 @@ def no_weight_decay(self): def forward( self, - latent_image_embeds: torch.FloatTensor, - image_embeds: torch.FloatTensor, - prompt_embeds: torch.FloatTensor, + latent_image_embeds: torch.Tensor, + image_embeds: torch.Tensor, + prompt_embeds: torch.Tensor, timestep_img: Union[torch.Tensor, float, int], timestep_text: Union[torch.Tensor, float, int], data_type: Optional[Union[torch.Tensor, float, int]] = 1, @@ -1049,11 +1048,11 @@ def forward( ): """ Args: - latent_image_embeds (`torch.FloatTensor` of shape `(batch size, latent channels, height, width)`): + latent_image_embeds (`torch.Tensor` of shape `(batch size, latent channels, height, width)`): Latent image representation from the VAE encoder. - image_embeds (`torch.FloatTensor` of shape `(batch size, 1, clip_img_dim)`): + image_embeds (`torch.Tensor` of shape `(batch size, 1, clip_img_dim)`): CLIP-embedded image representation (unsqueezed in the first dimension). - prompt_embeds (`torch.FloatTensor` of shape `(batch size, seq_len, text_dim)`): + prompt_embeds (`torch.Tensor` of shape `(batch size, seq_len, text_dim)`): CLIP-embedded text representation. timestep_img (`torch.long` or `float` or `int`): Current denoising step for the image. diff --git a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py index 5d61b1054e1c..44a03ef56eec 100644 --- a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -304,7 +304,7 @@ def _infer_batch_size( if isinstance(image, PIL.Image.Image): batch_size = 1 else: - # Image must be available and type either PIL.Image.Image or torch.FloatTensor. + # Image must be available and type either PIL.Image.Image or torch.Tensor. # Not currently supporting something like image_embeds. batch_size = image.shape[0] multiplier = num_prompts_per_image @@ -353,8 +353,8 @@ def _encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, **kwargs, ): @@ -386,8 +386,8 @@ def encode_prompt( num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, lora_scale: Optional[float] = None, clip_skip: Optional[int] = None, ): @@ -407,10 +407,10 @@ def encode_prompt( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -1080,7 +1080,7 @@ def check_inputs( def __call__( self, prompt: Optional[Union[str, List[str]]] = None, - image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None, + image: Optional[Union[torch.Tensor, PIL.Image.Image]] = None, height: Optional[int] = None, width: Optional[int] = None, data_type: Optional[int] = 1, @@ -1091,15 +1091,15 @@ def __call__( num_prompts_per_image: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_latents: Optional[torch.FloatTensor] = None, - vae_latents: Optional[torch.FloatTensor] = None, - clip_latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, + prompt_latents: Optional[torch.Tensor] = None, + vae_latents: Optional[torch.Tensor] = None, + clip_latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback: Optional[Callable[[int, int, torch.Tensor], None]] = None, callback_steps: int = 1, ): r""" @@ -1109,7 +1109,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. Required for text-conditioned image generation (`text2img`) mode. - image (`torch.FloatTensor` or `PIL.Image.Image`, *optional*): + image (`torch.Tensor` or `PIL.Image.Image`, *optional*): `Image` or tensor representing an image batch. Required for image-conditioned text generation (`img2text`) mode. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -1144,29 +1144,29 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for joint image-text generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. This assumes a full set of VAE, CLIP, and text latents, if supplied, overrides the value of `prompt_latents`, `vae_latents`, and `clip_latents`. - prompt_latents (`torch.FloatTensor`, *optional*): + prompt_latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for text generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - vae_latents (`torch.FloatTensor`, *optional*): + vae_latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - clip_latents (`torch.FloatTensor`, *optional*): + clip_latents (`torch.Tensor`, *optional*): Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. Used in text-conditioned image generation (`text2img`) mode. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are be generated from the `negative_prompt` input argument. Used in text-conditioned image generation (`text2img`) mode. @@ -1176,7 +1176,7 @@ def __call__( Whether or not to return a [`~pipelines.ImageTextPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`. callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function is called. If not specified, the callback is called at every step. diff --git a/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py b/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py index 3b21dfb5f1f0..b2cf8cbc978c 100644 --- a/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py +++ b/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py @@ -130,7 +130,7 @@ def __init__( ) @apply_forward_hook - def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: + def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput: h = self.in_block(x) h = self.down_blocks(h) @@ -141,8 +141,8 @@ def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOut @apply_forward_hook def decode( - self, h: torch.FloatTensor, force_not_quantize: bool = True, return_dict: bool = True - ) -> Union[DecoderOutput, torch.FloatTensor]: + self, h: torch.Tensor, force_not_quantize: bool = True, return_dict: bool = True + ) -> Union[DecoderOutput, torch.Tensor]: if not force_not_quantize: quant, _, _ = self.vquantizer(h) else: @@ -155,10 +155,10 @@ def decode( return DecoderOutput(sample=dec) - def forward(self, sample: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + def forward(self, sample: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: r""" Args: - sample (`torch.FloatTensor`): Input sample. + sample (`torch.Tensor`): Input sample. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. """ diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py index e4277d58a006..b08421415b23 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py @@ -209,7 +209,7 @@ def num_timesteps(self): @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - image_embeddings: Union[torch.FloatTensor, List[torch.FloatTensor]], + image_embeddings: Union[torch.Tensor, List[torch.Tensor]], prompt: Union[str, List[str]] = None, num_inference_steps: int = 12, timesteps: Optional[List[float]] = None, @@ -217,7 +217,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -228,7 +228,7 @@ def __call__( Function invoked when calling the pipeline for generation. Args: - image_embedding (`torch.FloatTensor` or `List[torch.FloatTensor]`): + image_embedding (`torch.Tensor` or `List[torch.Tensor]`): Image Embeddings either extracted from an image or generated by a Prior Model. prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. @@ -252,7 +252,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py index da6b5e0258bb..7819c8c0a0ef 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py @@ -154,11 +154,11 @@ def __call__( decoder_timesteps: Optional[List[float]] = None, decoder_guidance_scale: float = 0.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, num_images_per_prompt: int = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -176,10 +176,10 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -218,7 +218,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py index 4640f7696766..4dddd18c30d5 100644 --- a/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py +++ b/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py @@ -54,12 +54,12 @@ class WuerstchenPriorPipelineOutput(BaseOutput): Output class for WuerstchenPriorPipeline. Args: - image_embeddings (`torch.FloatTensor` or `np.ndarray`) + image_embeddings (`torch.Tensor` or `np.ndarray`) Prior image embeddings for text prompt """ - image_embeddings: Union[torch.FloatTensor, np.ndarray] + image_embeddings: Union[torch.Tensor, np.ndarray] class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin): @@ -136,8 +136,8 @@ def encode_prompt( do_classifier_free_guidance, prompt=None, negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, ): if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -288,11 +288,11 @@ def __call__( timesteps: List[float] = None, guidance_scale: float = 8.0, negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, + latents: Optional[torch.Tensor] = None, output_type: Optional[str] = "pt", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -324,10 +324,10 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `decoder_guidance_scale` is less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): + prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): + negative_prompt_embeds (`torch.Tensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. @@ -336,7 +336,7 @@ def __call__( generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): + latents (`torch.Tensor`, *optional*): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. diff --git a/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py b/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py index d776d989a143..f5f9bd256c2e 100644 --- a/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py +++ b/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py @@ -31,19 +31,19 @@ class KarrasVeOutput(BaseOutput): Output class for the scheduler's step function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - derivative (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + derivative (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Derivative of predicted original image sample (x_0). - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample (x_{0}) based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - derivative: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + derivative: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None class KarrasVeScheduler(SchedulerMixin, ConfigMixin): @@ -94,21 +94,21 @@ def __init__( # setable values self.num_inference_steps: int = None self.timesteps: np.IntTensor = None - self.schedule: torch.FloatTensor = None # sigma(t_i) + self.schedule: torch.Tensor = None # sigma(t_i) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -136,14 +136,14 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.schedule = torch.tensor(schedule, dtype=torch.float32, device=device) def add_noise_to_input( - self, sample: torch.FloatTensor, sigma: float, generator: Optional[torch.Generator] = None - ) -> Tuple[torch.FloatTensor, float]: + self, sample: torch.Tensor, sigma: float, generator: Optional[torch.Generator] = None + ) -> Tuple[torch.Tensor, float]: """ Explicit Langevin-like "churn" step of adding noise to the sample according to a `gamma_i ≥ 0` to reach a higher noise level `sigma_hat = sigma_i + gamma_i*sigma_i`. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. sigma (`float`): generator (`torch.Generator`, *optional*): @@ -163,10 +163,10 @@ def add_noise_to_input( def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, sigma_hat: float, sigma_prev: float, - sample_hat: torch.FloatTensor, + sample_hat: torch.Tensor, return_dict: bool = True, ) -> Union[KarrasVeOutput, Tuple]: """ @@ -174,11 +174,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. sigma_hat (`float`): sigma_prev (`float`): - sample_hat (`torch.FloatTensor`): + sample_hat (`torch.Tensor`): return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] or `tuple`. @@ -202,25 +202,25 @@ def step( def step_correct( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, sigma_hat: float, sigma_prev: float, - sample_hat: torch.FloatTensor, - sample_prev: torch.FloatTensor, - derivative: torch.FloatTensor, + sample_hat: torch.Tensor, + sample_prev: torch.Tensor, + derivative: torch.Tensor, return_dict: bool = True, ) -> Union[KarrasVeOutput, Tuple]: """ Corrects the predicted sample based on the `model_output` of the network. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. sigma_hat (`float`): TODO sigma_prev (`float`): TODO - sample_hat (`torch.FloatTensor`): TODO - sample_prev (`torch.FloatTensor`): TODO - derivative (`torch.FloatTensor`): TODO + sample_hat (`torch.Tensor`): TODO + sample_prev (`torch.Tensor`): TODO + derivative (`torch.Tensor`): TODO return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`. diff --git a/src/diffusers/schedulers/scheduling_amused.py b/src/diffusers/schedulers/scheduling_amused.py index 51fbe6a4dc7d..238b8d869171 100644 --- a/src/diffusers/schedulers/scheduling_amused.py +++ b/src/diffusers/schedulers/scheduling_amused.py @@ -29,16 +29,16 @@ class AmusedSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: torch.FloatTensor = None + prev_sample: torch.Tensor + pred_original_sample: torch.Tensor = None class AmusedScheduler(SchedulerMixin, ConfigMixin): @@ -70,7 +70,7 @@ def set_timesteps( def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: torch.long, sample: torch.LongTensor, starting_mask_ratio: int = 1, diff --git a/src/diffusers/schedulers/scheduling_consistency_decoder.py b/src/diffusers/schedulers/scheduling_consistency_decoder.py index 37efb7dc7cc0..d7af018b284a 100644 --- a/src/diffusers/schedulers/scheduling_consistency_decoder.py +++ b/src/diffusers/schedulers/scheduling_consistency_decoder.py @@ -61,12 +61,12 @@ class ConsistencyDecoderSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. """ - prev_sample: torch.FloatTensor + prev_sample: torch.Tensor class ConsistencyDecoderScheduler(SchedulerMixin, ConfigMixin): @@ -113,28 +113,28 @@ def set_timesteps( def init_noise_sigma(self): return self.sqrt_one_minus_alphas_cumprod[self.timesteps[0]] - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample * self.c_in[timestep] def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[ConsistencyDecoderSchedulerOutput, Tuple]: @@ -143,11 +143,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`float`): The current timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index 3abf52f0afd7..8b8da5d110d2 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -33,12 +33,12 @@ class CMStochasticIterativeSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. """ - prev_sample: torch.FloatTensor + prev_sample: torch.Tensor class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin): @@ -126,20 +126,18 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Scales the consistency model input by `(sigma**2 + sigma_data**2) ** 0.5`. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. - timestep (`float` or `torch.FloatTensor`): + timestep (`float` or `torch.Tensor`): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ # Get sigma corresponding to timestep @@ -278,7 +276,7 @@ def get_scalings_for_boundary_condition(self, sigma): Args: - sigma (`torch.FloatTensor`): + sigma (`torch.Tensor`): The current sigma in the Karras sigma schedule. Returns: @@ -319,9 +317,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[CMStochasticIterativeSchedulerOutput, Tuple]: @@ -330,11 +328,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`float`): The current timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -417,10 +415,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 718514abf9d8..1c4b55e89a03 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -35,16 +35,16 @@ class DDIMSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -98,11 +98,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -233,19 +233,19 @@ def __init__( self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -261,7 +261,7 @@ def _get_variance(self, timestep, prev_timestep): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -341,13 +341,13 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[DDIMSchedulerOutput, Tuple]: """ @@ -355,11 +355,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): The weight of noise for added noise in diffusion step. @@ -370,7 +370,7 @@ def step( `use_clipped_model_output` has no effect. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. return_dict (`bool`, *optional*, defaults to `True`): @@ -470,10 +470,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -495,9 +495,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index 9ca6ed4aca63..1648f4953069 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -33,16 +33,16 @@ class DDIMSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -97,11 +97,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -231,19 +231,19 @@ def __init__( self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps).copy().astype(np.int64)) # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.scale_model_input - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -288,9 +288,9 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[DDIMSchedulerOutput, Tuple]: """ @@ -298,11 +298,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): The weight of noise for added noise in diffusion step. @@ -311,7 +311,7 @@ def step( because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would coincide with the one provided as input and `use_clipped_model_output` has no effect. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index 995478f273d1..de13cd307769 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -35,16 +35,16 @@ class DDIMParallelSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -99,11 +99,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -241,19 +241,19 @@ def __init__( self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.scale_model_input - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -283,7 +283,7 @@ def _batch_get_variance(self, t, prev_t): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -364,13 +364,13 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[DDIMParallelSchedulerOutput, Tuple]: """ @@ -378,9 +378,9 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. eta (`float`): weight of noise for added noise in diffusion step. use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped @@ -388,7 +388,7 @@ def step( `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would coincide with the one provided as input and `use_clipped_model_output` will have not effect. generator: random number generator. - variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we + variance_noise (`torch.Tensor`): instead of generating noise for the variance using `generator`, we can directly provide the noise for the variance itself. This is useful for methods such as CycleDiffusion. (https://arxiv.org/abs/2210.05559) return_dict (`bool`): option for returning tuple rather than DDIMParallelSchedulerOutput class @@ -486,12 +486,12 @@ def step( def batch_step_no_noise( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timesteps: List[int], - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.0, use_clipped_model_output: bool = False, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Batched version of the `step` function, to be able to reverse the SDE for multiple samples/timesteps at once. Also, does not add any noise to the predicted sample, which is necessary for parallel sampling where the noise @@ -501,10 +501,10 @@ def batch_step_no_noise( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timesteps (`List[int]`): current discrete timesteps in the diffusion chain. This is now a list of integers. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. eta (`float`): weight of noise for added noise in diffusion step. use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped @@ -513,7 +513,7 @@ def batch_step_no_noise( coincide with the one provided as input and `use_clipped_model_output` will have not effect. Returns: - `torch.FloatTensor`: sample tensor at previous timestep. + `torch.Tensor`: sample tensor at previous timestep. """ if self.num_inference_steps is None: @@ -595,10 +595,10 @@ def batch_step_no_noise( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -620,9 +620,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 978fb63737e1..d35da4bcd811 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -33,16 +33,16 @@ class DDPMSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None def betas_for_alpha_bar( @@ -96,11 +96,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -231,19 +231,19 @@ def __init__( self.variance_type = variance_type - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -363,7 +363,7 @@ def _get_variance(self, t, predicted_variance=None, variance_type=None): return variance - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -398,9 +398,9 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[DDPMSchedulerOutput, Tuple]: @@ -409,11 +409,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -498,10 +498,10 @@ def step( def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -522,9 +522,7 @@ def add_noise( noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index 30fbad29f1e6..583d8ba8b46b 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -34,16 +34,16 @@ class DDPMParallelSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -98,11 +98,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -240,19 +240,19 @@ def __init__( self.variance_type = variance_type # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.scale_model_input - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -375,7 +375,7 @@ def _get_variance(self, t, predicted_variance=None, variance_type=None): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -410,9 +410,9 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[DDPMParallelSchedulerOutput, Tuple]: @@ -421,9 +421,9 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. generator: random number generator. return_dict (`bool`): option for returning tuple rather than DDPMParallelSchedulerOutput class @@ -506,10 +506,10 @@ def step( def batch_step_no_noise( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timesteps: List[int], - sample: torch.FloatTensor, - ) -> torch.FloatTensor: + sample: torch.Tensor, + ) -> torch.Tensor: """ Batched version of the `step` function, to be able to reverse the SDE for multiple samples/timesteps at once. Also, does not add any noise to the predicted sample, which is necessary for parallel sampling where the noise @@ -519,14 +519,14 @@ def batch_step_no_noise( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timesteps (`List[int]`): current discrete timesteps in the diffusion chain. This is now a list of integers. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. Returns: - `torch.FloatTensor`: sample tensor at previous timestep. + `torch.Tensor`: sample tensor at previous timestep. """ t = timesteps num_inference_steps = self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps @@ -587,10 +587,10 @@ def batch_step_no_noise( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -612,9 +612,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py index 6a0f4f5efe3c..71b5669b0528 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py +++ b/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py @@ -33,12 +33,12 @@ class DDPMWuerstchenSchedulerOutput(BaseOutput): Output class for the scheduler's step function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the denoising loop. """ - prev_sample: torch.FloatTensor + prev_sample: torch.Tensor def betas_for_alpha_bar( @@ -125,17 +125,17 @@ def _alpha_cumprod(self, t, device): ) ** 2 / self._init_alpha_cumprod.to(device) return alpha_cumprod.clamp(0.0001, 0.9999) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): input sample + sample (`torch.Tensor`): input sample timestep (`int`, optional): current timestep Returns: - `torch.FloatTensor`: scaled input sample + `torch.Tensor`: scaled input sample """ return sample @@ -163,9 +163,9 @@ def set_timesteps( def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[DDPMWuerstchenSchedulerOutput, Tuple]: @@ -174,9 +174,9 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. generator: random number generator. return_dict (`bool`): option for returning tuple rather than DDPMWuerstchenSchedulerOutput class @@ -209,10 +209,10 @@ def step( def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: device = original_samples.device dtype = original_samples.dtype alpha_cumprod = self._alpha_cumprod(timesteps, device=device).view( diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index f25fbf029bd4..ea34d1bd2cf7 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -276,7 +276,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -341,7 +341,7 @@ def _sigma_to_alpha_sigma_t(self, sigma): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -368,24 +368,24 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DEIS algorithm needs. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -425,26 +425,26 @@ def convert_model_output( def deis_first_order_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the first-order DEIS (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -483,22 +483,22 @@ def deis_first_order_update( def multistep_deis_second_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the second-order multistep DEIS. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -552,22 +552,22 @@ def ind_fn(t, b, c): def multistep_deis_third_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the third-order multistep DEIS. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ @@ -673,9 +673,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -683,11 +683,11 @@ def step( the multistep DEIS. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. @@ -736,17 +736,17 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -754,10 +754,10 @@ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index cd54148b4370..0b370e689d50 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -78,11 +78,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -408,7 +408,7 @@ def set_timesteps( self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -472,7 +472,7 @@ def _sigma_to_alpha_sigma_t(self, sigma): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -497,7 +497,7 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho return sigmas - def _convert_to_lu(self, in_lambdas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_lu(self, in_lambdas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Lu et al. (2022).""" lambda_min: float = in_lambdas[-1].item() @@ -512,11 +512,11 @@ def _convert_to_lu(self, in_lambdas: torch.FloatTensor, num_inference_steps) -> def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an @@ -530,13 +530,13 @@ def convert_model_output( Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -611,23 +611,23 @@ def convert_model_output( def dpm_solver_first_order_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the first-order DPMSolver (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -680,23 +680,23 @@ def dpm_solver_first_order_update( def multistep_dpm_solver_second_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the second-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -803,22 +803,22 @@ def multistep_dpm_solver_second_order_update( def multistep_dpm_solver_third_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the third-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ @@ -919,11 +919,11 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -931,15 +931,15 @@ def step( the multistep DPMSolver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`LEdits++`]. return_dict (`bool`): @@ -1006,27 +1006,27 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 428eaea6a6b8..9b2cd03bdbab 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -295,7 +295,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -360,7 +360,7 @@ def _sigma_to_alpha_sigma_t(self, sigma): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -388,11 +388,11 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.convert_model_output def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an @@ -406,13 +406,13 @@ def convert_model_output( Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -488,23 +488,23 @@ def convert_model_output( # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.dpm_solver_first_order_update def dpm_solver_first_order_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the first-order DPMSolver (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -558,23 +558,23 @@ def dpm_solver_first_order_update( # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_second_order_update def multistep_dpm_solver_second_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the second-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -682,22 +682,22 @@ def multistep_dpm_solver_second_order_update( # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_third_order_update def multistep_dpm_solver_third_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the third-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ @@ -786,11 +786,11 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, - variance_noise: Optional[torch.FloatTensor] = None, + variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -798,15 +798,15 @@ def step( the multistep DPMSolver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. - variance_noise (`torch.FloatTensor`): + variance_noise (`torch.Tensor`): Alternative to generating noise with `generator` by directly providing the noise for the variance itself. Useful for methods such as [`CycleDiffusion`]. return_dict (`bool`): @@ -867,27 +867,27 @@ def step( return SchedulerOutput(prev_sample=prev_sample) # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.scale_model_input - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index 54455b0f2e5a..a9dbb69b04b4 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -257,21 +257,21 @@ def set_begin_index(self, begin_index: int = 0): def scale_model_input( self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - ) -> torch.FloatTensor: + sample: torch.Tensor, + timestep: Union[float, torch.Tensor], + ) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -395,7 +395,7 @@ def _sigma_to_t(self, sigma, log_sigmas): return t # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min: float = in_sigmas[-1].item() @@ -414,9 +414,9 @@ def state_in_first_order(self): def step( self, - model_output: Union[torch.FloatTensor, np.ndarray], - timestep: Union[float, torch.FloatTensor], - sample: Union[torch.FloatTensor, np.ndarray], + model_output: Union[torch.Tensor, np.ndarray], + timestep: Union[float, torch.Tensor], + sample: Union[torch.Tensor, np.ndarray], return_dict: bool = True, s_noise: float = 1.0, ) -> Union[SchedulerOutput, Tuple]: @@ -425,11 +425,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor` or `np.ndarray`): + model_output (`torch.Tensor` or `np.ndarray`): The direct output from learned diffusion model. - timestep (`float` or `torch.FloatTensor`): + timestep (`float` or `torch.Tensor`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor` or `np.ndarray`): + sample (`torch.Tensor` or `np.ndarray`): A current instance of a sample created by the diffusion process. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -450,10 +450,10 @@ def step( self.noise_sampler = BrownianTreeNoiseSampler(sample, min_sigma, max_sigma, self.noise_sampler_seed) # Define functions to compute sigma and t from each other - def sigma_fn(_t: torch.FloatTensor) -> torch.FloatTensor: + def sigma_fn(_t: torch.Tensor) -> torch.Tensor: return _t.neg().exp() - def t_fn(_sigma: torch.FloatTensor) -> torch.FloatTensor: + def t_fn(_sigma: torch.Tensor) -> torch.Tensor: return _sigma.log().neg() if self.state_in_first_order: @@ -526,10 +526,10 @@ def t_fn(_sigma: torch.FloatTensor) -> torch.FloatTensor: # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 25118f432925..a274dbd6bc39 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -361,7 +361,7 @@ def set_timesteps( self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -426,7 +426,7 @@ def _sigma_to_alpha_sigma_t(self, sigma): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -453,11 +453,11 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an @@ -471,13 +471,13 @@ def convert_model_output( Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -542,26 +542,26 @@ def convert_model_output( def dpm_solver_first_order_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the first-order DPMSolver (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -598,27 +598,27 @@ def dpm_solver_first_order_update( def singlestep_dpm_solver_second_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the second-order singlestep DPMSolver that computes the solution at time `prev_timestep` from the time `timestep_list[-2]`. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. timestep (`int`): The current and latter discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -692,27 +692,27 @@ def singlestep_dpm_solver_second_order_update( def singlestep_dpm_solver_third_order_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the third-order singlestep DPMSolver that computes the solution at time `prev_timestep` from the time `timestep_list[-3]`. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. timestep (`int`): The current and latter discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ @@ -796,29 +796,29 @@ def singlestep_dpm_solver_third_order_update( def singlestep_dpm_solver_update( self, - model_output_list: List[torch.FloatTensor], + model_output_list: List[torch.Tensor], *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, order: int = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the singlestep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. timestep (`int`): The current and latter discrete timestep in the diffusion chain. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. order (`int`): The solver order at this step. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) @@ -891,9 +891,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -901,11 +901,11 @@ def step( the singlestep DPMSolver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. @@ -950,17 +950,17 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -968,10 +968,10 @@ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index dfc7978a2ee2..ea2165c76893 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -206,21 +206,19 @@ def precondition_outputs(self, sample, model_output, sigma): return denoised # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.scale_model_input - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -276,7 +274,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas - def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min = sigma_min or self.config.sigma_min sigma_max = sigma_max or self.config.sigma_max @@ -289,7 +287,7 @@ def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch. return sigmas # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_exponential_sigmas - def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Implementation closely follows k-diffusion. https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26 @@ -300,7 +298,7 @@ def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> t return sigmas # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -365,9 +363,9 @@ def _sigma_to_alpha_sigma_t(self, sigma): def convert_model_output( self, - model_output: torch.FloatTensor, - sample: torch.FloatTensor = None, - ) -> torch.FloatTensor: + model_output: torch.Tensor, + sample: torch.Tensor = None, + ) -> torch.Tensor: """ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an @@ -381,13 +379,13 @@ def convert_model_output( Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ sigma = self.sigmas[self.step_index] @@ -400,21 +398,21 @@ def convert_model_output( def dpm_solver_first_order_update( self, - model_output: torch.FloatTensor, - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + model_output: torch.Tensor, + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, + ) -> torch.Tensor: """ One step for the first-order DPMSolver (equivalent to DDIM). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index] @@ -438,21 +436,21 @@ def dpm_solver_first_order_update( def multistep_dpm_solver_second_order_update( self, - model_output_list: List[torch.FloatTensor], - sample: torch.FloatTensor = None, - noise: Optional[torch.FloatTensor] = None, - ) -> torch.FloatTensor: + model_output_list: List[torch.Tensor], + sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, + ) -> torch.Tensor: """ One step for the second-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ sigma_t, sigma_s0, sigma_s1 = ( @@ -509,20 +507,20 @@ def multistep_dpm_solver_second_order_update( def multistep_dpm_solver_third_order_update( self, - model_output_list: List[torch.FloatTensor], - sample: torch.FloatTensor = None, - ) -> torch.FloatTensor: + model_output_list: List[torch.Tensor], + sample: torch.Tensor = None, + ) -> torch.Tensor: """ One step for the third-order multistep DPMSolver. Args: - model_output_list (`List[torch.FloatTensor]`): + model_output_list (`List[torch.Tensor]`): The direct outputs from learned diffusion model at current and latter timesteps. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ sigma_t, sigma_s0, sigma_s1, sigma_s2 = ( @@ -596,9 +594,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -607,11 +605,11 @@ def step( the multistep DPMSolver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -675,10 +673,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py index 0ef9263c9e30..b37e6e0fd74a 100644 --- a/src/diffusers/schedulers/scheduling_edm_euler.py +++ b/src/diffusers/schedulers/scheduling_edm_euler.py @@ -35,16 +35,16 @@ class EDMEulerSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None class EDMEulerScheduler(SchedulerMixin, ConfigMixin): @@ -174,21 +174,19 @@ def precondition_outputs(self, sample, model_output, sigma): return denoised - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -227,7 +225,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 - def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min = sigma_min or self.config.sigma_min sigma_max = sigma_max or self.config.sigma_max @@ -239,7 +237,7 @@ def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch. return sigmas - def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor: """Implementation closely follows k-diffusion. https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26 @@ -275,9 +273,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, s_churn: float = 0.0, s_tmin: float = 0.0, s_tmax: float = float("inf"), @@ -290,11 +288,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. s_churn (`float`): s_tmin (`float`): @@ -375,10 +373,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 6631ef63a8c1..085683e56f70 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -35,16 +35,16 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -99,11 +99,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -250,21 +250,19 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ @@ -346,9 +344,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]: @@ -357,11 +355,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -450,10 +448,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 476940d0e4f1..5f9db844ff35 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -35,16 +35,16 @@ class EulerDiscreteSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -99,11 +99,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -274,21 +274,19 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -445,7 +443,7 @@ def _sigma_to_t(self, sigma, log_sigmas): return t # Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -494,9 +492,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, s_churn: float = 0.0, s_tmin: float = 0.0, s_tmax: float = float("inf"), @@ -509,11 +507,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. s_churn (`float`): s_tmin (`float`): @@ -606,10 +604,10 @@ def step( def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): @@ -637,9 +635,7 @@ def add_noise( noisy_samples = original_samples + noise * sigma return noisy_samples - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.FloatTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor: if ( isinstance(timesteps, int) or isinstance(timesteps, torch.IntTensor) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 34b2bd11606b..4ce70dffec2a 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -198,21 +198,21 @@ def set_begin_index(self, begin_index: int = 0): def scale_model_input( self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - ) -> torch.FloatTensor: + sample: torch.Tensor, + timestep: Union[float, torch.Tensor], + ) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -329,7 +329,7 @@ def _sigma_to_t(self, sigma, log_sigmas): return t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -369,9 +369,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: Union[torch.FloatTensor, np.ndarray], - timestep: Union[float, torch.FloatTensor], - sample: Union[torch.FloatTensor, np.ndarray], + model_output: Union[torch.Tensor, np.ndarray], + timestep: Union[float, torch.Tensor], + sample: Union[torch.Tensor, np.ndarray], return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -379,11 +379,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -469,10 +469,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_ipndm.py b/src/diffusers/schedulers/scheduling_ipndm.py index afc8fd940eaf..9f15f1fe0aab 100644 --- a/src/diffusers/schedulers/scheduling_ipndm.py +++ b/src/diffusers/schedulers/scheduling_ipndm.py @@ -137,9 +137,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -147,11 +147,11 @@ def step( the linear multistep method. It performs one forward pass multiple times to approximate the solution. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -193,17 +193,17 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index fd2b94759f08..2ab686fe6371 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -175,21 +175,21 @@ def set_begin_index(self, begin_index: int = 0): def scale_model_input( self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - ) -> torch.FloatTensor: + sample: torch.Tensor, + timestep: Union[float, torch.Tensor], + ) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -321,7 +321,7 @@ def _sigma_to_t(self, sigma, log_sigmas): return t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -376,9 +376,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: Union[torch.FloatTensor, np.ndarray], - timestep: Union[float, torch.FloatTensor], - sample: Union[torch.FloatTensor, np.ndarray], + model_output: Union[torch.Tensor, np.ndarray], + timestep: Union[float, torch.Tensor], + sample: Union[torch.Tensor, np.ndarray], generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -387,11 +387,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -477,10 +477,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index 57a1af6b797a..0672db314753 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -175,21 +175,21 @@ def set_begin_index(self, begin_index: int = 0): def scale_model_input( self, - sample: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - ) -> torch.FloatTensor: + sample: torch.Tensor, + timestep: Union[float, torch.Tensor], + ) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ if self.step_index is None: @@ -334,7 +334,7 @@ def _sigma_to_t(self, sigma, log_sigmas): return t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -361,9 +361,9 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) def step( self, - model_output: Union[torch.FloatTensor, np.ndarray], - timestep: Union[float, torch.FloatTensor], - sample: Union[torch.FloatTensor, np.ndarray], + model_output: Union[torch.Tensor, np.ndarray], + timestep: Union[float, torch.Tensor], + sample: Union[torch.Tensor, np.ndarray], return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -371,11 +371,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -452,10 +452,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_karras_ve_flax.py b/src/diffusers/schedulers/scheduling_karras_ve_flax.py index 4d099604a9e5..0d387b53ac3e 100644 --- a/src/diffusers/schedulers/scheduling_karras_ve_flax.py +++ b/src/diffusers/schedulers/scheduling_karras_ve_flax.py @@ -176,10 +176,10 @@ def step( Args: state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class. - model_output (`torch.FloatTensor` or `np.ndarray`): direct output from learned diffusion model. + model_output (`torch.Tensor` or `np.ndarray`): direct output from learned diffusion model. sigma_hat (`float`): TODO sigma_prev (`float`): TODO - sample_hat (`torch.FloatTensor` or `np.ndarray`): TODO + sample_hat (`torch.Tensor` or `np.ndarray`): TODO return_dict (`bool`): option for returning tuple rather than FlaxKarrasVeOutput class Returns: @@ -213,12 +213,12 @@ def step_correct( Args: state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class. - model_output (`torch.FloatTensor` or `np.ndarray`): direct output from learned diffusion model. + model_output (`torch.Tensor` or `np.ndarray`): direct output from learned diffusion model. sigma_hat (`float`): TODO sigma_prev (`float`): TODO - sample_hat (`torch.FloatTensor` or `np.ndarray`): TODO - sample_prev (`torch.FloatTensor` or `np.ndarray`): TODO - derivative (`torch.FloatTensor` or `np.ndarray`): TODO + sample_hat (`torch.Tensor` or `np.ndarray`): TODO + sample_prev (`torch.Tensor` or `np.ndarray`): TODO + derivative (`torch.Tensor` or `np.ndarray`): TODO return_dict (`bool`): option for returning tuple rather than FlaxKarrasVeOutput class Returns: diff --git a/src/diffusers/schedulers/scheduling_lcm.py b/src/diffusers/schedulers/scheduling_lcm.py index f15fe0adf261..e73ce3e420f6 100644 --- a/src/diffusers/schedulers/scheduling_lcm.py +++ b/src/diffusers/schedulers/scheduling_lcm.py @@ -37,16 +37,16 @@ class LCMSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - denoised: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + denoised: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -95,17 +95,17 @@ def alpha_bar_fn(t): # Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr -def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor: +def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor: """ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -296,24 +296,24 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -497,9 +497,9 @@ def get_scalings_for_boundary_condition_discrete(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[LCMSchedulerOutput, Tuple]: @@ -508,11 +508,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`float`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -594,10 +594,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -619,9 +619,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py index 61a91783c25b..272e9b8572c3 100644 --- a/src/diffusers/schedulers/scheduling_lms_discrete.py +++ b/src/diffusers/schedulers/scheduling_lms_discrete.py @@ -32,16 +32,16 @@ class LMSDiscreteSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -202,21 +202,19 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def scale_model_input( - self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] - ) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. - timestep (`float` or `torch.FloatTensor`): + timestep (`float` or `torch.Tensor`): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ @@ -351,7 +349,7 @@ def _sigma_to_t(self, sigma, log_sigmas): return t # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min: float = in_sigmas[-1].item() @@ -366,9 +364,9 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor) -> torch.FloatTensor: def step( self, - model_output: torch.FloatTensor, - timestep: Union[float, torch.FloatTensor], - sample: torch.FloatTensor, + model_output: torch.Tensor, + timestep: Union[float, torch.Tensor], + sample: torch.Tensor, order: int = 4, return_dict: bool = True, ) -> Union[LMSDiscreteSchedulerOutput, Tuple]: @@ -377,11 +375,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. - timestep (`float` or `torch.FloatTensor`): + timestep (`float` or `torch.Tensor`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. order (`int`, defaults to 4): The order of the linear multistep method. @@ -444,10 +442,10 @@ def step( # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py index e7c861ea3831..baf29d12c0c7 100644 --- a/src/diffusers/schedulers/scheduling_pndm.py +++ b/src/diffusers/schedulers/scheduling_pndm.py @@ -225,9 +225,9 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -236,11 +236,11 @@ def step( or [`~PNDMScheduler.step_plms`] depending on the internal variable `counter`. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. @@ -258,9 +258,9 @@ def step( def step_prk( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -269,11 +269,11 @@ def step_prk( equation. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -318,9 +318,9 @@ def step_prk( def step_plms( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -328,11 +328,11 @@ def step_plms( the linear multistep method. It performs one forward pass multiple times to approximate the solution. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. @@ -387,17 +387,17 @@ def step_plms( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -448,10 +448,10 @@ def _get_prev_sample(self, sample, timestep, prev_timestep, model_output): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py index ccd3d2743192..72175e7e0270 100644 --- a/src/diffusers/schedulers/scheduling_repaint.py +++ b/src/diffusers/schedulers/scheduling_repaint.py @@ -31,16 +31,16 @@ class RePaintSchedulerOutput(BaseOutput): Output class for the scheduler's step function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample (x_{0}) based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: torch.FloatTensor + prev_sample: torch.Tensor + pred_original_sample: torch.Tensor # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -160,19 +160,19 @@ def __init__( self.eta = eta - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -245,11 +245,11 @@ def _get_variance(self, t): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, - original_image: torch.FloatTensor, - mask: torch.FloatTensor, + sample: torch.Tensor, + original_image: torch.Tensor, + mask: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[RePaintSchedulerOutput, Tuple]: @@ -258,15 +258,15 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. - original_image (`torch.FloatTensor`): + original_image (`torch.Tensor`): The original image to inpaint on. - mask (`torch.FloatTensor`): + mask (`torch.Tensor`): The mask where a value of 0.0 indicates which part of the original image to inpaint. generator (`torch.Generator`, *optional*): A random number generator. @@ -351,10 +351,10 @@ def undo_step(self, sample, timestep, generator=None): def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: raise NotImplementedError("Use `DDPMScheduler.add_noise()` to train for sampling with RePaint.") def __len__(self): diff --git a/src/diffusers/schedulers/scheduling_sasolver.py b/src/diffusers/schedulers/scheduling_sasolver.py index b8d95c609bf1..94243aff14ea 100644 --- a/src/diffusers/schedulers/scheduling_sasolver.py +++ b/src/diffusers/schedulers/scheduling_sasolver.py @@ -305,7 +305,7 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -370,7 +370,7 @@ def _sigma_to_alpha_sigma_t(self, sigma): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -397,11 +397,11 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs. Noise_prediction is designed to discretize an integral of the noise prediction model, and data_prediction is @@ -415,13 +415,13 @@ def convert_model_output( Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -686,29 +686,29 @@ def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, def stochastic_adams_bashforth_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor, - noise: torch.FloatTensor, + sample: torch.Tensor, + noise: torch.Tensor, order: int, - tau: torch.FloatTensor, + tau: torch.Tensor, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the SA-Predictor. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model at the current timestep. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. order (`int`): The order of SA-Predictor at this timestep. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None) @@ -813,32 +813,32 @@ def stochastic_adams_bashforth_update( def stochastic_adams_moulton_update( self, - this_model_output: torch.FloatTensor, + this_model_output: torch.Tensor, *args, - last_sample: torch.FloatTensor, - last_noise: torch.FloatTensor, - this_sample: torch.FloatTensor, + last_sample: torch.Tensor, + last_noise: torch.Tensor, + this_sample: torch.Tensor, order: int, - tau: torch.FloatTensor, + tau: torch.Tensor, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the SA-Corrector. Args: - this_model_output (`torch.FloatTensor`): + this_model_output (`torch.Tensor`): The model outputs at `x_t`. this_timestep (`int`): The current timestep `t`. - last_sample (`torch.FloatTensor`): + last_sample (`torch.Tensor`): The generated sample before the last predictor `x_{t-1}`. - this_sample (`torch.FloatTensor`): + this_sample (`torch.Tensor`): The generated sample after the last predictor `x_{t}`. order (`int`): The order of SA-Corrector at this step. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The corrected sample tensor at the current timestep. """ @@ -979,9 +979,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator=None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -990,11 +990,11 @@ def step( the SA-Solver. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -1079,17 +1079,17 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -1097,10 +1097,10 @@ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py b/src/diffusers/schedulers/scheduling_sde_ve.py index 8f8dd1877331..cedfbf7d6ad5 100644 --- a/src/diffusers/schedulers/scheduling_sde_ve.py +++ b/src/diffusers/schedulers/scheduling_sde_ve.py @@ -32,15 +32,15 @@ class SdeVeOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - prev_sample_mean (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample_mean (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Mean averaged `prev_sample` over previous timesteps. """ - prev_sample: torch.FloatTensor - prev_sample_mean: torch.FloatTensor + prev_sample: torch.Tensor + prev_sample_mean: torch.Tensor class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): @@ -86,19 +86,19 @@ def __init__( self.set_sigmas(num_train_timesteps, sigma_min, sigma_max, sampling_eps) - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -159,9 +159,9 @@ def get_adjacent_sigma(self, timesteps, t): def step_pred( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[SdeVeOutput, Tuple]: @@ -170,11 +170,11 @@ def step_pred( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -227,8 +227,8 @@ def step_pred( def step_correct( self, - model_output: torch.FloatTensor, - sample: torch.FloatTensor, + model_output: torch.Tensor, + sample: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: @@ -237,9 +237,9 @@ def step_correct( making the prediction for the previous timestep. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. generator (`torch.Generator`, *optional*): A random number generator. @@ -282,10 +282,10 @@ def step_correct( def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, - timesteps: torch.FloatTensor, - ) -> torch.FloatTensor: + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples timesteps = timesteps.to(original_samples.device) sigmas = self.discrete_sigmas.to(original_samples.device)[timesteps] diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py index 0216b7afc80a..09aaa2531ac5 100644 --- a/src/diffusers/schedulers/scheduling_tcd.py +++ b/src/diffusers/schedulers/scheduling_tcd.py @@ -37,15 +37,15 @@ class TCDSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_noised_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_noised_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted noised sample `(x_{s})` based on the model output from the current timestep. """ - prev_sample: torch.FloatTensor - pred_noised_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_noised_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -94,17 +94,17 @@ def alpha_bar_fn(t): # Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr -def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor: +def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor: """ Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -297,19 +297,19 @@ def set_begin_index(self, begin_index: int = 0): """ self._begin_index = begin_index - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. timestep (`int`, *optional*): The current timestep in the diffusion chain. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -326,7 +326,7 @@ def _get_variance(self, timestep, prev_timestep): return variance # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -524,9 +524,9 @@ def set_timesteps( def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, eta: float = 0.3, generator: Optional[torch.Generator] = None, return_dict: bool = True, @@ -536,11 +536,11 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. eta (`float`): A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every @@ -631,10 +631,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls @@ -656,9 +656,7 @@ def add_noise( return noisy_samples # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity( - self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor - ) -> torch.FloatTensor: + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index c99e97cd85dd..6e1580290f22 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -32,16 +32,16 @@ class UnCLIPSchedulerOutput(BaseOutput): Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. """ - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -146,17 +146,17 @@ def __init__( self.variance_type = variance_type - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): input sample + sample (`torch.Tensor`): input sample timestep (`int`, optional): current timestep Returns: - `torch.FloatTensor`: scaled input sample + `torch.Tensor`: scaled input sample """ return sample @@ -215,9 +215,9 @@ def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, prev_timestep: Optional[int] = None, generator=None, return_dict: bool = True, @@ -227,9 +227,9 @@ def step( process from the learned model outputs (most often the predicted noise). Args: - model_output (`torch.FloatTensor`): direct output from learned diffusion model. + model_output (`torch.Tensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): current instance of sample being created by diffusion process. prev_timestep (`int`, *optional*): The previous timestep to predict the previous sample at. Used to dynamically compute beta. If not given, `t-1` is used and the pre-computed beta is used. @@ -327,10 +327,10 @@ def step( # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # for the subsequent add_noise calls diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 74e97a33f1e2..4517ee86e927 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -78,11 +78,11 @@ def rescale_zero_terminal_snr(betas): Args: - betas (`torch.FloatTensor`): + betas (`torch.Tensor`): the betas that the scheduler is being initialized with. Returns: - `torch.FloatTensor`: rescaled betas with zero terminal SNR + `torch.Tensor`: rescaled betas with zero terminal SNR """ # Convert betas to alphas_bar_sqrt alphas = 1.0 - betas @@ -360,7 +360,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample - def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: """ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by @@ -425,7 +425,7 @@ def _sigma_to_alpha_sigma_t(self, sigma): return alpha_t, sigma_t # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras - def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" # Hack to make sure that other schedulers which copy this function don't break @@ -452,24 +452,24 @@ def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) def convert_model_output( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Convert the model output to the corresponding type the UniPC algorithm needs. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The converted model output. """ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) @@ -522,27 +522,27 @@ def convert_model_output( def multistep_uni_p_bh_update( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, *args, - sample: torch.FloatTensor = None, + sample: torch.Tensor = None, order: int = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from the learned diffusion model at the current timestep. prev_timestep (`int`): The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. order (`int`): The order of UniP at this timestep (corresponds to the *p* in UniPC-p). Returns: - `torch.FloatTensor`: + `torch.Tensor`: The sample tensor at the previous timestep. """ prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None) @@ -651,30 +651,30 @@ def multistep_uni_p_bh_update( def multistep_uni_c_bh_update( self, - this_model_output: torch.FloatTensor, + this_model_output: torch.Tensor, *args, - last_sample: torch.FloatTensor = None, - this_sample: torch.FloatTensor = None, + last_sample: torch.Tensor = None, + this_sample: torch.Tensor = None, order: int = None, **kwargs, - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ One step for the UniC (B(h) version). Args: - this_model_output (`torch.FloatTensor`): + this_model_output (`torch.Tensor`): The model outputs at `x_t`. this_timestep (`int`): The current timestep `t`. - last_sample (`torch.FloatTensor`): + last_sample (`torch.Tensor`): The generated sample before the last predictor `x_{t-1}`. - this_sample (`torch.FloatTensor`): + this_sample (`torch.Tensor`): The generated sample after the last predictor `x_{t}`. order (`int`): The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`. Returns: - `torch.FloatTensor`: + `torch.Tensor`: The corrected sample tensor at the current timestep. """ this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None) @@ -821,9 +821,9 @@ def _init_step_index(self, timestep): def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: int, - sample: torch.FloatTensor, + sample: torch.Tensor, return_dict: bool = True, ) -> Union[SchedulerOutput, Tuple]: """ @@ -831,11 +831,11 @@ def step( the multistep UniPC. Args: - model_output (`torch.FloatTensor`): + model_output (`torch.Tensor`): The direct output from learned diffusion model. timestep (`int`): The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): A current instance of a sample created by the diffusion process. return_dict (`bool`): Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. @@ -900,17 +900,17 @@ def step( return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. Args: - sample (`torch.FloatTensor`): + sample (`torch.Tensor`): The input sample. Returns: - `torch.FloatTensor`: + `torch.Tensor`: A scaled input sample. """ return sample @@ -918,10 +918,10 @@ def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise def add_noise( self, - original_samples: torch.FloatTensor, - noise: torch.FloatTensor, + original_samples: torch.Tensor, + noise: torch.Tensor, timesteps: torch.IntTensor, - ) -> torch.FloatTensor: + ) -> torch.Tensor: # Make sure sigmas and timesteps have the same device and dtype as original_samples sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): diff --git a/src/diffusers/schedulers/scheduling_utils.py b/src/diffusers/schedulers/scheduling_utils.py index 6faf9ee38cd5..33d34e26d89d 100644 --- a/src/diffusers/schedulers/scheduling_utils.py +++ b/src/diffusers/schedulers/scheduling_utils.py @@ -63,12 +63,12 @@ class SchedulerOutput(BaseOutput): Base class for the output of a scheduler's `step` function. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. """ - prev_sample: torch.FloatTensor + prev_sample: torch.Tensor class SchedulerMixin(PushToHubMixin): diff --git a/src/diffusers/schedulers/scheduling_vq_diffusion.py b/src/diffusers/schedulers/scheduling_vq_diffusion.py index 03ba95cad68b..bd8d255fa901 100644 --- a/src/diffusers/schedulers/scheduling_vq_diffusion.py +++ b/src/diffusers/schedulers/scheduling_vq_diffusion.py @@ -38,7 +38,7 @@ class VQDiffusionSchedulerOutput(BaseOutput): prev_sample: torch.LongTensor -def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.FloatTensor: +def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.Tensor: """ Convert batch of vector of class indices into batch of log onehot vectors @@ -50,7 +50,7 @@ def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.FloatTen number of classes to be used for the onehot vectors Returns: - `torch.FloatTensor` of shape `(batch size, num classes, vector length)`: + `torch.Tensor` of shape `(batch size, num classes, vector length)`: Log onehot vectors """ x_onehot = F.one_hot(x, num_classes) @@ -59,7 +59,7 @@ def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.FloatTen return log_x -def gumbel_noised(logits: torch.FloatTensor, generator: Optional[torch.Generator]) -> torch.FloatTensor: +def gumbel_noised(logits: torch.Tensor, generator: Optional[torch.Generator]) -> torch.Tensor: """ Apply gumbel noise to `logits` """ @@ -199,7 +199,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic def step( self, - model_output: torch.FloatTensor, + model_output: torch.Tensor, timestep: torch.long, sample: torch.LongTensor, generator: Optional[torch.Generator] = None, @@ -210,7 +210,7 @@ def step( [`~VQDiffusionScheduler.q_posterior`] for more details about how the distribution is computer. Args: - log_p_x_0: (`torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`): + log_p_x_0: (`torch.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`): The log probabilities for the predicted classes of the initial latent pixels. Does not include a prediction for the masked class as the initial unnoised image cannot be masked. t (`torch.long`): @@ -251,7 +251,7 @@ def q_posterior(self, log_p_x_0, x_t, t): ``` Args: - log_p_x_0 (`torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`): + log_p_x_0 (`torch.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`): The log probabilities for the predicted classes of the initial latent pixels. Does not include a prediction for the masked class as the initial unnoised image cannot be masked. x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`): @@ -260,7 +260,7 @@ def q_posterior(self, log_p_x_0, x_t, t): The timestep that determines which transition matrix is used. Returns: - `torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`: + `torch.Tensor` of shape `(batch size, num classes, num latent pixels)`: The log probabilities for the predicted classes of the image at timestep `t-1`. """ log_onehot_x_t = index_to_log_onehot(x_t, self.num_embed) @@ -354,7 +354,7 @@ def q_posterior(self, log_p_x_0, x_t, t): return log_p_x_t_min_1 def log_Q_t_transitioning_to_known_class( - self, *, t: torch.int, x_t: torch.LongTensor, log_onehot_x_t: torch.FloatTensor, cumulative: bool + self, *, t: torch.int, x_t: torch.LongTensor, log_onehot_x_t: torch.Tensor, cumulative: bool ): """ Calculates the log probabilities of the rows from the (cumulative or non-cumulative) transition matrix for each @@ -365,14 +365,14 @@ def log_Q_t_transitioning_to_known_class( The timestep that determines which transition matrix is used. x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`): The classes of each latent pixel at time `t`. - log_onehot_x_t (`torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`): + log_onehot_x_t (`torch.Tensor` of shape `(batch size, num classes, num latent pixels)`): The log one-hot vectors of `x_t`. cumulative (`bool`): If cumulative is `False`, the single step transition matrix `t-1`->`t` is used. If cumulative is `True`, the cumulative transition matrix `0`->`t` is used. Returns: - `torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`: + `torch.Tensor` of shape `(batch size, num classes - 1, num latent pixels)`: Each _column_ of the returned matrix is a _row_ of log probabilities of the complete probability transition matrix. diff --git a/tests/others/test_check_copies.py b/tests/others/test_check_copies.py index 6e1c8fcfa54b..5835712343c7 100644 --- a/tests/others/test_check_copies.py +++ b/tests/others/test_check_copies.py @@ -32,16 +32,16 @@ Output class for the scheduler's `step` function output. Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): The predicted denoised sample `(x_{0})` based on the model output from the current timestep. `pred_original_sample` can be used to preview progress or for guidance. \""" - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None """ diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index a0fc8c5ef0c0..5cb22bd3c813 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -1041,7 +1041,7 @@ def test_stable_diffusion_fp16_vs_autocast(self): def test_stable_diffusion_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 63534118343a..d691565248bc 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -472,7 +472,7 @@ def test_stable_diffusion_img2img_ddim(self): def test_stable_diffusion_img2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index a85ea9c2605d..5da8669215b2 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -353,7 +353,7 @@ def test_stable_diffusion_pix2pix_ddim(self): def test_stable_diffusion_pix2pix_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 63e1cb30e203..494036482a77 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -416,7 +416,7 @@ def test_stable_diffusion_attention_slicing(self): def test_stable_diffusion_text2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index 97dc88cd3d26..c260d565a829 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -461,7 +461,7 @@ def test_stable_diffusion_depth2img_pipeline_ddim(self): def test_stable_diffusion_depth2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py index d1efd8202b98..c3b1b9b854af 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py @@ -475,7 +475,7 @@ def test_download_local(self): def test_stable_diffusion_text2img_intermediate_state_v_pred(self): number_of_steps = 0 - def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def test_callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: test_callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py index 99cd8b2e7d1b..6ddb562aacf2 100644 --- a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py @@ -213,7 +213,7 @@ def test_stable_diffusion_img_variation_pipeline_default(self): def test_stable_diffusion_img_variation_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 diff --git a/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py index 35a654542023..4e36dab5ac6c 100644 --- a/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py @@ -349,7 +349,7 @@ def test_stable_diffusion_panorama_k_lms(self): def test_stable_diffusion_panorama_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1