From 1f6b84a5b33c741efd48f5493f50a2996ca38fc7 Mon Sep 17 00:00:00 2001 From: DannHuang Date: Sun, 14 Apr 2024 16:52:45 +0800 Subject: [PATCH 01/13] enable control ip-adapter per-transformer block on-the-fly --- src/diffusers/loaders/ip_adapter.py | 55 +++++++ src/diffusers/loaders/ip_adapter_utils.py | 144 ++++++++++++++++++ src/diffusers/models/attention_processor.py | 158 ++++++++++---------- 3 files changed, 280 insertions(+), 77 deletions(-) create mode 100644 src/diffusers/loaders/ip_adapter_utils.py diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index c531d5a519f2..b6e7c59fd24e 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -27,6 +27,7 @@ is_transformers_available, logging, ) +from .ip_adapter_utils import _maybe_expand_ip_scales if is_transformers_available(): @@ -249,6 +250,60 @@ def set_ip_adapter_scale(self, scale): f"Expected {len(attn_processor.scale)} but got {len(scale)}." ) attn_processor.scale = scale + + + def activate_ip_adapter(self, scale_config: Union[float, Dict]): + """ + Activate IP-Adapter per-transformer block. + + Example: + + ```py + # To use original IP-Adapter + scale_config = 1.0 + pipeline.activate_ip_adapter(scale_config) + + # To use style block only + scale_config = { + "up": { + "block_0": [0.0, 1.0, 0.0] + }, + } + pipeline.activate_ip_adapter(scale_config) + + # To use style+layout blocks + scale_config = { + "down": { + "block_2": [0.0, 1.0] + }, + "up": { + "block_0": [0.0, 1.0, 0.0] + }, + } + pipeline.activate_ip_adapter(scale_config) + ``` + """ + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + scale_config = _maybe_expand_ip_scales(unet, scale_config) + for attn_processor in unet.attn_processors.values(): + # set all to default: skip=False and scale=1.0 + if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): + attn_processor.skip = True + attn_processor.scale = [0.0] * len(attn_processor.scale) + + for attn_name, attn_processor in unet.attn_processors.items(): + if not isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): continue + for key, scale in scale_config.items(): + if attn_name.startswith(key): + attn_processor.skip = True if scale==0.0 else False + _scale = [scale] * len(attn_processor.scale) + if len(attn_processor.scale) != len(_scale): + raise ValueError( + f"`scale` should be a list of same length as the number if ip-adapters " + f"Expected {len(attn_processor.scale)} but got {len(_scale)}." + ) + attn_processor.scale = _scale + def unload_ip_adapter(self): """ diff --git a/src/diffusers/loaders/ip_adapter_utils.py b/src/diffusers/loaders/ip_adapter_utils.py new file mode 100644 index 000000000000..45f1bb9b3130 --- /dev/null +++ b/src/diffusers/loaders/ip_adapter_utils.py @@ -0,0 +1,144 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +from typing import TYPE_CHECKING, Dict, List, Union + +from ..utils import logging + + +if TYPE_CHECKING: + # import here to avoid circular imports + from ..models import UNet2DConditionModel + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def _translate_into_actual_layer_name(name): + """Translate user-friendly name (e.g. 'mid') into actual layer name (e.g. 'mid_block.attentions.0')""" + if name == "mid": + return "mid_block.attentions.0" + + updown, block, attn = name.split(".") + + updown = updown.replace("down", "down_blocks").replace("up", "up_blocks") + block = block.replace("block_", "") + attn = "attentions." + attn + + return ".".join((updown, block, attn)) + + +def _maybe_expand_ip_scales(unet: "UNet2DConditionModel", weight_scale: List[Union[float, Dict]]): + blocks_with_transformer = { + "down": [i for i, block in enumerate(unet.down_blocks) if hasattr(block, "attentions")], + "up": [i for i, block in enumerate(unet.up_blocks) if hasattr(block, "attentions")], + } + transformer_per_block = {"down": unet.config.layers_per_block, "up": unet.config.layers_per_block + 1} + + expanded_weight_scales = _maybe_expand_scales_for_one_ip_adapter( + weight_scale, blocks_with_transformer, transformer_per_block, unet.state_dict() + ) + + return expanded_weight_scales + + +def _maybe_expand_scales_for_one_ip_adapter( + scales: Union[float, Dict], + blocks_with_transformer: Dict[str, int], + transformer_per_block: Dict[str, int], + state_dict: None, +): + """ + Expands the inputs into a more granular dictionary. See the example below for more details. + + Parameters: + scales (`Union[float, Dict]`): + Scales dict to expand. + blocks_with_transformer (`Dict[str, int]`): + Dict with keys 'up' and 'down', showing which blocks have transformer layers + transformer_per_block (`Dict[str, int]`): + Dict with keys 'up' and 'down', showing how many transformer layers each block has + + E.g. 
turns + ```python + scales = {"down": 2, "mid": 3, "up": {"block_0": 4, "block_1": [5, 6, 7]}} + blocks_with_transformer = {"down": [1, 2], "up": [0, 1]} + transformer_per_block = {"down": 2, "up": 3} + ``` + into + ```python + { + "down.block_1.0": 2, + "down.block_1.1": 2, + "down.block_2.0": 2, + "down.block_2.1": 2, + "mid": 3, + "up.block_0.0": 4, + "up.block_0.1": 4, + "up.block_0.2": 4, + "up.block_1.0": 5, + "up.block_1.1": 6, + "up.block_1.2": 7, + } + ``` + """ + if sorted(blocks_with_transformer.keys()) != ["down", "up"]: + raise ValueError("blocks_with_transformer needs to be a dict with keys `'down' and `'up'`") + + if sorted(transformer_per_block.keys()) != ["down", "up"]: + raise ValueError("transformer_per_block needs to be a dict with keys `'down' and `'up'`") + + if not isinstance(scales, dict): + scales = {"down":scales, "mid":scales, "up":scales} + + scales = copy.deepcopy(scales) + + # defualt scale is 0 to skip + if "mid" not in scales: + scales["mid"] = 0 + + for updown in ["up", "down"]: + if updown not in scales: + scales[updown] = 0 + + # eg {"down": 1} to {"down": {"block_1": 1, "block_2": 1}}}, same scale for all blocks + if not isinstance(scales[updown], dict): + scales[updown] = {f"block_{i}": scales[updown] for i in blocks_with_transformer[updown]} + + # eg {"down": "block_1": 1}} to {"down": "block_1": [1, 1]}}, same scale for all transformers in the block + for i in blocks_with_transformer[updown]: + block = f"block_{i}" + # set not specified blocks to default 0 + if block not in scales[updown]: + scales[updown][block] = 0 + if not isinstance(scales[updown][block], list): + scales[updown][block] = [scales[updown][block] for _ in range(transformer_per_block[updown])] + else: + assert len(scales[updown][block]) == transformer_per_block[updown], \ + f"Expected {transformer_per_block[updown]} scales for {updown}.{block}, got {len(scales[updown][block])}." + + # eg {"down": "block_1": [1, 1]}} to {"down.block_1.0": 1, "down.block_1.1": 1} + for i in blocks_with_transformer[updown]: + block = f"block_{i}" + for tf_idx, value in enumerate(scales[updown][block]): + scales[f"{updown}.{block}.{tf_idx}"] = value + + del scales[updown] + + for layer in scales.keys(): + if not any(_translate_into_actual_layer_name(layer) in module for module in state_dict.keys()): + raise ValueError( + f"Can't set ip scale for layer {layer}. It either doesn't exist in this unet or it has no attentions." + ) + + return {_translate_into_actual_layer_name(name): weight for name, weight in scales.items()} \ No newline at end of file diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 237e8236caf4..e4230546fb5d 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -2108,7 +2108,7 @@ class IPAdapterAttnProcessor(nn.Module): the weight scale of image prompt. 
""" - def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale=1.0): + def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale=1.0, skip=False): super().__init__() self.hidden_size = hidden_size @@ -2117,6 +2117,7 @@ def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale if not isinstance(num_tokens, (tuple, list)): num_tokens = [num_tokens] self.num_tokens = num_tokens + self.skip = skip if not isinstance(scale, list): scale = [scale] * len(num_tokens) @@ -2226,47 +2227,48 @@ def __call__( ip_adapter_masks = [None] * len(self.scale) # for ip-adapter - for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip( - ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks - ): - if mask is not None: - if not isinstance(scale, list): - scale = [scale] + if not self.skip: + for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip( + ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks + ): + if mask is not None: + if not isinstance(scale, list): + scale = [scale] + + current_num_images = mask.shape[1] + for i in range(current_num_images): + ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :]) + ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :]) + + ip_key = attn.head_to_batch_dim(ip_key) + ip_value = attn.head_to_batch_dim(ip_value) + + ip_attention_probs = attn.get_attention_scores(query, ip_key, None) + _current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value) + _current_ip_hidden_states = attn.batch_to_head_dim(_current_ip_hidden_states) + + mask_downsample = IPAdapterMaskProcessor.downsample( + mask[:, i, :, :], + batch_size, + _current_ip_hidden_states.shape[1], + _current_ip_hidden_states.shape[2], + ) + + mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device) - current_num_images = mask.shape[1] - for i in range(current_num_images): - ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :]) - ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :]) + hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample) + else: + ip_key = to_k_ip(current_ip_hidden_states) + ip_value = to_v_ip(current_ip_hidden_states) ip_key = attn.head_to_batch_dim(ip_key) ip_value = attn.head_to_batch_dim(ip_value) ip_attention_probs = attn.get_attention_scores(query, ip_key, None) - _current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value) - _current_ip_hidden_states = attn.batch_to_head_dim(_current_ip_hidden_states) - - mask_downsample = IPAdapterMaskProcessor.downsample( - mask[:, i, :, :], - batch_size, - _current_ip_hidden_states.shape[1], - _current_ip_hidden_states.shape[2], - ) - - mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device) - - hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample) - else: - ip_key = to_k_ip(current_ip_hidden_states) - ip_value = to_v_ip(current_ip_hidden_states) - - ip_key = attn.head_to_batch_dim(ip_key) - ip_value = attn.head_to_batch_dim(ip_value) - - ip_attention_probs = attn.get_attention_scores(query, ip_key, None) - current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value) - current_ip_hidden_states = attn.batch_to_head_dim(current_ip_hidden_states) + current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value) + current_ip_hidden_states = attn.batch_to_head_dim(current_ip_hidden_states) - hidden_states = hidden_states + scale * current_ip_hidden_states + hidden_states = hidden_states + scale 
* current_ip_hidden_states # linear proj hidden_states = attn.to_out[0](hidden_states) @@ -2299,7 +2301,7 @@ class IPAdapterAttnProcessor2_0(torch.nn.Module): the weight scale of image prompt. """ - def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale=1.0): + def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale=1.0, skip=False): super().__init__() if not hasattr(F, "scaled_dot_product_attention"): @@ -2319,6 +2321,7 @@ def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale if len(scale) != len(num_tokens): raise ValueError("`scale` should be a list of integers with the same length as `num_tokens`.") self.scale = scale + self.skip = skip self.to_k_ip = nn.ModuleList( [nn.Linear(cross_attention_dim, hidden_size, bias=False) for _ in range(len(num_tokens))] @@ -2436,60 +2439,61 @@ def __call__( ip_adapter_masks = [None] * len(self.scale) # for ip-adapter - for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip( - ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks - ): - if mask is not None: - if not isinstance(scale, list): - scale = [scale] + if not self.skip: + for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip( + ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks + ): + if mask is not None: + if not isinstance(scale, list): + scale = [scale] + + current_num_images = mask.shape[1] + for i in range(current_num_images): + ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :]) + ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :]) + + ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + _current_ip_hidden_states = F.scaled_dot_product_attention( + query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False + ) + + _current_ip_hidden_states = _current_ip_hidden_states.transpose(1, 2).reshape( + batch_size, -1, attn.heads * head_dim + ) + _current_ip_hidden_states = _current_ip_hidden_states.to(query.dtype) + + mask_downsample = IPAdapterMaskProcessor.downsample( + mask[:, i, :, :], + batch_size, + _current_ip_hidden_states.shape[1], + _current_ip_hidden_states.shape[2], + ) - current_num_images = mask.shape[1] - for i in range(current_num_images): - ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :]) - ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :]) + mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device) + hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample) + else: + ip_key = to_k_ip(current_ip_hidden_states) + ip_value = to_v_ip(current_ip_hidden_states) ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) # the output of sdp = (batch, num_heads, seq_len, head_dim) # TODO: add support for attn.scale when we move to Torch 2.1 - _current_ip_hidden_states = F.scaled_dot_product_attention( + current_ip_hidden_states = F.scaled_dot_product_attention( query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False ) - _current_ip_hidden_states = _current_ip_hidden_states.transpose(1, 2).reshape( + current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape( batch_size, -1, attn.heads * head_dim ) - 
_current_ip_hidden_states = _current_ip_hidden_states.to(query.dtype) - - mask_downsample = IPAdapterMaskProcessor.downsample( - mask[:, i, :, :], - batch_size, - _current_ip_hidden_states.shape[1], - _current_ip_hidden_states.shape[2], - ) - - mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device) - hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample) - else: - ip_key = to_k_ip(current_ip_hidden_states) - ip_value = to_v_ip(current_ip_hidden_states) - - ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - # the output of sdp = (batch, num_heads, seq_len, head_dim) - # TODO: add support for attn.scale when we move to Torch 2.1 - current_ip_hidden_states = F.scaled_dot_product_attention( - query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False - ) - - current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape( - batch_size, -1, attn.heads * head_dim - ) - current_ip_hidden_states = current_ip_hidden_states.to(query.dtype) + current_ip_hidden_states = current_ip_hidden_states.to(query.dtype) - hidden_states = hidden_states + scale * current_ip_hidden_states + hidden_states = hidden_states + scale * current_ip_hidden_states # linear proj hidden_states = attn.to_out[0](hidden_states) From ef9694c255adeaae2ef55116a81da4ef9b4f1033 Mon Sep 17 00:00:00 2001 From: DannHuang Date: Wed, 17 Apr 2024 15:13:04 +0800 Subject: [PATCH 02/13] merge duplicate functions, enable multi IPA control --- src/diffusers/loaders/ip_adapter.py | 95 ++++++------- src/diffusers/loaders/ip_adapter_utils.py | 144 -------------------- src/diffusers/loaders/unet_loader_utils.py | 17 ++- src/diffusers/models/attention_processor.py | 22 ++- 4 files changed, 66 insertions(+), 212 deletions(-) delete mode 100644 src/diffusers/loaders/ip_adapter_utils.py diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index b6e7c59fd24e..0ae5167d7bcf 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -27,7 +27,7 @@ is_transformers_available, logging, ) -from .ip_adapter_utils import _maybe_expand_ip_scales +from .unet_loader_utils import _maybe_expand_lora_scales if is_transformers_available(): @@ -229,50 +229,28 @@ def load_ip_adapter( unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet unet._load_ip_adapter_weights(state_dicts, low_cpu_mem_usage=low_cpu_mem_usage) - def set_ip_adapter_scale(self, scale): + def set_ip_adapter_scale(self, scale_configs: Union[float, Dict, List[Union[float, Dict]]], default_scale=0.0): """ - Sets the conditioning scale between text and image. - - Example: - - ```py - pipeline.set_ip_adapter_scale(0.5) - ``` - """ - unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - for attn_processor in unet.attn_processors.values(): - if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): - if not isinstance(scale, list): - scale = [scale] * len(attn_processor.scale) - if len(attn_processor.scale) != len(scale): - raise ValueError( - f"`scale` should be a list of same length as the number if ip-adapters " - f"Expected {len(attn_processor.scale)} but got {len(scale)}." - ) - attn_processor.scale = scale - - - def activate_ip_adapter(self, scale_config: Union[float, Dict]): - """ - Activate IP-Adapter per-transformer block. 
+ Set IP-Adapter scales per-transformer block. Input `scale_configs` could be a single config or a list of configs + for granular control over each IP-Adapter behavior. A config can be a float or a dictionary. Example: ```py # To use original IP-Adapter - scale_config = 1.0 - pipeline.activate_ip_adapter(scale_config) + scale_configs = 1.0 + pipeline.set_ip_adapter_scale(scale_configs) # To use style block only - scale_config = { + scale_configs = { "up": { "block_0": [0.0, 1.0, 0.0] }, } - pipeline.activate_ip_adapter(scale_config) + pipeline.set_ip_adapter_scale(scale_configs) # To use style+layout blocks - scale_config = { + scale_configs = { "down": { "block_2": [0.0, 1.0] }, @@ -280,30 +258,45 @@ def activate_ip_adapter(self, scale_config: Union[float, Dict]): "block_0": [0.0, 1.0, 0.0] }, } - pipeline.activate_ip_adapter(scale_config) + pipeline.set_ip_adapter_scale(scale_configs) + + # To use style and layout from 2 reference images + scale_configs = [ + { + "down": { + "block_2": [0.0, 1.0] + } + }, + { + "up": { + "block_0": [0.0, 1.0, 0.0] + } + } + ] + pipeline.set_ip_adapter_scale(scale_configs) ``` """ unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - scale_config = _maybe_expand_ip_scales(unet, scale_config) - for attn_processor in unet.attn_processors.values(): - # set all to default: skip=False and scale=1.0 - if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): - attn_processor.skip = True - attn_processor.scale = [0.0] * len(attn_processor.scale) - - for attn_name, attn_processor in unet.attn_processors.items(): - if not isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): continue - for key, scale in scale_config.items(): - if attn_name.startswith(key): - attn_processor.skip = True if scale==0.0 else False - _scale = [scale] * len(attn_processor.scale) - if len(attn_processor.scale) != len(_scale): - raise ValueError( - f"`scale` should be a list of same length as the number if ip-adapters " - f"Expected {len(attn_processor.scale)} but got {len(_scale)}." - ) - attn_processor.scale = _scale + if not isinstance(scale_configs, list): + scale_configs = [scale_configs] + scale_configs = _maybe_expand_lora_scales(unet, scale_configs, default_scale=default_scale) + for attn_name, attn_processor in unet.attn_processors.items(): + if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): + if len(scale_configs)>1 and len(scale_configs)!=len(attn_processor.scale): + raise ValueError( + f"Cannot assign {len(scale_configs)} scale_configs to " + f"{len(attn_processor.scale)} IP-Adapter." + ) + elif len(scale_configs)==1: + scale_configs = scale_configs * len(attn_processor.scale) + for i, scale_config in enumerate(scale_configs): + if isinstance(scale_config, dict): + for key, scale in scale_config.items(): + if attn_name.startswith(key): + attn_processor.scale[i] = scale + else: + attn_processor.scale[i] = scale_config def unload_ip_adapter(self): """ diff --git a/src/diffusers/loaders/ip_adapter_utils.py b/src/diffusers/loaders/ip_adapter_utils.py deleted file mode 100644 index 45f1bb9b3130..000000000000 --- a/src/diffusers/loaders/ip_adapter_utils.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import copy -from typing import TYPE_CHECKING, Dict, List, Union - -from ..utils import logging - - -if TYPE_CHECKING: - # import here to avoid circular imports - from ..models import UNet2DConditionModel - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - - -def _translate_into_actual_layer_name(name): - """Translate user-friendly name (e.g. 'mid') into actual layer name (e.g. 'mid_block.attentions.0')""" - if name == "mid": - return "mid_block.attentions.0" - - updown, block, attn = name.split(".") - - updown = updown.replace("down", "down_blocks").replace("up", "up_blocks") - block = block.replace("block_", "") - attn = "attentions." + attn - - return ".".join((updown, block, attn)) - - -def _maybe_expand_ip_scales(unet: "UNet2DConditionModel", weight_scale: List[Union[float, Dict]]): - blocks_with_transformer = { - "down": [i for i, block in enumerate(unet.down_blocks) if hasattr(block, "attentions")], - "up": [i for i, block in enumerate(unet.up_blocks) if hasattr(block, "attentions")], - } - transformer_per_block = {"down": unet.config.layers_per_block, "up": unet.config.layers_per_block + 1} - - expanded_weight_scales = _maybe_expand_scales_for_one_ip_adapter( - weight_scale, blocks_with_transformer, transformer_per_block, unet.state_dict() - ) - - return expanded_weight_scales - - -def _maybe_expand_scales_for_one_ip_adapter( - scales: Union[float, Dict], - blocks_with_transformer: Dict[str, int], - transformer_per_block: Dict[str, int], - state_dict: None, -): - """ - Expands the inputs into a more granular dictionary. See the example below for more details. - - Parameters: - scales (`Union[float, Dict]`): - Scales dict to expand. - blocks_with_transformer (`Dict[str, int]`): - Dict with keys 'up' and 'down', showing which blocks have transformer layers - transformer_per_block (`Dict[str, int]`): - Dict with keys 'up' and 'down', showing how many transformer layers each block has - - E.g. 
turns - ```python - scales = {"down": 2, "mid": 3, "up": {"block_0": 4, "block_1": [5, 6, 7]}} - blocks_with_transformer = {"down": [1, 2], "up": [0, 1]} - transformer_per_block = {"down": 2, "up": 3} - ``` - into - ```python - { - "down.block_1.0": 2, - "down.block_1.1": 2, - "down.block_2.0": 2, - "down.block_2.1": 2, - "mid": 3, - "up.block_0.0": 4, - "up.block_0.1": 4, - "up.block_0.2": 4, - "up.block_1.0": 5, - "up.block_1.1": 6, - "up.block_1.2": 7, - } - ``` - """ - if sorted(blocks_with_transformer.keys()) != ["down", "up"]: - raise ValueError("blocks_with_transformer needs to be a dict with keys `'down' and `'up'`") - - if sorted(transformer_per_block.keys()) != ["down", "up"]: - raise ValueError("transformer_per_block needs to be a dict with keys `'down' and `'up'`") - - if not isinstance(scales, dict): - scales = {"down":scales, "mid":scales, "up":scales} - - scales = copy.deepcopy(scales) - - # defualt scale is 0 to skip - if "mid" not in scales: - scales["mid"] = 0 - - for updown in ["up", "down"]: - if updown not in scales: - scales[updown] = 0 - - # eg {"down": 1} to {"down": {"block_1": 1, "block_2": 1}}}, same scale for all blocks - if not isinstance(scales[updown], dict): - scales[updown] = {f"block_{i}": scales[updown] for i in blocks_with_transformer[updown]} - - # eg {"down": "block_1": 1}} to {"down": "block_1": [1, 1]}}, same scale for all transformers in the block - for i in blocks_with_transformer[updown]: - block = f"block_{i}" - # set not specified blocks to default 0 - if block not in scales[updown]: - scales[updown][block] = 0 - if not isinstance(scales[updown][block], list): - scales[updown][block] = [scales[updown][block] for _ in range(transformer_per_block[updown])] - else: - assert len(scales[updown][block]) == transformer_per_block[updown], \ - f"Expected {transformer_per_block[updown]} scales for {updown}.{block}, got {len(scales[updown][block])}." - - # eg {"down": "block_1": [1, 1]}} to {"down.block_1.0": 1, "down.block_1.1": 1} - for i in blocks_with_transformer[updown]: - block = f"block_{i}" - for tf_idx, value in enumerate(scales[updown][block]): - scales[f"{updown}.{block}.{tf_idx}"] = value - - del scales[updown] - - for layer in scales.keys(): - if not any(_translate_into_actual_layer_name(layer) in module for module in state_dict.keys()): - raise ValueError( - f"Can't set ip scale for layer {layer}. It either doesn't exist in this unet or it has no attentions." 
- ) - - return {_translate_into_actual_layer_name(name): weight for name, weight in scales.items()} \ No newline at end of file diff --git a/src/diffusers/loaders/unet_loader_utils.py b/src/diffusers/loaders/unet_loader_utils.py index 3ee4a96fad0a..febeac86141d 100644 --- a/src/diffusers/loaders/unet_loader_utils.py +++ b/src/diffusers/loaders/unet_loader_utils.py @@ -38,7 +38,7 @@ def _translate_into_actual_layer_name(name): return ".".join((updown, block, attn)) -def _maybe_expand_lora_scales(unet: "UNet2DConditionModel", weight_scales: List[Union[float, Dict]]): +def _maybe_expand_lora_scales(unet: "UNet2DConditionModel", weight_scales: List[Union[float, Dict]], default_scale=1.0): blocks_with_transformer = { "down": [i for i, block in enumerate(unet.down_blocks) if hasattr(block, "attentions")], "up": [i for i, block in enumerate(unet.up_blocks) if hasattr(block, "attentions")], @@ -47,7 +47,7 @@ def _maybe_expand_lora_scales(unet: "UNet2DConditionModel", weight_scales: List[ expanded_weight_scales = [ _maybe_expand_lora_scales_for_one_adapter( - weight_for_adapter, blocks_with_transformer, transformer_per_block, unet.state_dict() + weight_for_adapter, blocks_with_transformer, transformer_per_block, unet.state_dict(), default_scale=default_scale ) for weight_for_adapter in weight_scales ] @@ -60,6 +60,7 @@ def _maybe_expand_lora_scales_for_one_adapter( blocks_with_transformer: Dict[str, int], transformer_per_block: Dict[str, int], state_dict: None, + default_scale: float=1.0, ): """ Expands the inputs into a more granular dictionary. See the example below for more details. @@ -108,21 +109,27 @@ def _maybe_expand_lora_scales_for_one_adapter( scales = copy.deepcopy(scales) if "mid" not in scales: - scales["mid"] = 1 + scales["mid"] = default_scale for updown in ["up", "down"]: if updown not in scales: - scales[updown] = 1 + scales[updown] = default_scale # eg {"down": 1} to {"down": {"block_1": 1, "block_2": 1}}} if not isinstance(scales[updown], dict): scales[updown] = {f"block_{i}": scales[updown] for i in blocks_with_transformer[updown]} - # eg {"down": "block_1": 1}} to {"down": "block_1": [1, 1]}} + # eg {"down": {"block_1": 1}} to {"down": {"block_1": [1, 1]}} for i in blocks_with_transformer[updown]: block = f"block_{i}" + # set not assigned blocks to default scale + if block not in scales[updown]: + scales[updown][block] = default_scale if not isinstance(scales[updown][block], list): scales[updown][block] = [scales[updown][block] for _ in range(transformer_per_block[updown])] + else: + assert len(scales[updown][block]) == transformer_per_block[updown], \ + f"Expected {transformer_per_block[updown]} scales for {updown}.{block}, got {len(scales[updown][block])}." # eg {"down": "block_1": [1, 1]}} to {"down.block_1.0": 1, "down.block_1.1": 1} for i in blocks_with_transformer[updown]: diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index e4230546fb5d..e0a8f99e5f5f 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -2108,7 +2108,7 @@ class IPAdapterAttnProcessor(nn.Module): the weight scale of image prompt. 
""" - def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale=1.0, skip=False): + def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale=1.0): super().__init__() self.hidden_size = hidden_size @@ -2117,7 +2117,6 @@ def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale if not isinstance(num_tokens, (tuple, list)): num_tokens = [num_tokens] self.num_tokens = num_tokens - self.skip = skip if not isinstance(scale, list): scale = [scale] * len(num_tokens) @@ -2227,10 +2226,10 @@ def __call__( ip_adapter_masks = [None] * len(self.scale) # for ip-adapter - if not self.skip: - for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip( - ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks - ): + for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip( + ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks + ): + if scale > 0: if mask is not None: if not isinstance(scale, list): scale = [scale] @@ -2301,7 +2300,7 @@ class IPAdapterAttnProcessor2_0(torch.nn.Module): the weight scale of image prompt. """ - def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale=1.0, skip=False): + def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale=1.0): super().__init__() if not hasattr(F, "scaled_dot_product_attention"): @@ -2321,7 +2320,6 @@ def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=(4,), scale if len(scale) != len(num_tokens): raise ValueError("`scale` should be a list of integers with the same length as `num_tokens`.") self.scale = scale - self.skip = skip self.to_k_ip = nn.ModuleList( [nn.Linear(cross_attention_dim, hidden_size, bias=False) for _ in range(len(num_tokens))] @@ -2439,10 +2437,10 @@ def __call__( ip_adapter_masks = [None] * len(self.scale) # for ip-adapter - if not self.skip: - for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip( - ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks - ): + for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip( + ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks + ): + if scale > 0: if mask is not None: if not isinstance(scale, list): scale = [scale] From 24224a06f0f5343f023c4b29169fe070fedf1d53 Mon Sep 17 00:00:00 2001 From: DannHuang Date: Thu, 18 Apr 2024 08:42:40 +0800 Subject: [PATCH 03/13] adapt to the repo's user warning convention --- src/diffusers/loaders/ip_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index 0ae5167d7bcf..06171ee2f631 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -283,7 +283,7 @@ def set_ip_adapter_scale(self, scale_configs: Union[float, Dict, List[Union[floa for attn_name, attn_processor in unet.attn_processors.items(): if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): - if len(scale_configs)>1 and len(scale_configs)!=len(attn_processor.scale): + if len(scale_configs) != len(attn_processor.scale): raise ValueError( f"Cannot assign {len(scale_configs)} scale_configs to " f"{len(attn_processor.scale)} IP-Adapter." 
From 9d09a56f42d5cb08c0e775bfbc2220a328ab2685 Mon Sep 17 00:00:00 2001 From: DannHuang Date: Thu, 18 Apr 2024 10:54:09 +0800 Subject: [PATCH 04/13] make quality --- .../train_dreambooth_lora_sd15_advanced.py | 6 ++-- .../train_dreambooth_lora_sdxl_advanced.py | 6 ++-- .../textual_inversion.py | 6 ++-- .../textual_inversion/textual_inversion.py | 6 ++-- .../textual_inversion/textual_inversion.py | 6 ++-- .../textual_inversion_sdxl.py | 6 ++-- scripts/convert_svd_to_diffusers.py | 12 +++---- src/diffusers/loaders/ip_adapter.py | 31 +++++-------------- src/diffusers/loaders/lora.py | 6 ++-- .../loaders/lora_conversion_utils.py | 18 +++++------ src/diffusers/loaders/unet_loader_utils.py | 17 +++++++--- tests/models/autoencoders/test_models_vae.py | 6 ++-- tests/pipelines/amused/test_amused.py | 3 +- tests/pipelines/amused/test_amused_img2img.py | 3 +- tests/pipelines/amused/test_amused_inpaint.py | 3 +- utils/update_metadata.py | 1 + 16 files changed, 61 insertions(+), 75 deletions(-) diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py index 6cdf2e7b21ab..4c6ab506fe91 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py @@ -744,9 +744,9 @@ def initialize_new_tokens(self, inserting_toks: List[str]): .to(dtype=self.dtype) * std_token_embedding ) - self.embeddings_settings[ - f"original_embeddings_{idx}" - ] = text_encoder.text_model.embeddings.token_embedding.weight.data.clone() + self.embeddings_settings[f"original_embeddings_{idx}"] = ( + text_encoder.text_model.embeddings.token_embedding.weight.data.clone() + ) self.embeddings_settings[f"std_token_embedding_{idx}"] = std_token_embedding inu = torch.ones((len(tokenizer),), dtype=torch.bool) diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py index 21a84b77245a..d6a63f91939d 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py @@ -776,9 +776,9 @@ def initialize_new_tokens(self, inserting_toks: List[str]): .to(dtype=self.dtype) * std_token_embedding ) - self.embeddings_settings[ - f"original_embeddings_{idx}" - ] = text_encoder.text_model.embeddings.token_embedding.weight.data.clone() + self.embeddings_settings[f"original_embeddings_{idx}"] = ( + text_encoder.text_model.embeddings.token_embedding.weight.data.clone() + ) self.embeddings_settings[f"std_token_embedding_{idx}"] = std_token_embedding inu = torch.ones((len(tokenizer),), dtype=torch.bool) diff --git a/examples/research_projects/multi_token_textual_inversion/textual_inversion.py b/examples/research_projects/multi_token_textual_inversion/textual_inversion.py index 57ad77477b0d..7aad64ecb1dd 100644 --- a/examples/research_projects/multi_token_textual_inversion/textual_inversion.py +++ b/examples/research_projects/multi_token_textual_inversion/textual_inversion.py @@ -830,9 +830,9 @@ def main(): # Let's make sure we don't update any embedding weights besides the newly added token index_no_updates = get_mask(tokenizer, accelerator) with torch.no_grad(): - accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[ - index_no_updates - ] = orig_embeds_params[index_no_updates] + 
accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[index_no_updates] = ( + orig_embeds_params[index_no_updates] + ) # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: diff --git a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py index e10564fa59ef..5f0710e85319 100644 --- a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py +++ b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py @@ -886,9 +886,9 @@ def main(): index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False with torch.no_grad(): - accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[ - index_no_updates - ] = orig_embeds_params[index_no_updates] + accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[index_no_updates] = ( + orig_embeds_params[index_no_updates] + ) # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index 4922789862b5..3ae1e85723ee 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -910,9 +910,9 @@ def main(): index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False with torch.no_grad(): - accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[ - index_no_updates - ] = orig_embeds_params[index_no_updates] + accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[index_no_updates] = ( + orig_embeds_params[index_no_updates] + ) # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: diff --git a/examples/textual_inversion/textual_inversion_sdxl.py b/examples/textual_inversion/textual_inversion_sdxl.py index c24a4c4f4855..cc020499be8e 100644 --- a/examples/textual_inversion/textual_inversion_sdxl.py +++ b/examples/textual_inversion/textual_inversion_sdxl.py @@ -940,9 +940,9 @@ def main(): index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False with torch.no_grad(): - accelerator.unwrap_model(text_encoder_1).get_input_embeddings().weight[ - index_no_updates - ] = orig_embeds_params[index_no_updates] + accelerator.unwrap_model(text_encoder_1).get_input_embeddings().weight[index_no_updates] = ( + orig_embeds_params[index_no_updates] + ) # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: diff --git a/scripts/convert_svd_to_diffusers.py b/scripts/convert_svd_to_diffusers.py index 3243ce294b26..e46410ccb3bd 100644 --- a/scripts/convert_svd_to_diffusers.py +++ b/scripts/convert_svd_to_diffusers.py @@ -381,9 +381,9 @@ def convert_ldm_unet_checkpoint( # TODO resnet time_mixer.mix_factor if f"input_blocks.{i}.0.time_mixer.mix_factor" in unet_state_dict: - new_checkpoint[ - f"down_blocks.{block_id}.resnets.{layer_in_block_id}.time_mixer.mix_factor" - ] = unet_state_dict[f"input_blocks.{i}.0.time_mixer.mix_factor"] + new_checkpoint[f"down_blocks.{block_id}.resnets.{layer_in_block_id}.time_mixer.mix_factor"] = ( + unet_state_dict[f"input_blocks.{i}.0.time_mixer.mix_factor"] + ) if len(attentions): paths = renew_attention_paths(attentions) @@ -478,9 +478,9 @@ def convert_ldm_unet_checkpoint( ) if 
f"output_blocks.{i}.0.time_mixer.mix_factor" in unet_state_dict: - new_checkpoint[ - f"up_blocks.{block_id}.resnets.{layer_in_block_id}.time_mixer.mix_factor" - ] = unet_state_dict[f"output_blocks.{i}.0.time_mixer.mix_factor"] + new_checkpoint[f"up_blocks.{block_id}.resnets.{layer_in_block_id}.time_mixer.mix_factor"] = ( + unet_state_dict[f"output_blocks.{i}.0.time_mixer.mix_factor"] + ) output_block_list = {k: sorted(v) for k, v in output_block_list.items()} if ["conv.bias", "conv.weight"] in output_block_list.values(): diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index 06171ee2f631..09622f615b4a 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -231,8 +231,8 @@ def load_ip_adapter( def set_ip_adapter_scale(self, scale_configs: Union[float, Dict, List[Union[float, Dict]]], default_scale=0.0): """ - Set IP-Adapter scales per-transformer block. Input `scale_configs` could be a single config or a list of configs - for granular control over each IP-Adapter behavior. A config can be a float or a dictionary. + Set IP-Adapter scales per-transformer block. Input `scale_configs` could be a single config or a list of + configs for granular control over each IP-Adapter behavior. A config can be a float or a dictionary. Example: @@ -243,36 +243,19 @@ def set_ip_adapter_scale(self, scale_configs: Union[float, Dict, List[Union[floa # To use style block only scale_configs = { - "up": { - "block_0": [0.0, 1.0, 0.0] - }, + "up": {"block_0": [0.0, 1.0, 0.0]}, } pipeline.set_ip_adapter_scale(scale_configs) # To use style+layout blocks scale_configs = { - "down": { - "block_2": [0.0, 1.0] - }, - "up": { - "block_0": [0.0, 1.0, 0.0] - }, + "down": {"block_2": [0.0, 1.0]}, + "up": {"block_0": [0.0, 1.0, 0.0]}, } pipeline.set_ip_adapter_scale(scale_configs) # To use style and layout from 2 reference images - scale_configs = [ - { - "down": { - "block_2": [0.0, 1.0] - } - }, - { - "up": { - "block_0": [0.0, 1.0, 0.0] - } - } - ] + scale_configs = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}] pipeline.set_ip_adapter_scale(scale_configs) ``` """ @@ -288,7 +271,7 @@ def set_ip_adapter_scale(self, scale_configs: Union[float, Dict, List[Union[floa f"Cannot assign {len(scale_configs)} scale_configs to " f"{len(attn_processor.scale)} IP-Adapter." 
) - elif len(scale_configs)==1: + elif len(scale_configs) == 1: scale_configs = scale_configs * len(attn_processor.scale) for i, scale_config in enumerate(scale_configs): if isinstance(scale_config, dict): diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py index 5d89658830f1..48eabbb541ab 100644 --- a/src/diffusers/loaders/lora.py +++ b/src/diffusers/loaders/lora.py @@ -1288,9 +1288,9 @@ def set_lora_device(self, adapter_names: List[str], device: Union[torch.device, text_encoder_module.lora_A[adapter_name].to(device) text_encoder_module.lora_B[adapter_name].to(device) # this is a param, not a module, so device placement is not in-place -> re-assign - text_encoder_module.lora_magnitude_vector[ - adapter_name - ] = text_encoder_module.lora_magnitude_vector[adapter_name].to(device) + text_encoder_module.lora_magnitude_vector[adapter_name] = ( + text_encoder_module.lora_magnitude_vector[adapter_name].to(device) + ) class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin): diff --git a/src/diffusers/loaders/lora_conversion_utils.py b/src/diffusers/loaders/lora_conversion_utils.py index 11e3311a6402..e4877d495970 100644 --- a/src/diffusers/loaders/lora_conversion_utils.py +++ b/src/diffusers/loaders/lora_conversion_utils.py @@ -209,9 +209,9 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_ if is_unet_dora_lora: dora_scale_key_to_replace = "_lora.down." if "_lora.down." in diffusers_name else ".lora.down." - unet_state_dict[ - diffusers_name.replace(dora_scale_key_to_replace, ".lora_magnitude_vector.") - ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale")) + unet_state_dict[diffusers_name.replace(dora_scale_key_to_replace, ".lora_magnitude_vector.")] = ( + state_dict.pop(key.replace("lora_down.weight", "dora_scale")) + ) elif lora_name.startswith(("lora_te_", "lora_te1_", "lora_te2_")): if lora_name.startswith(("lora_te_", "lora_te1_")): @@ -249,13 +249,13 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_ "_lora.down." if "_lora.down." in diffusers_name else ".lora_linear_layer." ) if lora_name.startswith(("lora_te_", "lora_te1_")): - te_state_dict[ - diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.") - ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale")) + te_state_dict[diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.")] = ( + state_dict.pop(key.replace("lora_down.weight", "dora_scale")) + ) elif lora_name.startswith("lora_te2_"): - te2_state_dict[ - diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.") - ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale")) + te2_state_dict[diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.")] = ( + state_dict.pop(key.replace("lora_down.weight", "dora_scale")) + ) # Rename the alphas so that they can be mapped appropriately. 
if lora_name_alpha in state_dict: diff --git a/src/diffusers/loaders/unet_loader_utils.py b/src/diffusers/loaders/unet_loader_utils.py index febeac86141d..598cf33563d2 100644 --- a/src/diffusers/loaders/unet_loader_utils.py +++ b/src/diffusers/loaders/unet_loader_utils.py @@ -38,7 +38,9 @@ def _translate_into_actual_layer_name(name): return ".".join((updown, block, attn)) -def _maybe_expand_lora_scales(unet: "UNet2DConditionModel", weight_scales: List[Union[float, Dict]], default_scale=1.0): +def _maybe_expand_lora_scales( + unet: "UNet2DConditionModel", weight_scales: List[Union[float, Dict]], default_scale=1.0 +): blocks_with_transformer = { "down": [i for i, block in enumerate(unet.down_blocks) if hasattr(block, "attentions")], "up": [i for i, block in enumerate(unet.up_blocks) if hasattr(block, "attentions")], @@ -47,7 +49,11 @@ def _maybe_expand_lora_scales(unet: "UNet2DConditionModel", weight_scales: List[ expanded_weight_scales = [ _maybe_expand_lora_scales_for_one_adapter( - weight_for_adapter, blocks_with_transformer, transformer_per_block, unet.state_dict(), default_scale=default_scale + weight_for_adapter, + blocks_with_transformer, + transformer_per_block, + unet.state_dict(), + default_scale=default_scale, ) for weight_for_adapter in weight_scales ] @@ -60,7 +66,7 @@ def _maybe_expand_lora_scales_for_one_adapter( blocks_with_transformer: Dict[str, int], transformer_per_block: Dict[str, int], state_dict: None, - default_scale: float=1.0, + default_scale: float = 1.0, ): """ Expands the inputs into a more granular dictionary. See the example below for more details. @@ -128,8 +134,9 @@ def _maybe_expand_lora_scales_for_one_adapter( if not isinstance(scales[updown][block], list): scales[updown][block] = [scales[updown][block] for _ in range(transformer_per_block[updown])] else: - assert len(scales[updown][block]) == transformer_per_block[updown], \ - f"Expected {transformer_per_block[updown]} scales for {updown}.{block}, got {len(scales[updown][block])}." + assert ( + len(scales[updown][block]) == transformer_per_block[updown] + ), f"Expected {transformer_per_block[updown]} scales for {updown}.{block}, got {len(scales[updown][block])}." # eg {"down": "block_1": [1, 1]}} to {"down.block_1.0": 1, "down.block_1.1": 1} for i in blocks_with_transformer[updown]: diff --git a/tests/models/autoencoders/test_models_vae.py b/tests/models/autoencoders/test_models_vae.py index 026e01f0ed6a..56947609ed7d 100644 --- a/tests/models/autoencoders/test_models_vae.py +++ b/tests/models/autoencoders/test_models_vae.py @@ -384,12 +384,10 @@ def prepare_init_args_and_inputs_for_common(self): return self.init_dict, self.inputs_dict() @unittest.skip - def test_training(self): - ... + def test_training(self): ... @unittest.skip - def test_ema_training(self): - ... + def test_ema_training(self): ... class AutoencoderKLTemporalDecoderFastTests(ModelTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/amused/test_amused.py b/tests/pipelines/amused/test_amused.py index f03751e2f830..ed03fef2b0cd 100644 --- a/tests/pipelines/amused/test_amused.py +++ b/tests/pipelines/amused/test_amused.py @@ -125,8 +125,7 @@ def test_inference_batch_consistent(self, batch_sizes=[2]): self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False) @unittest.skip("aMUSEd does not support lists of generators") - def test_inference_batch_single_identical(self): - ... + def test_inference_batch_single_identical(self): ... 
@slow diff --git a/tests/pipelines/amused/test_amused_img2img.py b/tests/pipelines/amused/test_amused_img2img.py index efbca1f437a4..794f23792911 100644 --- a/tests/pipelines/amused/test_amused_img2img.py +++ b/tests/pipelines/amused/test_amused_img2img.py @@ -129,8 +129,7 @@ def test_inference_batch_consistent(self, batch_sizes=[2]): self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False) @unittest.skip("aMUSEd does not support lists of generators") - def test_inference_batch_single_identical(self): - ... + def test_inference_batch_single_identical(self): ... @slow diff --git a/tests/pipelines/amused/test_amused_inpaint.py b/tests/pipelines/amused/test_amused_inpaint.py index d397f8d81297..9c8b1a68b1e1 100644 --- a/tests/pipelines/amused/test_amused_inpaint.py +++ b/tests/pipelines/amused/test_amused_inpaint.py @@ -133,8 +133,7 @@ def test_inference_batch_consistent(self, batch_sizes=[2]): self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False) @unittest.skip("aMUSEd does not support lists of generators") - def test_inference_batch_single_identical(self): - ... + def test_inference_batch_single_identical(self): ... @slow diff --git a/utils/update_metadata.py b/utils/update_metadata.py index 840e4be78423..7d91a17b2539 100644 --- a/utils/update_metadata.py +++ b/utils/update_metadata.py @@ -24,6 +24,7 @@ Script modified from: https://github.com/huggingface/transformers/blob/main/utils/update_metadata.py """ + import argparse import os import tempfile From 07a18fd0e785ea2d7bd888a5e7439ca95ab6ba61 Mon Sep 17 00:00:00 2001 From: DannHuang Date: Thu, 18 Apr 2024 19:41:23 +0800 Subject: [PATCH 05/13] fixed arg name --- src/diffusers/loaders/ip_adapter.py | 34 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index 09622f615b4a..f930f93fb3ce 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -229,40 +229,40 @@ def load_ip_adapter( unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet unet._load_ip_adapter_weights(state_dicts, low_cpu_mem_usage=low_cpu_mem_usage) - def set_ip_adapter_scale(self, scale_configs: Union[float, Dict, List[Union[float, Dict]]], default_scale=0.0): + def set_ip_adapter_scale(self, scale: Union[float, Dict, List[Union[float, Dict]]], default_scale=0.0): """ - Set IP-Adapter scales per-transformer block. Input `scale_configs` could be a single config or a list of - configs for granular control over each IP-Adapter behavior. A config can be a float or a dictionary. + Set IP-Adapter scales per-transformer block. Input `scale` could be a single config or a list of configs for + granular control over each IP-Adapter behavior. A config can be a float or a dictionary. 
Example: ```py # To use original IP-Adapter - scale_configs = 1.0 - pipeline.set_ip_adapter_scale(scale_configs) + scale = 1.0 + pipeline.set_ip_adapter_scale(scale) # To use style block only - scale_configs = { + scale = { "up": {"block_0": [0.0, 1.0, 0.0]}, } - pipeline.set_ip_adapter_scale(scale_configs) + pipeline.set_ip_adapter_scale(scale) # To use style+layout blocks - scale_configs = { + scale = { "down": {"block_2": [0.0, 1.0]}, "up": {"block_0": [0.0, 1.0, 0.0]}, } - pipeline.set_ip_adapter_scale(scale_configs) + pipeline.set_ip_adapter_scale(scale) # To use style and layout from 2 reference images - scale_configs = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}] - pipeline.set_ip_adapter_scale(scale_configs) + scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}] + pipeline.set_ip_adapter_scale(scales) ``` """ unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet - if not isinstance(scale_configs, list): - scale_configs = [scale_configs] - scale_configs = _maybe_expand_lora_scales(unet, scale_configs, default_scale=default_scale) + if not isinstance(scale, list): + scale = [scale] + scale_configs = _maybe_expand_lora_scales(unet, scale, default_scale=default_scale) for attn_name, attn_processor in unet.attn_processors.items(): if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): @@ -275,9 +275,9 @@ def set_ip_adapter_scale(self, scale_configs: Union[float, Dict, List[Union[floa scale_configs = scale_configs * len(attn_processor.scale) for i, scale_config in enumerate(scale_configs): if isinstance(scale_config, dict): - for key, scale in scale_config.items(): - if attn_name.startswith(key): - attn_processor.scale[i] = scale + for k, s in scale_config.items(): + if attn_name.startswith(k): + attn_processor.scale[i] = s else: attn_processor.scale[i] = scale_config From c45b1c779e5a1939f0a5001f0c16fa9257dfa1b2 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 19 Apr 2024 08:37:12 +0530 Subject: [PATCH 06/13] actual style. 
--- .../train_dreambooth_lora_sd15_advanced.py | 6 +++--- .../train_dreambooth_lora_sdxl_advanced.py | 6 +++--- .../textual_inversion.py | 6 +++--- .../textual_inversion/textual_inversion.py | 6 +++--- .../textual_inversion/textual_inversion.py | 6 +++--- .../textual_inversion_sdxl.py | 6 +++--- scripts/convert_svd_to_diffusers.py | 12 ++++++------ src/diffusers/loaders/lora.py | 6 +++--- src/diffusers/loaders/lora_conversion_utils.py | 18 +++++++++--------- tests/models/autoencoders/test_models_vae.py | 6 ++++-- tests/pipelines/amused/test_amused.py | 3 ++- tests/pipelines/amused/test_amused_img2img.py | 3 ++- tests/pipelines/amused/test_amused_inpaint.py | 3 ++- 13 files changed, 46 insertions(+), 41 deletions(-) diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py index 4c6ab506fe91..6cdf2e7b21ab 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py @@ -744,9 +744,9 @@ def initialize_new_tokens(self, inserting_toks: List[str]): .to(dtype=self.dtype) * std_token_embedding ) - self.embeddings_settings[f"original_embeddings_{idx}"] = ( - text_encoder.text_model.embeddings.token_embedding.weight.data.clone() - ) + self.embeddings_settings[ + f"original_embeddings_{idx}" + ] = text_encoder.text_model.embeddings.token_embedding.weight.data.clone() self.embeddings_settings[f"std_token_embedding_{idx}"] = std_token_embedding inu = torch.ones((len(tokenizer),), dtype=torch.bool) diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py index d6a63f91939d..21a84b77245a 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py @@ -776,9 +776,9 @@ def initialize_new_tokens(self, inserting_toks: List[str]): .to(dtype=self.dtype) * std_token_embedding ) - self.embeddings_settings[f"original_embeddings_{idx}"] = ( - text_encoder.text_model.embeddings.token_embedding.weight.data.clone() - ) + self.embeddings_settings[ + f"original_embeddings_{idx}" + ] = text_encoder.text_model.embeddings.token_embedding.weight.data.clone() self.embeddings_settings[f"std_token_embedding_{idx}"] = std_token_embedding inu = torch.ones((len(tokenizer),), dtype=torch.bool) diff --git a/examples/research_projects/multi_token_textual_inversion/textual_inversion.py b/examples/research_projects/multi_token_textual_inversion/textual_inversion.py index 7aad64ecb1dd..57ad77477b0d 100644 --- a/examples/research_projects/multi_token_textual_inversion/textual_inversion.py +++ b/examples/research_projects/multi_token_textual_inversion/textual_inversion.py @@ -830,9 +830,9 @@ def main(): # Let's make sure we don't update any embedding weights besides the newly added token index_no_updates = get_mask(tokenizer, accelerator) with torch.no_grad(): - accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[index_no_updates] = ( - orig_embeds_params[index_no_updates] - ) + accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[ + index_no_updates + ] = orig_embeds_params[index_no_updates] # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: diff --git 
a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py index 5f0710e85319..e10564fa59ef 100644 --- a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py +++ b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py @@ -886,9 +886,9 @@ def main(): index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False with torch.no_grad(): - accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[index_no_updates] = ( - orig_embeds_params[index_no_updates] - ) + accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[ + index_no_updates + ] = orig_embeds_params[index_no_updates] # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index 3ae1e85723ee..4922789862b5 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -910,9 +910,9 @@ def main(): index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False with torch.no_grad(): - accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[index_no_updates] = ( - orig_embeds_params[index_no_updates] - ) + accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[ + index_no_updates + ] = orig_embeds_params[index_no_updates] # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: diff --git a/examples/textual_inversion/textual_inversion_sdxl.py b/examples/textual_inversion/textual_inversion_sdxl.py index cc020499be8e..c24a4c4f4855 100644 --- a/examples/textual_inversion/textual_inversion_sdxl.py +++ b/examples/textual_inversion/textual_inversion_sdxl.py @@ -940,9 +940,9 @@ def main(): index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False with torch.no_grad(): - accelerator.unwrap_model(text_encoder_1).get_input_embeddings().weight[index_no_updates] = ( - orig_embeds_params[index_no_updates] - ) + accelerator.unwrap_model(text_encoder_1).get_input_embeddings().weight[ + index_no_updates + ] = orig_embeds_params[index_no_updates] # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: diff --git a/scripts/convert_svd_to_diffusers.py b/scripts/convert_svd_to_diffusers.py index e46410ccb3bd..3243ce294b26 100644 --- a/scripts/convert_svd_to_diffusers.py +++ b/scripts/convert_svd_to_diffusers.py @@ -381,9 +381,9 @@ def convert_ldm_unet_checkpoint( # TODO resnet time_mixer.mix_factor if f"input_blocks.{i}.0.time_mixer.mix_factor" in unet_state_dict: - new_checkpoint[f"down_blocks.{block_id}.resnets.{layer_in_block_id}.time_mixer.mix_factor"] = ( - unet_state_dict[f"input_blocks.{i}.0.time_mixer.mix_factor"] - ) + new_checkpoint[ + f"down_blocks.{block_id}.resnets.{layer_in_block_id}.time_mixer.mix_factor" + ] = unet_state_dict[f"input_blocks.{i}.0.time_mixer.mix_factor"] if len(attentions): paths = renew_attention_paths(attentions) @@ -478,9 +478,9 @@ def convert_ldm_unet_checkpoint( ) if f"output_blocks.{i}.0.time_mixer.mix_factor" in unet_state_dict: - new_checkpoint[f"up_blocks.{block_id}.resnets.{layer_in_block_id}.time_mixer.mix_factor"] = ( - unet_state_dict[f"output_blocks.{i}.0.time_mixer.mix_factor"] - ) + new_checkpoint[ + 
f"up_blocks.{block_id}.resnets.{layer_in_block_id}.time_mixer.mix_factor" + ] = unet_state_dict[f"output_blocks.{i}.0.time_mixer.mix_factor"] output_block_list = {k: sorted(v) for k, v in output_block_list.items()} if ["conv.bias", "conv.weight"] in output_block_list.values(): diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py index 48eabbb541ab..5d89658830f1 100644 --- a/src/diffusers/loaders/lora.py +++ b/src/diffusers/loaders/lora.py @@ -1288,9 +1288,9 @@ def set_lora_device(self, adapter_names: List[str], device: Union[torch.device, text_encoder_module.lora_A[adapter_name].to(device) text_encoder_module.lora_B[adapter_name].to(device) # this is a param, not a module, so device placement is not in-place -> re-assign - text_encoder_module.lora_magnitude_vector[adapter_name] = ( - text_encoder_module.lora_magnitude_vector[adapter_name].to(device) - ) + text_encoder_module.lora_magnitude_vector[ + adapter_name + ] = text_encoder_module.lora_magnitude_vector[adapter_name].to(device) class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin): diff --git a/src/diffusers/loaders/lora_conversion_utils.py b/src/diffusers/loaders/lora_conversion_utils.py index e4877d495970..11e3311a6402 100644 --- a/src/diffusers/loaders/lora_conversion_utils.py +++ b/src/diffusers/loaders/lora_conversion_utils.py @@ -209,9 +209,9 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_ if is_unet_dora_lora: dora_scale_key_to_replace = "_lora.down." if "_lora.down." in diffusers_name else ".lora.down." - unet_state_dict[diffusers_name.replace(dora_scale_key_to_replace, ".lora_magnitude_vector.")] = ( - state_dict.pop(key.replace("lora_down.weight", "dora_scale")) - ) + unet_state_dict[ + diffusers_name.replace(dora_scale_key_to_replace, ".lora_magnitude_vector.") + ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale")) elif lora_name.startswith(("lora_te_", "lora_te1_", "lora_te2_")): if lora_name.startswith(("lora_te_", "lora_te1_")): @@ -249,13 +249,13 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_ "_lora.down." if "_lora.down." in diffusers_name else ".lora_linear_layer." ) if lora_name.startswith(("lora_te_", "lora_te1_")): - te_state_dict[diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.")] = ( - state_dict.pop(key.replace("lora_down.weight", "dora_scale")) - ) + te_state_dict[ + diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.") + ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale")) elif lora_name.startswith("lora_te2_"): - te2_state_dict[diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.")] = ( - state_dict.pop(key.replace("lora_down.weight", "dora_scale")) - ) + te2_state_dict[ + diffusers_name.replace(dora_scale_key_to_replace_te, ".lora_magnitude_vector.") + ] = state_dict.pop(key.replace("lora_down.weight", "dora_scale")) # Rename the alphas so that they can be mapped appropriately. if lora_name_alpha in state_dict: diff --git a/tests/models/autoencoders/test_models_vae.py b/tests/models/autoencoders/test_models_vae.py index 56947609ed7d..026e01f0ed6a 100644 --- a/tests/models/autoencoders/test_models_vae.py +++ b/tests/models/autoencoders/test_models_vae.py @@ -384,10 +384,12 @@ def prepare_init_args_and_inputs_for_common(self): return self.init_dict, self.inputs_dict() @unittest.skip - def test_training(self): ... + def test_training(self): + ... @unittest.skip - def test_ema_training(self): ... 
+ def test_ema_training(self): + ... class AutoencoderKLTemporalDecoderFastTests(ModelTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/amused/test_amused.py b/tests/pipelines/amused/test_amused.py index ed03fef2b0cd..f03751e2f830 100644 --- a/tests/pipelines/amused/test_amused.py +++ b/tests/pipelines/amused/test_amused.py @@ -125,7 +125,8 @@ def test_inference_batch_consistent(self, batch_sizes=[2]): self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False) @unittest.skip("aMUSEd does not support lists of generators") - def test_inference_batch_single_identical(self): ... + def test_inference_batch_single_identical(self): + ... @slow diff --git a/tests/pipelines/amused/test_amused_img2img.py b/tests/pipelines/amused/test_amused_img2img.py index 794f23792911..efbca1f437a4 100644 --- a/tests/pipelines/amused/test_amused_img2img.py +++ b/tests/pipelines/amused/test_amused_img2img.py @@ -129,7 +129,8 @@ def test_inference_batch_consistent(self, batch_sizes=[2]): self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False) @unittest.skip("aMUSEd does not support lists of generators") - def test_inference_batch_single_identical(self): ... + def test_inference_batch_single_identical(self): + ... @slow diff --git a/tests/pipelines/amused/test_amused_inpaint.py b/tests/pipelines/amused/test_amused_inpaint.py index 9c8b1a68b1e1..d397f8d81297 100644 --- a/tests/pipelines/amused/test_amused_inpaint.py +++ b/tests/pipelines/amused/test_amused_inpaint.py @@ -133,7 +133,8 @@ def test_inference_batch_consistent(self, batch_sizes=[2]): self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False) @unittest.skip("aMUSEd does not support lists of generators") - def test_inference_batch_single_identical(self): ... + def test_inference_batch_single_identical(self): + ... @slow From cb0ade60a95a254b4527e24019a7415357e1a1df Mon Sep 17 00:00:00 2001 From: DannHuang Date: Fri, 19 Apr 2024 17:20:18 +0800 Subject: [PATCH 07/13] format doc-string, add ValueError --- src/diffusers/loaders/ip_adapter.py | 26 +++++++++++----------- src/diffusers/loaders/unet_loader_utils.py | 6 ++--- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index f930f93fb3ce..2f4df82108ee 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -238,25 +238,25 @@ def set_ip_adapter_scale(self, scale: Union[float, Dict, List[Union[float, Dict] ```py # To use original IP-Adapter - scale = 1.0 - pipeline.set_ip_adapter_scale(scale) + >>> scale = 1.0 + >>> pipeline.set_ip_adapter_scale(scale) # To use style block only - scale = { - "up": {"block_0": [0.0, 1.0, 0.0]}, - } - pipeline.set_ip_adapter_scale(scale) + >>> scale = { + ... "up": {"block_0": [0.0, 1.0, 0.0]}, + ... } + >>> pipeline.set_ip_adapter_scale(scale) # To use style+layout blocks - scale = { - "down": {"block_2": [0.0, 1.0]}, - "up": {"block_0": [0.0, 1.0, 0.0]}, - } - pipeline.set_ip_adapter_scale(scale) + >>> scale = { + ... "down": {"block_2": [0.0, 1.0]}, + ... "up": {"block_0": [0.0, 1.0, 0.0]}, + ... 
} + >>> pipeline.set_ip_adapter_scale(scale) # To use style and layout from 2 reference images - scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}] - pipeline.set_ip_adapter_scale(scales) + >>> scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}] + >>> pipeline.set_ip_adapter_scale(scales) ``` """ unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet diff --git a/src/diffusers/loaders/unet_loader_utils.py b/src/diffusers/loaders/unet_loader_utils.py index 598cf33563d2..9b2e20e3350f 100644 --- a/src/diffusers/loaders/unet_loader_utils.py +++ b/src/diffusers/loaders/unet_loader_utils.py @@ -134,9 +134,9 @@ def _maybe_expand_lora_scales_for_one_adapter( if not isinstance(scales[updown][block], list): scales[updown][block] = [scales[updown][block] for _ in range(transformer_per_block[updown])] else: - assert ( - len(scales[updown][block]) == transformer_per_block[updown] - ), f"Expected {transformer_per_block[updown]} scales for {updown}.{block}, got {len(scales[updown][block])}." + raise ValueError( + f"Expected {transformer_per_block[updown]} scales for {updown}.{block}, got {len(scales[updown][block])}." + ) # eg {"down": "block_1": [1, 1]}} to {"down.block_1.0": 1, "down.block_1.1": 1} for i in blocks_with_transformer[updown]: From 3f01c6d7c82cb801521763130cf2b585a060ea36 Mon Sep 17 00:00:00 2001 From: DannHuang Date: Sat, 20 Apr 2024 11:16:55 +0800 Subject: [PATCH 08/13] resolved conflict and issue; include documentation --- docs/source/en/using-diffusers/ip_adapter.md | 88 ++++++++++++++++++++ src/diffusers/loaders/ip_adapter.py | 13 +++ src/diffusers/models/attention_processor.py | 16 +++- 3 files changed, 115 insertions(+), 2 deletions(-) diff --git a/docs/source/en/using-diffusers/ip_adapter.md b/docs/source/en/using-diffusers/ip_adapter.md index 4ae403538d2b..933f3dbd67e1 100644 --- a/docs/source/en/using-diffusers/ip_adapter.md +++ b/docs/source/en/using-diffusers/ip_adapter.md @@ -592,3 +592,91 @@ image
   
+
+### Style & layout control
+
+[InstantStyle](https://arxiv.org/abs/2404.02733) is a plug-and-play method built on top of IP-Adapter that disentangles style and layout from the image prompt to control image generation. It achieves this by inserting IP-Adapters into only specific parts of the model.
+
+By default, IP-Adapters are inserted into all layers of the model. Use the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method with a dictionary to assign per-layer scales to the IP-Adapter.
+
+```py
+from diffusers import AutoPipelineForImage2Image
+from diffusers.utils import load_image
+import torch
+
+pipeline = AutoPipelineForImage2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
+
+scale = {
+    "down": {"block_2": [0.0, 1.0]},
+    "up": {"block_0": [0.0, 1.0, 0.0]},
+}
+pipeline.set_ip_adapter_scale(scale)
+```
+
+This activates the IP-Adapter in the second layer of the model's down-part block 2 and up-part block 0. The former is the layer where the IP-Adapter injects layout information, and the latter injects style. By inserting the IP-Adapter into only these two layers, you can generate images that follow the style and layout of the image prompt while keeping the content more aligned with the text prompt.
+
+```py
+style_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg")
+
+generator = torch.Generator(device="cpu").manual_seed(42)
+image = pipeline(
+    prompt="a cat, masterpiece, best quality, high quality",
+    image=style_image,
+    negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
+    guidance_scale=5,
+    num_inference_steps=30,
+    generator=generator,
+).images[0]
+image
+```
+
+[figures: "IP-Adapter image" and "generated image"; image markup lost in extraction]
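To verify which layers the dictionary above actually enables, you can inspect the scales stored on the UNet's IP-Adapter attention processors. The following is a minimal editorial sketch (not part of this patch), assuming the `pipeline` object from the previous snippet:

```py
# Editorial sketch: print the per-layer IP-Adapter scales set by `set_ip_adapter_scale`
# so you can confirm that only the down block 2 / up block 0 layers carry a non-zero scale.
# Assumes `pipeline` from the snippet above is already set up.
from diffusers.models.attention_processor import IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0

for name, proc in pipeline.unet.attn_processors.items():
    if isinstance(proc, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
        # `proc.scale` is a list with one entry per loaded IP-Adapter
        print(name, proc.scale)
```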
+
+In contrast, inserting the IP-Adapter into all layers often generates images that focus too heavily on the image prompt and have less diversity.
+
+Activate the IP-Adapter only in the style layer and then call the pipeline again.
+
+```py
+scale = {
+    "up": {"block_0": [0.0, 1.0, 0.0]},
+}
+pipeline.set_ip_adapter_scale(scale)
+
+generator = torch.Generator(device="cpu").manual_seed(42)
+image = pipeline(
+    prompt="a cat, masterpiece, best quality, high quality",
+    image=style_image,
+    negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
+    guidance_scale=5,
+    num_inference_steps=30,
+    generator=generator,
+).images[0]
+image
+```
+
+[figures: "IP-Adapter only in style layer" and "IP-Adapter in all layers"; image markup lost in extraction]
+ +Note that you don't have to specify all layers in the dictionary. Those not included in the dictionary will be set to scale 0 which means disable IP-Adapter by default. This default value can also be changed by passing a ```default_scale``` to the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method. + +```py +pipeline.set_ip_adapter_scale(scale, default_scale=1.0) +``` diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index 2f4df82108ee..61450a85f316 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -21,6 +21,7 @@ from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_state_dict from ..utils import ( + USE_PEFT_BACKEND, _get_model_file, is_accelerate_available, is_torch_version, @@ -229,6 +230,18 @@ def load_ip_adapter( unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet unet._load_ip_adapter_weights(state_dicts, low_cpu_mem_usage=low_cpu_mem_usage) + extra_loras = unet._load_ip_adapter_loras(state_dicts) + if extra_loras != {}: + if not USE_PEFT_BACKEND: + logger.warning("PEFT backend is required to load these weights.") + else: + # apply the IP Adapter Face ID LoRA weights + peft_config = getattr(unet, "peft_config", {}) + for k, lora in extra_loras.items(): + if f"faceid_{k}" not in peft_config: + self.load_lora_weights(lora, adapter_name=f"faceid_{k}") + self.set_adapters([f"faceid_{k}"], adapter_weights=[1.0]) + def set_ip_adapter_scale(self, scale: Union[float, Dict, List[Union[float, Dict]]], default_scale=0.0): """ Set IP-Adapter scales per-transformer block. Input `scale` could be a single config or a list of configs for diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index e0a8f99e5f5f..50d7cca8d25f 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -2229,7 +2229,13 @@ def __call__( for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip( ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks ): - if scale > 0: + skip = False + if isinstance(scale, list): + if all(s == 0 for s in scale): + skip = True + elif isinstance(scale, float) and scale == 0: + skip = True + if not skip: if mask is not None: if not isinstance(scale, list): scale = [scale] @@ -2440,7 +2446,13 @@ def __call__( for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip( ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks ): - if scale > 0: + skip = False + if isinstance(scale, list): + if all(s == 0 for s in scale): + skip = True + elif isinstance(scale, float) and scale == 0: + skip = True + if not skip: if mask is not None: if not isinstance(scale, list): scale = [scale] From d2d31e90cdd49b1aa08503a3b0a0ebed79f3da8b Mon Sep 17 00:00:00 2001 From: DannHuang Date: Sat, 20 Apr 2024 12:42:08 +0800 Subject: [PATCH 09/13] ready for pull the latest --- src/diffusers/loaders/ip_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index 61450a85f316..4f0c4fbf2dca 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -242,7 +242,7 @@ def load_ip_adapter( self.load_lora_weights(lora, adapter_name=f"faceid_{k}") self.set_adapters([f"faceid_{k}"], adapter_weights=[1.0]) - def set_ip_adapter_scale(self, scale: Union[float, Dict, List[Union[float, Dict]]], default_scale=0.0): + def 
set_ip_adapter_scale(self, scale, default_scale=0.0): """ Set IP-Adapter scales per-transformer block. Input `scale` could be a single config or a list of configs for granular control over each IP-Adapter behavior. A config can be a float or a dictionary. From 5d0bdfa623d2ecbd2f90e60512db8aa39adafbe7 Mon Sep 17 00:00:00 2001 From: DannHuang Date: Sat, 20 Apr 2024 12:52:11 +0800 Subject: [PATCH 10/13] ready for merge to remote main --- docs/source/en/using-diffusers/ip_adapter.md | 6 +----- src/diffusers/loaders/ip_adapter.py | 4 ++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/docs/source/en/using-diffusers/ip_adapter.md b/docs/source/en/using-diffusers/ip_adapter.md index 933f3dbd67e1..4a64050a7b80 100644 --- a/docs/source/en/using-diffusers/ip_adapter.md +++ b/docs/source/en/using-diffusers/ip_adapter.md @@ -675,8 +675,4 @@ image -Note that you don't have to specify all layers in the dictionary. Those not included in the dictionary will be set to scale 0 which means disable IP-Adapter by default. This default value can also be changed by passing a ```default_scale``` to the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method. - -```py -pipeline.set_ip_adapter_scale(scale, default_scale=1.0) -``` +Note that you don't have to specify all layers in the dictionary. Those not included in the dictionary will be set to scale 0 which means disable IP-Adapter by default. diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index 4f0c4fbf2dca..faf9c809274b 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -242,7 +242,7 @@ def load_ip_adapter( self.load_lora_weights(lora, adapter_name=f"faceid_{k}") self.set_adapters([f"faceid_{k}"], adapter_weights=[1.0]) - def set_ip_adapter_scale(self, scale, default_scale=0.0): + def set_ip_adapter_scale(self, scale): """ Set IP-Adapter scales per-transformer block. Input `scale` could be a single config or a list of configs for granular control over each IP-Adapter behavior. A config can be a float or a dictionary. @@ -275,7 +275,7 @@ def set_ip_adapter_scale(self, scale, default_scale=0.0): unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet if not isinstance(scale, list): scale = [scale] - scale_configs = _maybe_expand_lora_scales(unet, scale, default_scale=default_scale) + scale_configs = _maybe_expand_lora_scales(unet, scale, default_scale=0.0) for attn_name, attn_processor in unet.attn_processors.items(): if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): From 6fc9a3af2df947d99e80148cc7c40d4abb0ac86d Mon Sep 17 00:00:00 2001 From: DannHuang Date: Sat, 20 Apr 2024 21:58:29 +0800 Subject: [PATCH 11/13] support multiple masked IP inputs --- src/diffusers/loaders/unet_loader_utils.py | 14 +++- src/diffusers/models/attention_processor.py | 8 +- .../test_ip_adapter_stable_diffusion.py | 79 ++++++++++++++++++- 3 files changed, 94 insertions(+), 7 deletions(-) diff --git a/src/diffusers/loaders/unet_loader_utils.py b/src/diffusers/loaders/unet_loader_utils.py index 9b2e20e3350f..11bd21da20af 100644 --- a/src/diffusers/loaders/unet_loader_utils.py +++ b/src/diffusers/loaders/unet_loader_utils.py @@ -116,6 +116,13 @@ def _maybe_expand_lora_scales_for_one_adapter( if "mid" not in scales: scales["mid"] = default_scale + elif isinstance(scales["mid"], list): + if len(scales["mid"]) == 1: + scales["mid"] = scales["mid"][0] + else: + raise ValueError( + f"Expected 1 scales for mid, got {len(scales['mid'])}." 
+ ) for updown in ["up", "down"]: if updown not in scales: @@ -123,7 +130,7 @@ def _maybe_expand_lora_scales_for_one_adapter( # eg {"down": 1} to {"down": {"block_1": 1, "block_2": 1}}} if not isinstance(scales[updown], dict): - scales[updown] = {f"block_{i}": scales[updown] for i in blocks_with_transformer[updown]} + scales[updown] = {f"block_{i}": copy.deepcopy(scales[updown]) for i in blocks_with_transformer[updown]} # eg {"down": {"block_1": 1}} to {"down": {"block_1": [1, 1]}} for i in blocks_with_transformer[updown]: @@ -133,7 +140,10 @@ def _maybe_expand_lora_scales_for_one_adapter( scales[updown][block] = default_scale if not isinstance(scales[updown][block], list): scales[updown][block] = [scales[updown][block] for _ in range(transformer_per_block[updown])] - else: + elif len(scales[updown][block]) == 1: + # a list specifying scale to each masked IP input + scales[updown][block] = scales[updown][block] * transformer_per_block[updown] + elif len(scales[updown][block]) != transformer_per_block[updown]: raise ValueError( f"Expected {transformer_per_block[updown]} scales for {updown}.{block}, got {len(scales[updown][block])}." ) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 50d7cca8d25f..429807989296 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -2233,12 +2233,12 @@ def __call__( if isinstance(scale, list): if all(s == 0 for s in scale): skip = True - elif isinstance(scale, float) and scale == 0: + elif scale == 0: skip = True if not skip: if mask is not None: if not isinstance(scale, list): - scale = [scale] + scale = [scale] * mask.shape[1] current_num_images = mask.shape[1] for i in range(current_num_images): @@ -2450,12 +2450,12 @@ def __call__( if isinstance(scale, list): if all(s == 0 for s in scale): skip = True - elif isinstance(scale, float) and scale == 0: + elif scale == 0: skip = True if not skip: if mask is not None: if not isinstance(scale, list): - scale = [scale] + scale = [scale] * mask.shape[1] current_num_images = mask.shape[1] for i in range(current_num_images): diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py index ef70baa05f19..8cb1e4099649 100644 --- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py +++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py @@ -73,7 +73,7 @@ def get_image_processor(self, repo_id): image_processor = CLIPImageProcessor.from_pretrained(repo_id) return image_processor - def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False, for_sdxl=False, for_masks=False): + def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False, for_sdxl=False, for_masks=False, for_instant_style=False): image = load_image( "https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png" ) @@ -126,6 +126,38 @@ def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False, for_s } ) + elif for_instant_style: + composition_mask = load_image( + "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/1024_whole_mask.png" + ) + female_mask = load_image( + "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter_None_20240321125641_mask.png" + ) + male_mask = load_image( + "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter_None_20240321125344_mask.png" + ) + 
background_mask = load_image( + "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter_6_20240321130722_mask.png" + ) + ip_composition_image = load_image( + "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321125152.png" + ) + ip_female_style = load_image( + "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321125625.png" + ) + ip_male_style = load_image( + "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321125329.png" + ) + ip_background = load_image( + "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321130643.png" + ) + input_kwargs.update( + { + "ip_adapter_image": [ip_composition_image, [ip_female_style, ip_male_style, ip_background]], + "cross_attention_kwargs": {"ip_adapter_masks": [[composition_mask], [female_mask, male_mask, background_mask]]}, + } + ) + return input_kwargs @@ -575,6 +607,51 @@ def test_ip_adapter_multiple_masks(self): max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 + def test_instant_style_multiple_masks(self): + image_encoder = CLIPVisionModelWithProjection.from_pretrained( + "h94/IP-Adapter", subfolder="models/image_encoder", torch_dtype=torch.float16 + ).to("cuda") + pipeline = StableDiffusionXLPipeline.from_pretrained( + "RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, image_encoder=image_encoder, variant="fp16" + ).to("cuda") + pipeline.enable_model_cpu_offload() + + pipeline.load_ip_adapter( + ["ostris/ip-composition-adapter", "h94/IP-Adapter"], + subfolder=["", "sdxl_models"], + weight_name=[ + "ip_plus_composition_sdxl.safetensors", + "ip-adapter_sdxl_vit-h.safetensors", + ], + image_encoder_folder=None, + ) + scale_1 = { + "down": [[0.0, 0.0, 1.0]], + "mid": [[0.0, 0.0, 1.0]], + "up": { + "block_0": [[0.0, 0.0, 1.0], [1.0, 1.0, 1.0], [0.0, 0.0, 1.0]], + "block_1": [[0.0, 0.0, 1.0]] + }, + } + pipeline.set_ip_adapter_scale([1.0, scale_1]) + + inputs = self.get_dummy_inputs(for_instant_style=True) + processor = IPAdapterMaskProcessor() + masks1 = inputs["cross_attention_kwargs"]["ip_adapter_masks"][0] + masks2 = inputs["cross_attention_kwargs"]["ip_adapter_masks"][1] + masks1 = processor.preprocess(masks1, height=1024, width=1024) + masks2 = processor.preprocess(masks2, height=1024, width=1024) + masks2 = masks2.reshape(1, masks2.shape[0], masks2.shape[2], masks2.shape[3]) + inputs["cross_attention_kwargs"]["ip_adapter_masks"] = [masks1, masks2] + images = pipeline(**inputs).images + image_slice = images[0, :3, :3, -1].flatten() + expected_slice = np.array( + [0.23551631, 0.20476806, 0.14099443, 0. , 0.07675594, 0.05672678, 0. , 0. 
, 0.02099729] + ) + + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) + assert max_diff < 5e-4 + def test_ip_adapter_multiple_masks_one_adapter(self): image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") pipeline = StableDiffusionXLPipeline.from_pretrained( From 1e550beafd2e9eb1eac2b899e022e02c042d53b3 Mon Sep 17 00:00:00 2001 From: ResearcherXman Date: Sat, 20 Apr 2024 23:57:41 +0800 Subject: [PATCH 12/13] format --- src/diffusers/loaders/ip_adapter.py | 26 +++++++++---------- src/diffusers/loaders/unet_loader_utils.py | 4 +-- .../test_ip_adapter_stable_diffusion.py | 15 ++++++----- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index faf9c809274b..cb158a4bc194 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -251,25 +251,25 @@ def set_ip_adapter_scale(self, scale): ```py # To use original IP-Adapter - >>> scale = 1.0 - >>> pipeline.set_ip_adapter_scale(scale) + scale = 1.0 + pipeline.set_ip_adapter_scale(scale) # To use style block only - >>> scale = { - ... "up": {"block_0": [0.0, 1.0, 0.0]}, - ... } - >>> pipeline.set_ip_adapter_scale(scale) + scale = { + "up": {"block_0": [0.0, 1.0, 0.0]}, + } + pipeline.set_ip_adapter_scale(scale) # To use style+layout blocks - >>> scale = { - ... "down": {"block_2": [0.0, 1.0]}, - ... "up": {"block_0": [0.0, 1.0, 0.0]}, - ... } - >>> pipeline.set_ip_adapter_scale(scale) + scale = { + "down": {"block_2": [0.0, 1.0]}, + "up": {"block_0": [0.0, 1.0, 0.0]}, + } + pipeline.set_ip_adapter_scale(scale) # To use style and layout from 2 reference images - >>> scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}] - >>> pipeline.set_ip_adapter_scale(scales) + scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}] + pipeline.set_ip_adapter_scale(scales) ``` """ unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet diff --git a/src/diffusers/loaders/unet_loader_utils.py b/src/diffusers/loaders/unet_loader_utils.py index 11bd21da20af..8f202ed4d44b 100644 --- a/src/diffusers/loaders/unet_loader_utils.py +++ b/src/diffusers/loaders/unet_loader_utils.py @@ -120,9 +120,7 @@ def _maybe_expand_lora_scales_for_one_adapter( if len(scales["mid"]) == 1: scales["mid"] = scales["mid"][0] else: - raise ValueError( - f"Expected 1 scales for mid, got {len(scales['mid'])}." 
- ) + raise ValueError(f"Expected 1 scales for mid, got {len(scales['mid'])}.") for updown in ["up", "down"]: if updown not in scales: diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py index 8cb1e4099649..3a5ff03e564a 100644 --- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py +++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py @@ -73,7 +73,9 @@ def get_image_processor(self, repo_id): image_processor = CLIPImageProcessor.from_pretrained(repo_id) return image_processor - def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False, for_sdxl=False, for_masks=False, for_instant_style=False): + def get_dummy_inputs( + self, for_image_to_image=False, for_inpainting=False, for_sdxl=False, for_masks=False, for_instant_style=False + ): image = load_image( "https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png" ) @@ -154,7 +156,9 @@ def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False, for_s input_kwargs.update( { "ip_adapter_image": [ip_composition_image, [ip_female_style, ip_male_style, ip_background]], - "cross_attention_kwargs": {"ip_adapter_masks": [[composition_mask], [female_mask, male_mask, background_mask]]}, + "cross_attention_kwargs": { + "ip_adapter_masks": [[composition_mask], [female_mask, male_mask, background_mask]] + }, } ) @@ -628,10 +632,7 @@ def test_instant_style_multiple_masks(self): scale_1 = { "down": [[0.0, 0.0, 1.0]], "mid": [[0.0, 0.0, 1.0]], - "up": { - "block_0": [[0.0, 0.0, 1.0], [1.0, 1.0, 1.0], [0.0, 0.0, 1.0]], - "block_1": [[0.0, 0.0, 1.0]] - }, + "up": {"block_0": [[0.0, 0.0, 1.0], [1.0, 1.0, 1.0], [0.0, 0.0, 1.0]], "block_1": [[0.0, 0.0, 1.0]]}, } pipeline.set_ip_adapter_scale([1.0, scale_1]) @@ -646,7 +647,7 @@ def test_instant_style_multiple_masks(self): images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() expected_slice = np.array( - [0.23551631, 0.20476806, 0.14099443, 0. , 0.07675594, 0.05672678, 0. , 0. , 0.02099729] + [0.23551631, 0.20476806, 0.14099443, 0.0, 0.07675594, 0.05672678, 0.0, 0.0, 0.02099729] ) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) From 8540c1c7308d490ee5159908f21eb421a36dc84f Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 22 Apr 2024 13:04:50 -1000 Subject: [PATCH 13/13] Update utils/update_metadata.py --- utils/update_metadata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/update_metadata.py b/utils/update_metadata.py index 7d91a17b2539..840e4be78423 100644 --- a/utils/update_metadata.py +++ b/utils/update_metadata.py @@ -24,7 +24,6 @@ Script modified from: https://github.com/huggingface/transformers/blob/main/utils/update_metadata.py """ - import argparse import os import tempfile
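Taken together, the per-mask scale format added in "support multiple masked IP inputs" and exercised by `test_instant_style_multiple_masks` boils down to the usage below. This is a condensed editorial sketch rather than code from any commit; it assumes an SDXL pipeline with the two IP-Adapters loaded as in that test (the composition adapter first, the style adapter second) and masks preprocessed with `IPAdapterMaskProcessor`.

```py
# Condensed editorial sketch of the API after this patch series; assumes `pipeline` is an
# SDXL pipeline with two IP-Adapters loaded as in test_instant_style_multiple_masks, e.g.:
#   pipeline.load_ip_adapter(
#       ["ostris/ip-composition-adapter", "h94/IP-Adapter"],
#       subfolder=["", "sdxl_models"],
#       weight_name=["ip_plus_composition_sdxl.safetensors", "ip-adapter_sdxl_vit-h.safetensors"],
#       image_encoder_folder=None,
#   )

# Per-block config for the second (style) adapter. Each inner list holds one scale per
# masked IP image, so [0.0, 0.0, 1.0] applies this adapter only to the third masked region.
style_scale = {
    "down": [[0.0, 0.0, 1.0]],
    "mid": [[0.0, 0.0, 1.0]],
    "up": {
        "block_0": [[0.0, 0.0, 1.0], [1.0, 1.0, 1.0], [0.0, 0.0, 1.0]],
        "block_1": [[0.0, 0.0, 1.0]],
    },
}

# One entry per loaded IP-Adapter: the composition adapter keeps a plain float scale,
# while the style adapter uses the per-block, per-mask config above.
pipeline.set_ip_adapter_scale([1.0, style_scale])
```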