|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations |
|
|
|
from typing import Any, Dict, List, Optional, Tuple, Union |
|
import logging |
|
|
|
import torch |
|
from diffusers.models.attention_processor import Attention, AttnProcessor |
|
from einops import rearrange, repeat |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
import xformers |
|
from diffusers.models.lora import LoRACompatibleLinear |
|
from diffusers.models.unet_2d_condition import ( |
|
UNet2DConditionModel, |
|
UNet2DConditionOutput, |
|
) |
|
from diffusers.configuration_utils import ConfigMixin, register_to_config |
|
from diffusers.utils.constants import USE_PEFT_BACKEND |
|
from diffusers.utils.deprecation_utils import deprecate |
|
from diffusers.utils.peft_utils import scale_lora_layers, unscale_lora_layers |
|
from diffusers.utils.torch_utils import maybe_allow_in_graph |
|
from diffusers.models.modeling_utils import ModelMixin, load_state_dict |
|
from diffusers.loaders import UNet2DConditionLoadersMixin |
|
from diffusers.utils import ( |
|
USE_PEFT_BACKEND, |
|
BaseOutput, |
|
deprecate, |
|
scale_lora_layers, |
|
unscale_lora_layers, |
|
) |
|
from diffusers.models.activations import get_activation |
|
from diffusers.models.attention_processor import ( |
|
ADDED_KV_ATTENTION_PROCESSORS, |
|
CROSS_ATTENTION_PROCESSORS, |
|
AttentionProcessor, |
|
AttnAddedKVProcessor, |
|
AttnProcessor, |
|
) |
|
from diffusers.models.embeddings import ( |
|
GaussianFourierProjection, |
|
ImageHintTimeEmbedding, |
|
ImageProjection, |
|
ImageTimeEmbedding, |
|
PositionNet, |
|
TextImageProjection, |
|
TextImageTimeEmbedding, |
|
TextTimeEmbedding, |
|
TimestepEmbedding, |
|
Timesteps, |
|
) |
|
from diffusers.models.modeling_utils import ModelMixin |
|
|
|
|
|
from ..data.data_util import align_repeat_tensor_single_dim |
|
from .unet_3d_condition import UNet3DConditionModel |
|
from .attention import BasicTransformerBlock, IPAttention |
|
from .unet_2d_blocks import ( |
|
UNetMidBlock2D, |
|
UNetMidBlock2DCrossAttn, |
|
UNetMidBlock2DSimpleCrossAttn, |
|
get_down_block, |
|
get_up_block, |
|
) |
|
|
|
from . import Model_Register |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
@Model_Register.register |
|
class ReferenceNet2D(UNet2DConditionModel, nn.Module): |
|
"""继承 UNet2DConditionModel. 新增功能,类似controlnet 返回模型中间特征,用于后续作用 |
|
Inherits from UNet2DConditionModel and, similar to ControlNet, returns the intermediate features of the model so that they can be used by later stages.

Args:

UNet2DConditionModel: the base class this network is derived from.
|
""" |
|
|
|
_supports_gradient_checkpointing = True |
|
print_idx = 0 |
|
|
|
@register_to_config
def __init__(
    self,
    sample_size: int | None = None,
    in_channels: int = 4,
    out_channels: int = 4,
    center_input_sample: bool = False,
    flip_sin_to_cos: bool = True,
    freq_shift: int = 0,
    down_block_types: Tuple[str] = (
        "CrossAttnDownBlock2D",
        "CrossAttnDownBlock2D",
        "CrossAttnDownBlock2D",
        "DownBlock2D",
    ),
    mid_block_type: str | None = "UNetMidBlock2DCrossAttn",
    up_block_types: Tuple[str] = (
        "UpBlock2D",
        "CrossAttnUpBlock2D",
        "CrossAttnUpBlock2D",
        "CrossAttnUpBlock2D",
    ),
    only_cross_attention: bool | Tuple[bool] = False,
    block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
    layers_per_block: int | Tuple[int] = 2,
    downsample_padding: int = 1,
    mid_block_scale_factor: float = 1,
    dropout: float = 0,
    act_fn: str = "silu",
    norm_num_groups: int | None = 32,
    norm_eps: float = 0.00001,
    cross_attention_dim: int | Tuple[int] = 1280,
    transformer_layers_per_block: int | Tuple[int] | Tuple[Tuple] = 1,
    reverse_transformer_layers_per_block: Tuple[Tuple[int]] | None = None,
    encoder_hid_dim: int | None = None,
    encoder_hid_dim_type: str | None = None,
    attention_head_dim: int | Tuple[int] = 8,
    num_attention_heads: int | Tuple[int] | None = None,
    dual_cross_attention: bool = False,
    use_linear_projection: bool = False,
    class_embed_type: str | None = None,
    addition_embed_type: str | None = None,
    addition_time_embed_dim: int | None = None,
    num_class_embeds: int | None = None,
    upcast_attention: bool = False,
    resnet_time_scale_shift: str = "default",
    resnet_skip_time_act: bool = False,
    resnet_out_scale_factor: int = 1,
    time_embedding_type: str = "positional",
    time_embedding_dim: int | None = None,
    time_embedding_act_fn: str | None = None,
    timestep_post_act: str | None = None,
    time_cond_proj_dim: int | None = None,
    conv_in_kernel: int = 3,
    conv_out_kernel: int = 3,
    projection_class_embeddings_input_dim: int | None = None,
    attention_type: str = "default",
    class_embeddings_concat: bool = False,
    mid_block_only_cross_attention: bool | None = None,
    cross_attention_norm: str | None = None,
    addition_embed_type_num_heads=64,
    need_self_attn_block_embs: bool = False,
    need_block_embs: bool = False,
):
    """Build the reference network from a UNet2DConditionModel-style config.

    The constructor validates the per-block settings, then creates, in this
    order: the input convolution, the timestep embedding, the optional
    encoder-hidden-state projection, the optional class embedding, the
    optional additional embedding, the down blocks, the mid block, the up
    blocks and the output head. Afterwards the output head
    (`conv_norm_out`, `conv_act`, `conv_out`) and several sub-layers of the
    last transformer block of the last up block are set to `None`; when
    `need_self_attn_block_embs` is false, `up_blocks` is set to `None` as
    well. Finally `self.insert_spatial_self_attn_idx()` is called.

    Args:
        sample_size: Stored as `self.sample_size`.
        in_channels: Number of input channels of `conv_in`.
        out_channels: Number of output channels of the `conv_out` layer
            (which is built and then replaced by `None`).
        center_input_sample: Not read here; kept in the registered config.
        flip_sin_to_cos: Forwarded to the timestep projection layers.
        freq_shift: Forwarded to the `Timesteps` projection layers.
        down_block_types: Block type name for every down block.
        mid_block_type: `"UNetMidBlock2DCrossAttn"`,
            `"UNetMidBlock2DSimpleCrossAttn"`, `"UNetMidBlock2D"` or `None`
            (no mid block).
        up_block_types: Block type name for every up block; must have the
            same length as `down_block_types`.
        only_cross_attention: Single flag or one flag per down block.
        block_out_channels: Output channels of every down block; must have
            the same length as `down_block_types`.
        layers_per_block: Single value or one value per down block. Up
            blocks are built with one additional layer.
        downsample_padding: Forwarded to the down blocks.
        mid_block_scale_factor: `output_scale_factor` of the mid block.
        dropout: Forwarded to all blocks.
        act_fn: Activation name used by the embeddings and blocks.
        norm_num_groups: Group count for the normalisation layers; `None`
            skips creation of the output norm and activation.
        norm_eps: Epsilon for the normalisation layers.
        cross_attention_dim: Single value or one value per down block.
        transformer_layers_per_block: Single value or one value per down
            block.
        reverse_transformer_layers_per_block: Per-up-block values; when
            `None` the reversed `transformer_layers_per_block` is used.
        encoder_hid_dim: Input width of `encoder_hid_proj` (and of the
            image-based additional embeddings).
        encoder_hid_dim_type: `None`, `"text_proj"`, `"text_image_proj"`
            or `"image_proj"`. Defaults to `"text_proj"` when only
            `encoder_hid_dim` is given.
        attention_head_dim: Single value or one value per down block.
        num_attention_heads: Must be `None`; the value of
            `attention_head_dim` is used instead.
        dual_cross_attention: Forwarded to the blocks.
        use_linear_projection: Forwarded to the blocks.
        class_embed_type: `None`, `"timestep"`, `"identity"`,
            `"projection"` or `"simple_projection"`.
        addition_embed_type: `None`, `"text"`, `"text_image"`,
            `"text_time"`, `"image"` or `"image_hint"`.
        addition_time_embed_dim: Width of `add_time_proj` for
            `"text_time"`.
        num_class_embeds: Size of the class embedding table when
            `class_embed_type` is `None`.
        upcast_attention: Forwarded to the blocks.
        resnet_time_scale_shift: Forwarded to the blocks.
        resnet_skip_time_act: Forwarded to the blocks.
        resnet_out_scale_factor: Forwarded to the down and up blocks.
        time_embedding_type: `"positional"` or `"fourier"`.
        time_embedding_dim: Overrides the default timestep embedding width.
        time_embedding_act_fn: Optional activation applied to the combined
            embedding (stored as `self.time_embed_act`).
        timestep_post_act: `post_act_fn` of the timestep embedding.
        time_cond_proj_dim: `cond_proj_dim` of the timestep embedding.
        conv_in_kernel: Kernel size of `conv_in`.
        conv_out_kernel: Kernel size of `conv_out`.
        projection_class_embeddings_input_dim: Input width of the
            projection-style class embedding and of the `"text_time"`
            additional embedding.
        attention_type: Forwarded to the blocks; `"gated"` and
            `"gated-text-image"` additionally create `self.position_net`.
        class_embeddings_concat: When true the blocks receive a time
            embedding that is twice as wide.
        mid_block_only_cross_attention: Defaults to `only_cross_attention`
            when that is a single bool, otherwise to `False`.
        cross_attention_norm: Forwarded to the blocks.
        addition_embed_type_num_heads: Head count of the `"text"`
            additional embedding.
        need_self_attn_block_embs: Stored on the instance; when false the
            up blocks are discarded at the end of construction.
        need_block_embs: Stored on the instance.

    Raises:
        ValueError: If `num_attention_heads` is given, if per-block
            sequences do not match `down_block_types` in length, or if an
            unsupported embedding / mid-block type is requested.
    """
    # NOTE(review): called without arguments, so the parent constructor
    # presumably runs with its own defaults before everything is rebuilt
    # below -- confirm this is intended.
    super().__init__()

    self.sample_size = sample_size

    if num_attention_heads is not None:
        raise ValueError(
            "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
        )

    # `num_attention_heads` is always None at this point, so the value of
    # `attention_head_dim` is what the blocks receive as their head count.
    num_attention_heads = num_attention_heads or attention_head_dim

    # ---- input validation ----
    if len(down_block_types) != len(up_block_types):
        raise ValueError(
            f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
        )

    if len(block_out_channels) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(only_cross_attention, bool) and len(
        only_cross_attention
    ) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(
        down_block_types
    ):
        raise ValueError(
            f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(
        down_block_types
    ):
        raise ValueError(
            f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
        )

    # Tuples are checked as well as lists: a tuple of the wrong length would
    # otherwise be indexed per block further down without any validation.
    if isinstance(cross_attention_dim, (list, tuple)) and len(
        cross_attention_dim
    ) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(layers_per_block, int) and len(layers_per_block) != len(
        down_block_types
    ):
        raise ValueError(
            f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
        )
    if (
        isinstance(transformer_layers_per_block, list)
        and reverse_transformer_layers_per_block is None
    ):
        for layer_number_per_block in transformer_layers_per_block:
            if isinstance(layer_number_per_block, list):
                raise ValueError(
                    "Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet."
                )

    # ---- input convolution ----
    conv_in_padding = (conv_in_kernel - 1) // 2
    self.conv_in = nn.Conv2d(
        in_channels,
        block_out_channels[0],
        kernel_size=conv_in_kernel,
        padding=conv_in_padding,
    )

    # ---- timestep projection and embedding ----
    if time_embedding_type == "fourier":
        time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
        if time_embed_dim % 2 != 0:
            raise ValueError(
                f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}."
            )
        self.time_proj = GaussianFourierProjection(
            time_embed_dim // 2,
            set_W_to_weight=False,
            log=False,
            flip_sin_to_cos=flip_sin_to_cos,
        )
        timestep_input_dim = time_embed_dim
    elif time_embedding_type == "positional":
        time_embed_dim = time_embedding_dim or block_out_channels[0] * 4

        self.time_proj = Timesteps(
            block_out_channels[0], flip_sin_to_cos, freq_shift
        )
        timestep_input_dim = block_out_channels[0]
    else:
        raise ValueError(
            f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
        )

    self.time_embedding = TimestepEmbedding(
        timestep_input_dim,
        time_embed_dim,
        act_fn=act_fn,
        post_act_fn=timestep_post_act,
        cond_proj_dim=time_cond_proj_dim,
    )

    # ---- optional projection of the encoder hidden states ----
    if encoder_hid_dim_type is None and encoder_hid_dim is not None:
        encoder_hid_dim_type = "text_proj"
        # Keep the registered config in sync with the defaulted value.
        self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
        logger.info(
            "encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined."
        )

    if encoder_hid_dim is None and encoder_hid_dim_type is not None:
        raise ValueError(
            f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
        )

    if encoder_hid_dim_type == "text_proj":
        self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
    elif encoder_hid_dim_type == "text_image_proj":
        self.encoder_hid_proj = TextImageProjection(
            text_embed_dim=encoder_hid_dim,
            image_embed_dim=cross_attention_dim,
            cross_attention_dim=cross_attention_dim,
        )
    elif encoder_hid_dim_type == "image_proj":
        self.encoder_hid_proj = ImageProjection(
            image_embed_dim=encoder_hid_dim,
            cross_attention_dim=cross_attention_dim,
        )
    elif encoder_hid_dim_type is not None:
        # The message lists every value accepted by the branches above.
        raise ValueError(
            f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj', 'text_image_proj' or 'image_proj'."
        )
    else:
        self.encoder_hid_proj = None

    # ---- optional class embedding ----
    if class_embed_type is None and num_class_embeds is not None:
        self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
    elif class_embed_type == "timestep":
        self.class_embedding = TimestepEmbedding(
            timestep_input_dim, time_embed_dim, act_fn=act_fn
        )
    elif class_embed_type == "identity":
        self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
    elif class_embed_type == "projection":
        if projection_class_embeddings_input_dim is None:
            raise ValueError(
                "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
            )
        self.class_embedding = TimestepEmbedding(
            projection_class_embeddings_input_dim, time_embed_dim
        )
    elif class_embed_type == "simple_projection":
        if projection_class_embeddings_input_dim is None:
            raise ValueError(
                "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
            )
        self.class_embedding = nn.Linear(
            projection_class_embeddings_input_dim, time_embed_dim
        )
    else:
        self.class_embedding = None

    # ---- optional additional embedding ----
    if addition_embed_type == "text":
        if encoder_hid_dim is not None:
            text_time_embedding_from_dim = encoder_hid_dim
        else:
            text_time_embedding_from_dim = cross_attention_dim

        self.add_embedding = TextTimeEmbedding(
            text_time_embedding_from_dim,
            time_embed_dim,
            num_heads=addition_embed_type_num_heads,
        )
    elif addition_embed_type == "text_image":
        self.add_embedding = TextImageTimeEmbedding(
            text_embed_dim=cross_attention_dim,
            image_embed_dim=cross_attention_dim,
            time_embed_dim=time_embed_dim,
        )
    elif addition_embed_type == "text_time":
        self.add_time_proj = Timesteps(
            addition_time_embed_dim, flip_sin_to_cos, freq_shift
        )
        self.add_embedding = TimestepEmbedding(
            projection_class_embeddings_input_dim, time_embed_dim
        )
    elif addition_embed_type == "image":
        self.add_embedding = ImageTimeEmbedding(
            image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
        )
    elif addition_embed_type == "image_hint":
        self.add_embedding = ImageHintTimeEmbedding(
            image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
        )
    elif addition_embed_type is not None:
        # The message lists every value accepted by the branches above.
        raise ValueError(
            f"addition_embed_type: {addition_embed_type} must be None, 'text', 'text_image', 'text_time', 'image' or 'image_hint'."
        )

    if time_embedding_act_fn is None:
        self.time_embed_act = None
    else:
        self.time_embed_act = get_activation(time_embedding_act_fn)

    self.down_blocks = nn.ModuleList([])
    self.up_blocks = nn.ModuleList([])

    # ---- broadcast scalar settings to one entry per down block ----
    if isinstance(only_cross_attention, bool):
        if mid_block_only_cross_attention is None:
            mid_block_only_cross_attention = only_cross_attention

        only_cross_attention = [only_cross_attention] * len(down_block_types)

    if mid_block_only_cross_attention is None:
        mid_block_only_cross_attention = False

    if isinstance(num_attention_heads, int):
        num_attention_heads = (num_attention_heads,) * len(down_block_types)

    if isinstance(attention_head_dim, int):
        attention_head_dim = (attention_head_dim,) * len(down_block_types)

    if isinstance(cross_attention_dim, int):
        cross_attention_dim = (cross_attention_dim,) * len(down_block_types)

    if isinstance(layers_per_block, int):
        layers_per_block = [layers_per_block] * len(down_block_types)

    if isinstance(transformer_layers_per_block, int):
        transformer_layers_per_block = [transformer_layers_per_block] * len(
            down_block_types
        )

    if class_embeddings_concat:
        # The blocks receive a time embedding of twice the width when class
        # embeddings are concatenated instead of added.
        blocks_time_embed_dim = time_embed_dim * 2
    else:
        blocks_time_embed_dim = time_embed_dim

    # ---- down blocks ----
    output_channel = block_out_channels[0]
    for i, down_block_type in enumerate(down_block_types):
        input_channel = output_channel
        output_channel = block_out_channels[i]
        is_final_block = i == len(block_out_channels) - 1

        down_block = get_down_block(
            down_block_type,
            num_layers=layers_per_block[i],
            transformer_layers_per_block=transformer_layers_per_block[i],
            in_channels=input_channel,
            out_channels=output_channel,
            temb_channels=blocks_time_embed_dim,
            add_downsample=not is_final_block,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            resnet_groups=norm_num_groups,
            cross_attention_dim=cross_attention_dim[i],
            num_attention_heads=num_attention_heads[i],
            downsample_padding=downsample_padding,
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention[i],
            upcast_attention=upcast_attention,
            resnet_time_scale_shift=resnet_time_scale_shift,
            attention_type=attention_type,
            resnet_skip_time_act=resnet_skip_time_act,
            resnet_out_scale_factor=resnet_out_scale_factor,
            cross_attention_norm=cross_attention_norm,
            attention_head_dim=attention_head_dim[i]
            if attention_head_dim[i] is not None
            else output_channel,
            dropout=dropout,
        )
        self.down_blocks.append(down_block)

    # ---- mid block ----
    if mid_block_type == "UNetMidBlock2DCrossAttn":
        self.mid_block = UNetMidBlock2DCrossAttn(
            transformer_layers_per_block=transformer_layers_per_block[-1],
            in_channels=block_out_channels[-1],
            temb_channels=blocks_time_embed_dim,
            dropout=dropout,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            output_scale_factor=mid_block_scale_factor,
            resnet_time_scale_shift=resnet_time_scale_shift,
            cross_attention_dim=cross_attention_dim[-1],
            num_attention_heads=num_attention_heads[-1],
            resnet_groups=norm_num_groups,
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
            upcast_attention=upcast_attention,
            attention_type=attention_type,
        )
    elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
        self.mid_block = UNetMidBlock2DSimpleCrossAttn(
            in_channels=block_out_channels[-1],
            temb_channels=blocks_time_embed_dim,
            dropout=dropout,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            output_scale_factor=mid_block_scale_factor,
            cross_attention_dim=cross_attention_dim[-1],
            attention_head_dim=attention_head_dim[-1],
            resnet_groups=norm_num_groups,
            resnet_time_scale_shift=resnet_time_scale_shift,
            skip_time_act=resnet_skip_time_act,
            only_cross_attention=mid_block_only_cross_attention,
            cross_attention_norm=cross_attention_norm,
        )
    elif mid_block_type == "UNetMidBlock2D":
        self.mid_block = UNetMidBlock2D(
            in_channels=block_out_channels[-1],
            temb_channels=blocks_time_embed_dim,
            dropout=dropout,
            num_layers=0,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            output_scale_factor=mid_block_scale_factor,
            resnet_groups=norm_num_groups,
            resnet_time_scale_shift=resnet_time_scale_shift,
            add_attention=False,
        )
    elif mid_block_type is None:
        self.mid_block = None
    else:
        raise ValueError(f"unknown mid_block_type : {mid_block_type}")

    # Number of up blocks that upsample; incremented in the loop below.
    self.num_upsamplers = 0

    # ---- up blocks (per-block settings are walked in reverse order) ----
    reversed_block_out_channels = list(reversed(block_out_channels))
    reversed_num_attention_heads = list(reversed(num_attention_heads))
    reversed_layers_per_block = list(reversed(layers_per_block))
    reversed_cross_attention_dim = list(reversed(cross_attention_dim))
    reversed_transformer_layers_per_block = (
        list(reversed(transformer_layers_per_block))
        if reverse_transformer_layers_per_block is None
        else reverse_transformer_layers_per_block
    )
    only_cross_attention = list(reversed(only_cross_attention))

    output_channel = reversed_block_out_channels[0]
    for i, up_block_type in enumerate(up_block_types):
        is_final_block = i == len(block_out_channels) - 1

        prev_output_channel = output_channel
        output_channel = reversed_block_out_channels[i]
        input_channel = reversed_block_out_channels[
            min(i + 1, len(block_out_channels) - 1)
        ]

        # Every up block except the last one upsamples.
        if not is_final_block:
            add_upsample = True
            self.num_upsamplers += 1
        else:
            add_upsample = False

        up_block = get_up_block(
            up_block_type,
            num_layers=reversed_layers_per_block[i] + 1,
            transformer_layers_per_block=reversed_transformer_layers_per_block[i],
            in_channels=input_channel,
            out_channels=output_channel,
            prev_output_channel=prev_output_channel,
            temb_channels=blocks_time_embed_dim,
            add_upsample=add_upsample,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            resolution_idx=i,
            resnet_groups=norm_num_groups,
            cross_attention_dim=reversed_cross_attention_dim[i],
            num_attention_heads=reversed_num_attention_heads[i],
            dual_cross_attention=dual_cross_attention,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention[i],
            upcast_attention=upcast_attention,
            resnet_time_scale_shift=resnet_time_scale_shift,
            attention_type=attention_type,
            resnet_skip_time_act=resnet_skip_time_act,
            resnet_out_scale_factor=resnet_out_scale_factor,
            cross_attention_norm=cross_attention_norm,
            # NOTE(review): unlike the other per-block settings this one is
            # indexed on the un-reversed sequence -- confirm this is
            # intended before changing it (it affects the built modules).
            attention_head_dim=attention_head_dim[i]
            if attention_head_dim[i] is not None
            else output_channel,
            dropout=dropout,
        )
        self.up_blocks.append(up_block)

    # ---- output head (discarded again further down) ----
    if norm_num_groups is not None:
        self.conv_norm_out = nn.GroupNorm(
            num_channels=block_out_channels[0],
            num_groups=norm_num_groups,
            eps=norm_eps,
        )

        self.conv_act = get_activation(act_fn)

    else:
        self.conv_norm_out = None
        self.conv_act = None

    conv_out_padding = (conv_out_kernel - 1) // 2
    self.conv_out = nn.Conv2d(
        block_out_channels[0],
        out_channels,
        kernel_size=conv_out_kernel,
        padding=conv_out_padding,
    )

    if attention_type in ["gated", "gated-text-image"]:
        positive_len = 768
        if isinstance(cross_attention_dim, int):
            positive_len = cross_attention_dim
        elif isinstance(cross_attention_dim, tuple) or isinstance(
            cross_attention_dim, list
        ):
            positive_len = cross_attention_dim[0]

        feature_type = "text-only" if attention_type == "gated" else "text-image"
        self.position_net = PositionNet(
            positive_len=positive_len,
            out_dim=cross_attention_dim,
            feature_type=feature_type,
        )
    self.need_block_embs = need_block_embs
    self.need_self_attn_block_embs = need_self_attn_block_embs

    # The output head is not kept on this network.
    self.conv_norm_out = None
    self.conv_act = None
    self.conv_out = None

    # Drop sub-layers of the last transformer block of the last up block.
    # This requires the last up block to expose `attentions` whose items
    # have `transformer_blocks`.
    self.up_blocks[-1].attentions[-1].proj_out = None
    self.up_blocks[-1].attentions[-1].transformer_blocks[-1].attn1 = None
    self.up_blocks[-1].attentions[-1].transformer_blocks[-1].attn2 = None
    self.up_blocks[-1].attentions[-1].transformer_blocks[-1].norm2 = None
    self.up_blocks[-1].attentions[-1].transformer_blocks[-1].ff = None
    self.up_blocks[-1].attentions[-1].transformer_blocks[-1].norm3 = None
    if not self.need_self_attn_block_embs:
        self.up_blocks = None

    self.insert_spatial_self_attn_idx()
|
|
|
def forward( |
|
self, |
|
sample: torch.FloatTensor, |
|
timestep: Union[torch.Tensor, float, int], |
|
encoder_hidden_states: torch.Tensor, |
|
class_labels: Optional[torch.Tensor] = None, |
|
timestep_cond: Optional[torch.Tensor] = None, |
|
attention_mask: Optional[torch.Tensor] = None, |
|
cross_attention_kwargs: Optional[Dict[str, Any]] = None, |
|
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, |
|
down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, |
|
mid_block_additional_residual: Optional[torch.Tensor] = None, |
|
down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None, |
|
encoder_attention_mask: Optional[torch.Tensor] = None, |
|
return_dict: bool = True, |
|
|
|
num_frames: int = None, |
|
return_ndim: int = 5, |
|
|
|
) -> Union[UNet2DConditionOutput, Tuple]: |
|
r""" |
|
The [`UNet2DConditionModel`] forward method. |
|
|
|
Args: |
|
sample (`torch.FloatTensor`): |
|
The noisy input tensor with the following shape `(batch, channel, height, width)`. |
|
timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. |
|
encoder_hidden_states (`torch.FloatTensor`): |
|
The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. |
|
class_labels (`torch.Tensor`, *optional*, defaults to `None`): |
|
Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. |
|
timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`): |
|
Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed |
|
through the `self.time_embedding` layer to obtain the timestep embeddings. |
|
attention_mask (`torch.Tensor`, *optional*, defaults to `None`): |
|
An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask |
|
is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large |
|
negative values to the attention scores corresponding to "discard" tokens. |
|
cross_attention_kwargs (`dict`, *optional*): |
|
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under |
|
`self.processor` in |
|
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). |
|
added_cond_kwargs: (`dict`, *optional*): |
|
A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that |
|
are passed along to the UNet blocks. |
|
down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*): |
|
A tuple of tensors that if specified are added to the residuals of down unet blocks. |
|
mid_block_additional_residual: (`torch.Tensor`, *optional*): |
|
A tensor that if specified is added to the residual of the middle unet block. |
|
encoder_attention_mask (`torch.Tensor`): |
|
A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If |
|
`True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, |
|
which adds large negative values to the attention scores corresponding to "discard" tokens. |
|
return_dict (`bool`, *optional*, defaults to `True`): |
|
Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain |
|
tuple. |
|
cross_attention_kwargs (`dict`, *optional*): |
|
A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. |
|
added_cond_kwargs: (`dict`, *optional*): |
|
A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that |
|
are passed along to the UNet blocks. |
|
down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*): |
|
additional residuals to be added to UNet long skip connections from down blocks to up blocks for |
|
example from ControlNet side model(s) |
|
mid_block_additional_residual (`torch.Tensor`, *optional*): |
|
additional residual to be added to UNet mid block output, for example from ControlNet side model |
|
down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*): |
|
additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s) |
|
|
|
Returns: |
|
[`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: |
|
If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise |
|
a `tuple` is returned where the first element is the sample tensor. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
default_overall_up_factor = 2**self.num_upsamplers |
|
|
|
|
|
forward_upsample_size = False |
|
upsample_size = None |
|
|
|
for dim in sample.shape[-2:]: |
|
if dim % default_overall_up_factor != 0: |
|
|
|
forward_upsample_size = True |
|
break |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if attention_mask is not None: |
|
|
|
|
|
|
|
|
|
attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 |
|
attention_mask = attention_mask.unsqueeze(1) |
|
|
|
|
|
if encoder_attention_mask is not None: |
|
encoder_attention_mask = ( |
|
1 - encoder_attention_mask.to(sample.dtype) |
|
) * -10000.0 |
|
encoder_attention_mask = encoder_attention_mask.unsqueeze(1) |
|
|
|
|
|
if self.config.center_input_sample: |
|
sample = 2 * sample - 1.0 |
|
|
|
|
|
timesteps = timestep |
|
if not torch.is_tensor(timesteps): |
|
|
|
|
|
is_mps = sample.device.type == "mps" |
|
if isinstance(timestep, float): |
|
dtype = torch.float32 if is_mps else torch.float64 |
|
else: |
|
dtype = torch.int32 if is_mps else torch.int64 |
|
timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) |
|
elif len(timesteps.shape) == 0: |
|
timesteps = timesteps[None].to(sample.device) |
|
|
|
|
|
timesteps = timesteps.expand(sample.shape[0]) |
|
|
|
t_emb = self.time_proj(timesteps) |
|
|
|
|
|
|
|
|
|
t_emb = t_emb.to(dtype=sample.dtype) |
|
|
|
emb = self.time_embedding(t_emb, timestep_cond) |
|
aug_emb = None |
|
|
|
if self.class_embedding is not None: |
|
if class_labels is None: |
|
raise ValueError( |
|
"class_labels should be provided when num_class_embeds > 0" |
|
) |
|
|
|
if self.config.class_embed_type == "timestep": |
|
class_labels = self.time_proj(class_labels) |
|
|
|
|
|
|
|
class_labels = class_labels.to(dtype=sample.dtype) |
|
|
|
class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype) |
|
|
|
if self.config.class_embeddings_concat: |
|
emb = torch.cat([emb, class_emb], dim=-1) |
|
else: |
|
emb = emb + class_emb |
|
|
|
if self.config.addition_embed_type == "text": |
|
aug_emb = self.add_embedding(encoder_hidden_states) |
|
elif self.config.addition_embed_type == "text_image": |
|
|
|
if "image_embeds" not in added_cond_kwargs: |
|
raise ValueError( |
|
f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" |
|
) |
|
|
|
image_embs = added_cond_kwargs.get("image_embeds") |
|
text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) |
|
aug_emb = self.add_embedding(text_embs, image_embs) |
|
elif self.config.addition_embed_type == "text_time": |
|
|
|
if "text_embeds" not in added_cond_kwargs: |
|
raise ValueError( |
|
f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" |
|
) |
|
text_embeds = added_cond_kwargs.get("text_embeds") |
|
if "time_ids" not in added_cond_kwargs: |
|
raise ValueError( |
|
f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" |
|
) |
|
time_ids = added_cond_kwargs.get("time_ids") |
|
time_embeds = self.add_time_proj(time_ids.flatten()) |
|
time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) |
|
add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) |
|
add_embeds = add_embeds.to(emb.dtype) |
|
aug_emb = self.add_embedding(add_embeds) |
|
elif self.config.addition_embed_type == "image": |
|
|
|
if "image_embeds" not in added_cond_kwargs: |
|
raise ValueError( |
|
f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" |
|
) |
|
image_embs = added_cond_kwargs.get("image_embeds") |
|
aug_emb = self.add_embedding(image_embs) |
|
elif self.config.addition_embed_type == "image_hint": |
|
|
|
if ( |
|
"image_embeds" not in added_cond_kwargs |
|
or "hint" not in added_cond_kwargs |
|
): |
|
raise ValueError( |
|
f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" |
|
) |
|
image_embs = added_cond_kwargs.get("image_embeds") |
|
hint = added_cond_kwargs.get("hint") |
|
aug_emb, hint = self.add_embedding(image_embs, hint) |
|
sample = torch.cat([sample, hint], dim=1) |
|
|
|
emb = emb + aug_emb if aug_emb is not None else emb |
|
|
|
if self.time_embed_act is not None: |
|
emb = self.time_embed_act(emb) |
|
|
|
if ( |
|
self.encoder_hid_proj is not None |
|
and self.config.encoder_hid_dim_type == "text_proj" |
|
): |
|
encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) |
|
elif ( |
|
self.encoder_hid_proj is not None |
|
and self.config.encoder_hid_dim_type == "text_image_proj" |
|
): |
|
|
|
if "image_embeds" not in added_cond_kwargs: |
|
raise ValueError( |
|
f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" |
|
) |
|
|
|
image_embeds = added_cond_kwargs.get("image_embeds") |
|
encoder_hidden_states = self.encoder_hid_proj( |
|
encoder_hidden_states, image_embeds |
|
) |
|
elif ( |
|
self.encoder_hid_proj is not None |
|
and self.config.encoder_hid_dim_type == "image_proj" |
|
): |
|
|
|
if "image_embeds" not in added_cond_kwargs: |
|
raise ValueError( |
|
f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" |
|
) |
|
image_embeds = added_cond_kwargs.get("image_embeds") |
|
encoder_hidden_states = self.encoder_hid_proj(image_embeds) |
|
elif ( |
|
self.encoder_hid_proj is not None |
|
and self.config.encoder_hid_dim_type == "ip_image_proj" |
|
): |
|
if "image_embeds" not in added_cond_kwargs: |
|
raise ValueError( |
|
f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" |
|
) |
|
image_embeds = added_cond_kwargs.get("image_embeds") |
|
image_embeds = self.encoder_hid_proj(image_embeds).to( |
|
encoder_hidden_states.dtype |
|
) |
|
encoder_hidden_states = torch.cat( |
|
[encoder_hidden_states, image_embeds], dim=1 |
|
) |
|
|
|
|
|
|
|
|
|
if self.need_self_attn_block_embs: |
|
self_attn_block_embs = [None] * self.self_attn_num |
|
else: |
|
self_attn_block_embs = None |
|
|
|
sample = self.conv_in(sample) |
|
if self.print_idx == 0: |
|
logger.debug(f"after conv in sample={sample.mean()}") |
|
|
|
if ( |
|
cross_attention_kwargs is not None |
|
and cross_attention_kwargs.get("gligen", None) is not None |
|
): |
|
cross_attention_kwargs = cross_attention_kwargs.copy() |
|
gligen_args = cross_attention_kwargs.pop("gligen") |
|
cross_attention_kwargs["gligen"] = { |
|
"objs": self.position_net(**gligen_args) |
|
} |
|
|
|
|
|
lora_scale = ( |
|
cross_attention_kwargs.get("scale", 1.0) |
|
if cross_attention_kwargs is not None |
|
else 1.0 |
|
) |
|
if USE_PEFT_BACKEND: |
|
|
|
scale_lora_layers(self, lora_scale) |
|
|
|
is_controlnet = ( |
|
mid_block_additional_residual is not None |
|
and down_block_additional_residuals is not None |
|
) |
|
|
|
is_adapter = down_intrablock_additional_residuals is not None |
|
|
|
|
|
|
|
if ( |
|
not is_adapter |
|
and mid_block_additional_residual is None |
|
and down_block_additional_residuals is not None |
|
): |
|
deprecate( |
|
"T2I should not use down_block_additional_residuals", |
|
"1.3.0", |
|
"Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \ |
|
and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \ |
|
for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ", |
|
standard_warn=False, |
|
) |
|
down_intrablock_additional_residuals = down_block_additional_residuals |
|
is_adapter = True |
|
|
|
down_block_res_samples = (sample,) |
|
for i_downsample_block, downsample_block in enumerate(self.down_blocks): |
|
if ( |
|
hasattr(downsample_block, "has_cross_attention") |
|
and downsample_block.has_cross_attention |
|
): |
|
|
|
additional_residuals = {} |
|
if is_adapter and len(down_intrablock_additional_residuals) > 0: |
|
additional_residuals[ |
|
"additional_residuals" |
|
] = down_intrablock_additional_residuals.pop(0) |
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"downsample_block {i_downsample_block} sample={sample.mean()}" |
|
) |
|
sample, res_samples = downsample_block( |
|
hidden_states=sample, |
|
temb=emb, |
|
encoder_hidden_states=encoder_hidden_states, |
|
attention_mask=attention_mask, |
|
cross_attention_kwargs=cross_attention_kwargs, |
|
encoder_attention_mask=encoder_attention_mask, |
|
**additional_residuals, |
|
self_attn_block_embs=self_attn_block_embs, |
|
) |
|
else: |
|
sample, res_samples = downsample_block( |
|
hidden_states=sample, |
|
temb=emb, |
|
scale=lora_scale, |
|
self_attn_block_embs=self_attn_block_embs, |
|
) |
|
if is_adapter and len(down_intrablock_additional_residuals) > 0: |
|
sample += down_intrablock_additional_residuals.pop(0) |
|
|
|
down_block_res_samples += res_samples |
|
|
|
if is_controlnet: |
|
new_down_block_res_samples = () |
|
|
|
for down_block_res_sample, down_block_additional_residual in zip( |
|
down_block_res_samples, down_block_additional_residuals |
|
): |
|
down_block_res_sample = ( |
|
down_block_res_sample + down_block_additional_residual |
|
) |
|
new_down_block_res_samples = new_down_block_res_samples + ( |
|
down_block_res_sample, |
|
) |
|
|
|
down_block_res_samples = new_down_block_res_samples |
|
|
|
|
|
def reshape_return_emb(tmp_emb): |
|
if return_ndim == 4: |
|
return tmp_emb |
|
elif return_ndim == 5: |
|
return rearrange(tmp_emb, "(b t) c h w-> b c t h w", t=num_frames) |
|
else: |
|
raise ValueError( |
|
f"reshape_emb only support 4, 5 but given {return_ndim}" |
|
) |
|
|
|
if self.need_block_embs: |
|
return_down_block_res_samples = [ |
|
reshape_return_emb(tmp_emb) for tmp_emb in down_block_res_samples |
|
] |
|
else: |
|
return_down_block_res_samples = None |
|
|
|
|
|
|
|
if self.mid_block is not None: |
|
if ( |
|
hasattr(self.mid_block, "has_cross_attention") |
|
and self.mid_block.has_cross_attention |
|
): |
|
sample = self.mid_block( |
|
sample, |
|
emb, |
|
encoder_hidden_states=encoder_hidden_states, |
|
attention_mask=attention_mask, |
|
cross_attention_kwargs=cross_attention_kwargs, |
|
encoder_attention_mask=encoder_attention_mask, |
|
self_attn_block_embs=self_attn_block_embs, |
|
) |
|
else: |
|
sample = self.mid_block(sample, emb) |
|
|
|
|
|
if ( |
|
is_adapter |
|
and len(down_intrablock_additional_residuals) > 0 |
|
and sample.shape == down_intrablock_additional_residuals[0].shape |
|
): |
|
sample += down_intrablock_additional_residuals.pop(0) |
|
|
|
if is_controlnet: |
|
sample = sample + mid_block_additional_residual |
|
|
|
if self.need_block_embs: |
|
return_mid_block_res_samples = reshape_return_emb(sample) |
|
logger.debug( |
|
f"return_mid_block_res_samples, is_leaf={return_mid_block_res_samples.is_leaf}, requires_grad={return_mid_block_res_samples.requires_grad}" |
|
) |
|
else: |
|
return_mid_block_res_samples = None |
|
|
|
if self.up_blocks is not None: |
|
|
|
|
|
|
|
for i, upsample_block in enumerate(self.up_blocks): |
|
is_final_block = i == len(self.up_blocks) - 1 |
|
|
|
res_samples = down_block_res_samples[-len(upsample_block.resnets) :] |
|
down_block_res_samples = down_block_res_samples[ |
|
: -len(upsample_block.resnets) |
|
] |
|
|
|
|
|
|
|
if not is_final_block and forward_upsample_size: |
|
upsample_size = down_block_res_samples[-1].shape[2:] |
|
|
|
if ( |
|
hasattr(upsample_block, "has_cross_attention") |
|
and upsample_block.has_cross_attention |
|
): |
|
sample = upsample_block( |
|
hidden_states=sample, |
|
temb=emb, |
|
res_hidden_states_tuple=res_samples, |
|
encoder_hidden_states=encoder_hidden_states, |
|
cross_attention_kwargs=cross_attention_kwargs, |
|
upsample_size=upsample_size, |
|
attention_mask=attention_mask, |
|
encoder_attention_mask=encoder_attention_mask, |
|
self_attn_block_embs=self_attn_block_embs, |
|
) |
|
else: |
|
sample = upsample_block( |
|
hidden_states=sample, |
|
temb=emb, |
|
res_hidden_states_tuple=res_samples, |
|
upsample_size=upsample_size, |
|
scale=lora_scale, |
|
self_attn_block_embs=self_attn_block_embs, |
|
) |
|
|
|
|
|
if self.need_block_embs or self.need_self_attn_block_embs: |
|
if self_attn_block_embs is not None: |
|
self_attn_block_embs = [ |
|
reshape_return_emb(tmp_emb=tmp_emb) |
|
for tmp_emb in self_attn_block_embs |
|
] |
|
self.print_idx += 1 |
|
return ( |
|
return_down_block_res_samples, |
|
return_mid_block_res_samples, |
|
self_attn_block_embs, |
|
) |
|
|
|
if not self.need_block_embs and not self.need_self_attn_block_embs: |
|
|
|
if self.conv_norm_out: |
|
sample = self.conv_norm_out(sample) |
|
sample = self.conv_act(sample) |
|
sample = self.conv_out(sample) |
|
|
|
if USE_PEFT_BACKEND: |
|
|
|
unscale_lora_layers(self, lora_scale) |
|
self.print_idx += 1 |
|
if not return_dict: |
|
return (sample,) |
|
|
|
return UNet2DConditionOutput(sample=sample) |
|
|
|
def insert_spatial_self_attn_idx(self):
    """Stamp each spatial self-attention layer and its block with an index.

    Reads the ``(attns, basic_transformers)`` pair from
    ``self.spatial_self_attns``, stores the number of attention layers in
    ``self.self_attn_num``, and then sets ``spatial_self_attn_idx`` on every
    non-``None`` layer of both lists to that layer's position in its list.
    """
    self_attn_entries, transformer_entries = self.spatial_self_attns
    self.self_attn_num = len(self_attn_entries)

    # Attention layers are handled first, then the transformer blocks; each
    # list is enumerated from zero on its own.
    for entries in (self_attn_entries, transformer_entries):
        for i, (name, layer) in enumerate(entries):
            logger.debug(f"{self.__class__.__name__}, {i}, {name}, {type(layer)}")
            if layer is None:
                continue
            layer.spatial_self_attn_idx = i
|
|
|
@property
def spatial_self_attns(self) -> Tuple[List[List[Any]], List[List[Any]]]:
    """Return the spatial self-attention layers and their blocks, sorted by name.

    Calls ``self.get_self_attns`` with ``include="attentions"`` and
    ``exclude="temp_attentions"`` and sorts both resulting lists by the name
    stored in the first slot of each entry.

    Returns:
        A pair ``(attns, spatial_transformers)``. Each list holds two-item
        ``[name, module]`` entries in ascending name order.
    """
    attns, spatial_transformers = self.get_self_attns(
        include="attentions", exclude="temp_attentions"
    )

    # Sort on the name only. Comparing whole ``[name, module]`` entries would
    # fall back to comparing the module objects on a name tie and raise
    # ``TypeError``; for unique names the resulting order is unchanged.
    def entry_name(entry):
        return entry[0]

    return (
        sorted(attns, key=entry_name),
        sorted(spatial_transformers, key=entry_name),
    )
|
|
|
def get_self_attns(
    self, include: Optional[str] = None, exclude: Optional[str] = None
) -> Tuple[List[List[Any]], List[List[Any]]]:
    r"""Collect the ``attn1`` layers of the model's ``BasicTransformerBlock`` modules.

    The module tree is walked depth-first through ``named_children()``. A
    module is collected when it is a ``BasicTransformerBlock`` with an
    ``attn1`` attribute and its dotted name passes both name filters.

    Args:
        include: When given, only blocks whose dotted name contains this
            substring are collected.
        exclude: When given, blocks whose dotted name contains this substring
            are skipped.

    Returns:
        A pair ``(attns, spatial_transformers)`` of parallel lists in traversal
        order. ``attns`` holds ``[f"{name}.attn1", block.attn1]`` entries and
        ``spatial_transformers`` holds ``[name, block]`` entries.
    """
    attns = []
    spatial_transformers = []

    def collect(name, module):
        if isinstance(module, BasicTransformerBlock) and hasattr(module, "attn1"):
            # Both filters have to pass. The earlier code let the `exclude`
            # test overwrite the outcome of the `include` test, so `include`
            # was ignored whenever `exclude` was supplied as well.
            included = include is None or include in name
            excluded = exclude is not None and exclude in name
            if included and not excluded:
                # Entries stay two-item lists, as callers received before.
                attns.append([f"{name}.attn1", module.attn1])
                spatial_transformers.append([name, module])
        # Children are visited whether or not this module was collected.
        for sub_name, child in module.named_children():
            collect(f"{name}.{sub_name}", child)

    for name, module in self.named_children():
        collect(name, module)

    return attns, spatial_transformers
|
|
|
|
|
class ReferenceNet3D(UNet3DConditionModel):
    """Reference network derived from ``UNet3DConditionModel``.

    Inherits ``UNet3DConditionModel`` without adding or overriding anything;
    it is used to extract intermediate embeddings for later use.
    """
|
|