"""该模型是自定义的attn_processor,实现特殊功能的 Attn功能。 |
|
相对而言,开源代码经常会重新定义Attention 类, |
|
|
|
This module implements special AttnProcessor function with custom attn_processor class. |
|
While other open source code always modify Attention class. |
|
""" |
|
|
|
from __future__ import annotations |
|
|
|
import time |
|
from typing import Any, Callable, Optional |
|
import logging |
|
|
|
from einops import rearrange, repeat |
|
import torch |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
import xformers |
|
from diffusers.models.lora import LoRACompatibleLinear |
|
|
|
from diffusers.utils.torch_utils import maybe_allow_in_graph |
|
from diffusers.models.attention_processor import ( |
|
Attention as DiffusersAttention, |
|
AttnProcessor, |
|
AttnProcessor2_0, |
|
) |
|
from ..data.data_util import ( |
|
batch_concat_two_tensor_with_index, |
|
batch_index_select, |
|
align_repeat_tensor_single_dim, |
|
batch_adain_conditioned_tensor, |
|
) |
|
|
|
from . import Model_Register |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
@maybe_allow_in_graph |
|
class IPAttention(DiffusersAttention): |
|
r""" |
|
Modified Attention class which has special layer, like ip_apadapter_to_k, ip_apadapter_to_v, |
|
""" |
|
|
|
def __init__( |
|
self, |
|
query_dim: int, |
|
cross_attention_dim: int | None = None, |
|
heads: int = 8, |
|
dim_head: int = 64, |
|
dropout: float = 0, |
|
bias=False, |
|
upcast_attention: bool = False, |
|
upcast_softmax: bool = False, |
|
cross_attention_norm: str | None = None, |
|
cross_attention_norm_num_groups: int = 32, |
|
added_kv_proj_dim: int | None = None, |
|
norm_num_groups: int | None = None, |
|
spatial_norm_dim: int | None = None, |
|
out_bias: bool = True, |
|
scale_qk: bool = True, |
|
only_cross_attention: bool = False, |
|
eps: float = 0.00001, |
|
rescale_output_factor: float = 1, |
|
residual_connection: bool = False, |
|
_from_deprecated_attn_block=False, |
|
processor: AttnProcessor | None = None, |
|
cross_attn_temporal_cond: bool = False, |
|
image_scale: float = 1.0, |
|
ip_adapter_dim: int = None, |
|
need_t2i_facein: bool = False, |
|
facein_dim: int = None, |
|
need_t2i_ip_adapter_face: bool = False, |
|
ip_adapter_face_dim: int = None, |
|
): |
|
super().__init__( |
|
query_dim, |
|
cross_attention_dim, |
|
heads, |
|
dim_head, |
|
dropout, |
|
bias, |
|
upcast_attention, |
|
upcast_softmax, |
|
cross_attention_norm, |
|
cross_attention_norm_num_groups, |
|
added_kv_proj_dim, |
|
norm_num_groups, |
|
spatial_norm_dim, |
|
out_bias, |
|
scale_qk, |
|
only_cross_attention, |
|
eps, |
|
rescale_output_factor, |
|
residual_connection, |
|
_from_deprecated_attn_block, |
|
processor, |
|
) |
|
self.cross_attn_temporal_cond = cross_attn_temporal_cond |
|
self.image_scale = image_scale |
|
|
|
|
|
if cross_attn_temporal_cond: |
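            # Extra projections that map IP-Adapter image embeddings (ip_adapter_dim)
            # into this attention's key/value space (query_dim).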
|
self.to_k_ip = LoRACompatibleLinear(ip_adapter_dim, query_dim, bias=False) |
|
self.to_v_ip = LoRACompatibleLinear(ip_adapter_dim, query_dim, bias=False) |
|
|
|
self.need_t2i_facein = need_t2i_facein |
|
self.facein_dim = facein_dim |
|
if need_t2i_facein: |
|
raise NotImplementedError("facein") |
|
|
|
|
|
self.need_t2i_ip_adapter_face = need_t2i_ip_adapter_face |
|
self.ip_adapter_face_dim = ip_adapter_face_dim |
|
if need_t2i_ip_adapter_face: |
|
self.ip_adapter_face_to_k_ip = LoRACompatibleLinear( |
|
ip_adapter_face_dim, query_dim, bias=False |
|
) |
|
self.ip_adapter_face_to_v_ip = LoRACompatibleLinear( |
|
ip_adapter_face_dim, query_dim, bias=False |
|
) |
|
|
|
def set_use_memory_efficient_attention_xformers( |
|
self, |
|
use_memory_efficient_attention_xformers: bool, |
|
attention_op: Callable[..., Any] | None = None, |
|
): |
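        # Processors whose class name contains "XFormers" or "IP" already run
        # xformers memory-efficient attention themselves, so do not wrap them again.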
|
if ( |
|
"XFormers" in self.processor.__class__.__name__ |
|
or "IP" in self.processor.__class__.__name__ |
|
): |
|
pass |
|
else: |
|
return super().set_use_memory_efficient_attention_xformers( |
|
use_memory_efficient_attention_xformers, attention_op |
|
) |
|
|
|
|
|
@Model_Register.register |
|
class BaseIPAttnProcessor(nn.Module): |
|
print_idx = 0 |
|
|
|
def __init__(self, *args, **kwargs) -> None: |
|
super().__init__(*args, **kwargs) |
|
|
|
|
|
@Model_Register.register |
|
class T2IReferencenetIPAdapterXFormersAttnProcessor(BaseIPAttnProcessor): |
|
r""" |
|
面向 ref_image的 self_attn的 IPAdapter |
|
""" |
|
print_idx = 0 |
|
|
|
def __init__( |
|
self, |
|
attention_op: Optional[Callable] = None, |
|
): |
|
super().__init__() |
|
|
|
self.attention_op = attention_op |
|
|
|
def __call__( |
|
self, |
|
attn: IPAttention, |
|
hidden_states: torch.FloatTensor, |
|
encoder_hidden_states: Optional[torch.FloatTensor] = None, |
|
attention_mask: Optional[torch.FloatTensor] = None, |
|
temb: Optional[torch.FloatTensor] = None, |
|
scale: float = 1.0, |
|
num_frames: int = None, |
|
sample_index: torch.LongTensor = None, |
|
vision_conditon_frames_sample_index: torch.LongTensor = None, |
|
refer_emb: torch.Tensor = None, |
|
vision_clip_emb: torch.Tensor = None, |
|
ip_adapter_scale: float = 1.0, |
|
face_emb: torch.Tensor = None, |
|
facein_scale: float = 1.0, |
|
ip_adapter_face_emb: torch.Tensor = None, |
|
ip_adapter_face_scale: float = 1.0, |
|
do_classifier_free_guidance: bool = False, |
|
): |
|
residual = hidden_states |
|
|
|
if attn.spatial_norm is not None: |
|
hidden_states = attn.spatial_norm(hidden_states, temb) |
|
|
|
input_ndim = hidden_states.ndim |
|
|
|
if input_ndim == 4: |
|
batch_size, channel, height, width = hidden_states.shape |
|
hidden_states = hidden_states.view( |
|
batch_size, channel, height * width |
|
).transpose(1, 2) |
|
|
|
batch_size, key_tokens, _ = ( |
|
hidden_states.shape |
|
if encoder_hidden_states is None |
|
else encoder_hidden_states.shape |
|
) |
|
|
|
attention_mask = attn.prepare_attention_mask( |
|
attention_mask, key_tokens, batch_size |
|
) |
|
if attention_mask is not None: |
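            # xformers does not broadcast the singleton query dimension of the mask,
            # so expand it here from (batch*heads, 1, key_tokens) to
            # (batch*heads, query_tokens, key_tokens).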
|
|
|
|
|
|
|
|
|
|
|
|
|
_, query_tokens, _ = hidden_states.shape |
|
attention_mask = attention_mask.expand(-1, query_tokens, -1) |
|
|
|
if attn.group_norm is not None: |
|
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose( |
|
1, 2 |
|
) |
|
|
|
query = attn.to_q(hidden_states, scale=scale) |
|
|
|
if encoder_hidden_states is None: |
|
encoder_hidden_states = hidden_states |
|
elif attn.norm_cross: |
|
encoder_hidden_states = attn.norm_encoder_hidden_states( |
|
encoder_hidden_states |
|
) |
|
encoder_hidden_states = align_repeat_tensor_single_dim( |
|
encoder_hidden_states, target_length=hidden_states.shape[0], dim=0 |
|
) |
|
key = attn.to_k(encoder_hidden_states, scale=scale) |
|
value = attn.to_v(encoder_hidden_states, scale=scale) |
|
|
|
|
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"T2IReferencenetIPAdapterXFormersAttnProcessor,type(face_emb)={type(face_emb)}, facein_scale={facein_scale}" |
|
) |
|
if facein_scale > 0 and face_emb is not None: |
|
raise NotImplementedError("facein") |
|
|
|
query = attn.head_to_batch_dim(query).contiguous() |
|
key = attn.head_to_batch_dim(key).contiguous() |
|
value = attn.head_to_batch_dim(value).contiguous() |
|
hidden_states = xformers.ops.memory_efficient_attention( |
|
query, |
|
key, |
|
value, |
|
attn_bias=attention_mask, |
|
op=self.attention_op, |
|
scale=attn.scale, |
|
) |
|
|
|
|
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"T2IReferencenetIPAdapterXFormersAttnProcessor,type(vision_clip_emb)={type(vision_clip_emb)}" |
|
) |
|
if ip_adapter_scale > 0 and vision_clip_emb is not None: |
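            # IP-Adapter branch: project the image embeddings to key/value, attend to
            # them with the same query, and add the result scaled by ip_adapter_scale.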
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"T2I cross_attn, ipadapter, vision_clip_emb={vision_clip_emb.shape}, hidden_states={hidden_states.shape}, batch_size={batch_size}" |
|
) |
|
ip_key = attn.to_k_ip(vision_clip_emb) |
|
ip_value = attn.to_v_ip(vision_clip_emb) |
|
ip_key = align_repeat_tensor_single_dim( |
|
ip_key, target_length=batch_size, dim=0 |
|
) |
|
ip_value = align_repeat_tensor_single_dim( |
|
ip_value, target_length=batch_size, dim=0 |
|
) |
|
ip_key = attn.head_to_batch_dim(ip_key).contiguous() |
|
ip_value = attn.head_to_batch_dim(ip_value).contiguous() |
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"query={query.shape}, ip_key={ip_key.shape}, ip_value={ip_value.shape}" |
|
) |
|
|
|
hidden_states_from_ip = xformers.ops.memory_efficient_attention( |
|
query, |
|
ip_key, |
|
ip_value, |
|
attn_bias=attention_mask, |
|
op=self.attention_op, |
|
scale=attn.scale, |
|
) |
|
hidden_states = hidden_states + ip_adapter_scale * hidden_states_from_ip |
|
|
|
|
|
|
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"T2IReferencenetIPAdapterXFormersAttnProcessor,type(ip_adapter_face_emb)={type(ip_adapter_face_emb)}" |
|
) |
|
if ip_adapter_face_scale > 0 and ip_adapter_face_emb is not None: |
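            # IP-Adapter-Face branch: same pattern as above, using the face-embedding
            # projections and ip_adapter_face_scale.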
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"T2I cross_attn, ipadapter face, ip_adapter_face_emb={vision_clip_emb.shape}, hidden_states={hidden_states.shape}, batch_size={batch_size}" |
|
) |
|
ip_key = attn.ip_adapter_face_to_k_ip(ip_adapter_face_emb) |
|
ip_value = attn.ip_adapter_face_to_v_ip(ip_adapter_face_emb) |
|
ip_key = align_repeat_tensor_single_dim( |
|
ip_key, target_length=batch_size, dim=0 |
|
) |
|
ip_value = align_repeat_tensor_single_dim( |
|
ip_value, target_length=batch_size, dim=0 |
|
) |
|
ip_key = attn.head_to_batch_dim(ip_key).contiguous() |
|
ip_value = attn.head_to_batch_dim(ip_value).contiguous() |
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"query={query.shape}, ip_key={ip_key.shape}, ip_value={ip_value.shape}" |
|
) |
|
|
|
hidden_states_from_ip = xformers.ops.memory_efficient_attention( |
|
query, |
|
ip_key, |
|
ip_value, |
|
attn_bias=attention_mask, |
|
op=self.attention_op, |
|
scale=attn.scale, |
|
) |
|
hidden_states = ( |
|
hidden_states + ip_adapter_face_scale * hidden_states_from_ip |
|
) |
|
|
|
|
|
hidden_states = hidden_states.to(query.dtype) |
|
hidden_states = attn.batch_to_head_dim(hidden_states) |
|
|
|
|
|
hidden_states = attn.to_out[0](hidden_states, scale=scale) |
|
|
|
hidden_states = attn.to_out[1](hidden_states) |
|
|
|
if input_ndim == 4: |
|
hidden_states = hidden_states.transpose(-1, -2).reshape( |
|
batch_size, channel, height, width |
|
) |
|
|
|
if attn.residual_connection: |
|
hidden_states = hidden_states + residual |
|
|
|
hidden_states = hidden_states / attn.rescale_output_factor |
|
self.print_idx += 1 |
|
return hidden_states |
|
|
|
|
|
@Model_Register.register |
|
class NonParamT2ISelfReferenceXFormersAttnProcessor(BaseIPAttnProcessor): |
|
r""" |
|
面向首帧的 referenceonly attn,适用于 T2I的 self_attn |
|
referenceonly with vis_cond as key, value, in t2i self_attn. |
|
""" |
|
print_idx = 0 |
|
|
|
def __init__( |
|
self, |
|
attention_op: Optional[Callable] = None, |
|
): |
|
super().__init__() |
|
|
|
self.attention_op = attention_op |
|
|
|
def __call__( |
|
self, |
|
attn: IPAttention, |
|
hidden_states: torch.FloatTensor, |
|
encoder_hidden_states: Optional[torch.FloatTensor] = None, |
|
attention_mask: Optional[torch.FloatTensor] = None, |
|
temb: Optional[torch.FloatTensor] = None, |
|
scale: float = 1.0, |
|
num_frames: int = None, |
|
sample_index: torch.LongTensor = None, |
|
vision_conditon_frames_sample_index: torch.LongTensor = None, |
|
refer_emb: torch.Tensor = None, |
|
face_emb: torch.Tensor = None, |
|
vision_clip_emb: torch.Tensor = None, |
|
ip_adapter_scale: float = 1.0, |
|
facein_scale: float = 1.0, |
|
ip_adapter_face_emb: torch.Tensor = None, |
|
ip_adapter_face_scale: float = 1.0, |
|
do_classifier_free_guidance: bool = False, |
|
): |
|
residual = hidden_states |
|
|
|
if attn.spatial_norm is not None: |
|
hidden_states = attn.spatial_norm(hidden_states, temb) |
|
|
|
input_ndim = hidden_states.ndim |
|
|
|
if input_ndim == 4: |
|
batch_size, channel, height, width = hidden_states.shape |
|
hidden_states = hidden_states.view( |
|
batch_size, channel, height * width |
|
).transpose(1, 2) |
|
|
|
batch_size, key_tokens, _ = ( |
|
hidden_states.shape |
|
if encoder_hidden_states is None |
|
else encoder_hidden_states.shape |
|
) |
|
|
|
attention_mask = attn.prepare_attention_mask( |
|
attention_mask, key_tokens, batch_size |
|
) |
|
if attention_mask is not None: |
|
|
|
|
|
|
|
|
|
|
|
|
|
_, query_tokens, _ = hidden_states.shape |
|
attention_mask = attention_mask.expand(-1, query_tokens, -1) |
|
|
|
|
|
if ( |
|
vision_conditon_frames_sample_index is not None and num_frames > 1 |
|
) or refer_emb is not None: |
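            # Build reference tokens: reshape the per-frame latents to (b, t, hw, c),
            # gather the vision-condition frames and/or the referencenet emb, and
            # append them along the token dimension so every frame attends to them.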
|
batchsize_timesize = hidden_states.shape[0] |
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"NonParamT2ISelfReferenceXFormersAttnProcessor 0, hidden_states={hidden_states.shape}, vision_conditon_frames_sample_index={vision_conditon_frames_sample_index}" |
|
) |
|
encoder_hidden_states = rearrange( |
|
hidden_states, "(b t) hw c -> b t hw c", t=num_frames |
|
) |
|
|
|
if vision_conditon_frames_sample_index is not None and num_frames > 1: |
|
ip_hidden_states = batch_index_select( |
|
encoder_hidden_states, |
|
dim=1, |
|
index=vision_conditon_frames_sample_index, |
|
).contiguous() |
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"NonParamT2ISelfReferenceXFormersAttnProcessor 1, vis_cond referenceonly, encoder_hidden_states={encoder_hidden_states.shape}, ip_hidden_states={ip_hidden_states.shape}" |
|
) |
|
|
|
ip_hidden_states = rearrange( |
|
ip_hidden_states, "b t hw c -> b 1 (t hw) c" |
|
) |
|
ip_hidden_states = align_repeat_tensor_single_dim( |
|
ip_hidden_states, |
|
dim=1, |
|
target_length=num_frames, |
|
) |
|
|
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"NonParamT2ISelfReferenceXFormersAttnProcessor 2, vis_cond referenceonly, encoder_hidden_states={encoder_hidden_states.shape}, ip_hidden_states={ip_hidden_states.shape}" |
|
) |
|
encoder_hidden_states = torch.concat( |
|
[encoder_hidden_states, ip_hidden_states], dim=2 |
|
) |
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"NonParamT2ISelfReferenceXFormersAttnProcessor 3, hidden_states={hidden_states.shape}, ip_hidden_states={ip_hidden_states.shape}" |
|
) |
|
|
|
if refer_emb is not None: |
|
refer_emb = rearrange(refer_emb, "b c t h w->b 1 (t h w) c") |
|
refer_emb = align_repeat_tensor_single_dim( |
|
refer_emb, target_length=num_frames, dim=1 |
|
) |
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"NonParamT2ISelfReferenceXFormersAttnProcessor4, referencenet, encoder_hidden_states={encoder_hidden_states.shape}, refer_emb={refer_emb.shape}" |
|
) |
|
encoder_hidden_states = torch.concat( |
|
[encoder_hidden_states, refer_emb], dim=2 |
|
) |
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"NonParamT2ISelfReferenceXFormersAttnProcessor5, referencenet, encoder_hidden_states={encoder_hidden_states.shape}, refer_emb={refer_emb.shape}" |
|
) |
|
encoder_hidden_states = rearrange( |
|
encoder_hidden_states, "b t hw c -> (b t) hw c" |
|
) |
|
|
|
|
|
if attn.group_norm is not None: |
|
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose( |
|
1, 2 |
|
) |
|
|
|
query = attn.to_q(hidden_states, scale=scale) |
|
|
|
if encoder_hidden_states is None: |
|
encoder_hidden_states = hidden_states |
|
elif attn.norm_cross: |
|
encoder_hidden_states = attn.norm_encoder_hidden_states( |
|
encoder_hidden_states |
|
) |
|
encoder_hidden_states = align_repeat_tensor_single_dim( |
|
encoder_hidden_states, target_length=hidden_states.shape[0], dim=0 |
|
) |
|
key = attn.to_k(encoder_hidden_states, scale=scale) |
|
value = attn.to_v(encoder_hidden_states, scale=scale) |
|
|
|
query = attn.head_to_batch_dim(query).contiguous() |
|
key = attn.head_to_batch_dim(key).contiguous() |
|
value = attn.head_to_batch_dim(value).contiguous() |
|
|
|
hidden_states = xformers.ops.memory_efficient_attention( |
|
query, |
|
key, |
|
value, |
|
attn_bias=attention_mask, |
|
op=self.attention_op, |
|
scale=attn.scale, |
|
) |
|
hidden_states = hidden_states.to(query.dtype) |
|
hidden_states = attn.batch_to_head_dim(hidden_states) |
|
|
|
|
|
hidden_states = attn.to_out[0](hidden_states, scale=scale) |
|
|
|
hidden_states = attn.to_out[1](hidden_states) |
|
|
|
if input_ndim == 4: |
|
hidden_states = hidden_states.transpose(-1, -2).reshape( |
|
batch_size, channel, height, width |
|
) |
|
|
|
if attn.residual_connection: |
|
hidden_states = hidden_states + residual |
|
|
|
hidden_states = hidden_states / attn.rescale_output_factor |
|
self.print_idx += 1 |
|
|
|
return hidden_states |
|
|
|
|
|
@Model_Register.register |
|
class NonParamReferenceIPXFormersAttnProcessor( |
|
NonParamT2ISelfReferenceXFormersAttnProcessor |
|
): |
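    """Behaves the same as NonParamT2ISelfReferenceXFormersAttnProcessor; registered under a separate name."""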
|
def __init__(self, attention_op: Callable[..., Any] | None = None): |
|
super().__init__(attention_op) |
|
|
|
|
|
@maybe_allow_in_graph |
|
class ReferEmbFuseAttention(IPAttention): |
|
"""使用 attention 融合 refernet 中的 emb 到 unet 对应的 latens 中 |
|
# TODO: 目前只支持 bt hw c 的融合,后续考虑增加对 视频 bhw t c、b thw c的融合 |
|
residual_connection: bool = True, 默认, 从不产生影响开始学习 |
|
|
|
use attention to fuse referencenet emb into unet latents |
|
# TODO: by now, only support bt hw c, later consider to support bhw t c, b thw c |
|
residual_connection: bool = True, default, start from no effect |
|
|
|
Args: |
|
IPAttention (_type_): _description_ |
|
""" |
|
|
|
print_idx = 0 |
|
|
|
def __init__( |
|
self, |
|
query_dim: int, |
|
cross_attention_dim: int | None = None, |
|
heads: int = 8, |
|
dim_head: int = 64, |
|
dropout: float = 0, |
|
bias=False, |
|
upcast_attention: bool = False, |
|
upcast_softmax: bool = False, |
|
cross_attention_norm: str | None = None, |
|
cross_attention_norm_num_groups: int = 32, |
|
added_kv_proj_dim: int | None = None, |
|
norm_num_groups: int | None = None, |
|
spatial_norm_dim: int | None = None, |
|
out_bias: bool = True, |
|
scale_qk: bool = True, |
|
only_cross_attention: bool = False, |
|
eps: float = 0.00001, |
|
rescale_output_factor: float = 1, |
|
residual_connection: bool = True, |
|
_from_deprecated_attn_block=False, |
|
processor: AttnProcessor | None = None, |
|
cross_attn_temporal_cond: bool = False, |
|
image_scale: float = 1, |
|
): |
|
super().__init__( |
|
query_dim, |
|
cross_attention_dim, |
|
heads, |
|
dim_head, |
|
dropout, |
|
bias, |
|
upcast_attention, |
|
upcast_softmax, |
|
cross_attention_norm, |
|
cross_attention_norm_num_groups, |
|
added_kv_proj_dim, |
|
norm_num_groups, |
|
spatial_norm_dim, |
|
out_bias, |
|
scale_qk, |
|
only_cross_attention, |
|
eps, |
|
rescale_output_factor, |
|
residual_connection, |
|
_from_deprecated_attn_block, |
|
processor, |
|
cross_attn_temporal_cond, |
|
image_scale, |
|
) |
|
self.processor = None |
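        # This class implements its own forward, so no AttnProcessor is needed.
        # Zero-init the output projection so the attention branch contributes nothing
        # at initialization; with residual_connection=True the module starts out as an
        # identity mapping and learns the fusion during training.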
|
|
|
nn.init.zeros_(self.to_out[0].weight) |
|
nn.init.zeros_(self.to_out[0].bias) |
|
|
|
def forward( |
|
self, |
|
hidden_states: torch.FloatTensor, |
|
encoder_hidden_states: Optional[torch.FloatTensor] = None, |
|
attention_mask: Optional[torch.FloatTensor] = None, |
|
temb: Optional[torch.FloatTensor] = None, |
|
scale: float = 1.0, |
|
num_frames: int = None, |
|
) -> torch.Tensor: |
|
"""fuse referencenet emb b c t2 h2 w2 into unet latents b c t1 h1 w1 with attn |
|
refer to musev/models/attention_processor.py::NonParamT2ISelfReferenceXFormersAttnProcessor |
|
|
|
Args: |
|
hidden_states (torch.FloatTensor): unet latents, (b t1) c h1 w1 |
|
encoder_hidden_states (Optional[torch.FloatTensor], optional): referencenet emb b c2 t2 h2 w2. Defaults to None. |
|
attention_mask (Optional[torch.FloatTensor], optional): _description_. Defaults to None. |
|
temb (Optional[torch.FloatTensor], optional): _description_. Defaults to None. |
|
scale (float, optional): _description_. Defaults to 1.0. |
|
num_frames (int, optional): _description_. Defaults to None. |
|
|
|
Returns: |
|
torch.Tensor: _description_ |
|
""" |
|
residual = hidden_states |
|
|
|
hidden_states = rearrange( |
|
hidden_states, "(b t) c h w -> b c t h w", t=num_frames |
|
) |
|
batch_size, channel, t1, height, width = hidden_states.shape |
|
if self.print_idx == 0: |
|
logger.debug( |
|
f"hidden_states={hidden_states.shape},encoder_hidden_states={encoder_hidden_states.shape}" |
|
) |
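        # Key/value source = [referencenet tokens, latent tokens]: flatten the reference
        # emb to tokens, repeat it for each of the t1 frames, and concatenate with the
        # per-frame latent tokens.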
|
|
|
encoder_hidden_states = rearrange( |
|
encoder_hidden_states, " b c t2 h w-> b (t2 h w) c" |
|
) |
|
encoder_hidden_states = repeat( |
|
encoder_hidden_states, " b t2hw c -> (b t) t2hw c", t=t1 |
|
) |
|
hidden_states = rearrange(hidden_states, " b c t h w-> (b t) (h w) c") |
|
|
|
encoder_hidden_states = torch.concat( |
|
[encoder_hidden_states, hidden_states], dim=1 |
|
) |
|
|
|
|
|
|
|
|
|
|
|
if self.spatial_norm is not None: |
|
hidden_states = self.spatial_norm(hidden_states, temb) |
|
|
|
_, key_tokens, _ = ( |
|
hidden_states.shape |
|
if encoder_hidden_states is None |
|
else encoder_hidden_states.shape |
|
) |
|
|
|
attention_mask = self.prepare_attention_mask( |
|
attention_mask, key_tokens, batch_size |
|
) |
|
if attention_mask is not None: |
|
|
|
|
|
|
|
|
|
|
|
|
|
_, query_tokens, _ = hidden_states.shape |
|
attention_mask = attention_mask.expand(-1, query_tokens, -1) |
|
|
|
if self.group_norm is not None: |
|
hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose( |
|
1, 2 |
|
) |
|
|
|
query = self.to_q(hidden_states, scale=scale) |
|
|
|
if encoder_hidden_states is None: |
|
encoder_hidden_states = hidden_states |
|
elif self.norm_cross: |
|
encoder_hidden_states = self.norm_encoder_hidden_states( |
|
encoder_hidden_states |
|
) |
|
|
|
key = self.to_k(encoder_hidden_states, scale=scale) |
|
value = self.to_v(encoder_hidden_states, scale=scale) |
|
|
|
query = self.head_to_batch_dim(query).contiguous() |
|
key = self.head_to_batch_dim(key).contiguous() |
|
value = self.head_to_batch_dim(value).contiguous() |
|
|
|
|
|
|
|
hidden_states = xformers.ops.memory_efficient_attention( |
|
query, |
|
key, |
|
value, |
|
attn_bias=attention_mask, |
|
scale=self.scale, |
|
) |
|
hidden_states = hidden_states.to(query.dtype) |
|
hidden_states = self.batch_to_head_dim(hidden_states) |
|
|
|
|
|
hidden_states = self.to_out[0](hidden_states, scale=scale) |
|
|
|
hidden_states = self.to_out[1](hidden_states) |
|
|
|
hidden_states = rearrange( |
|
hidden_states, |
|
"bt (h w) c-> bt c h w", |
|
h=height, |
|
w=width, |
|
) |
|
if self.residual_connection: |
|
hidden_states = hidden_states + residual |
|
|
|
hidden_states = hidden_states / self.rescale_output_factor |
|
self.print_idx += 1 |
|
return hidden_states |
|
|