Spaces:

ML-INTA
/

ML-Image

Sleeping

App Files Files Community

ML-Image / diffusers /pipelines /stable_diffusion /pipeline_stable_diffusion_pix2pix_zero.py

ML-INTA

Upload 358 files

7f43c1b over 1 year ago

raw

history blame contribute delete

58.4 kB

	# Copyright 2023 Pix2Pix Zero Authors and The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import inspect
	from dataclasses import dataclass
	from typing import Any, Callable, Dict, List, Optional, Union

	import numpy as np
	import PIL
	import torch
	import torch.nn.functional as F
	from transformers import (
	BlipForConditionalGeneration,
	BlipProcessor,
	CLIPFeatureExtractor,
	CLIPTextModel,
	CLIPTokenizer,
	)

	from ...models import AutoencoderKL, UNet2DConditionModel
	from ...models.cross_attention import CrossAttention
	from ...schedulers import DDIMScheduler, DDPMScheduler, EulerAncestralDiscreteScheduler, LMSDiscreteScheduler
	from ...schedulers.scheduling_ddim_inverse import DDIMInverseScheduler
	from ...utils import (
	PIL_INTERPOLATION,
	BaseOutput,
	is_accelerate_available,
	is_accelerate_version,
	logging,
	randn_tensor,
	replace_example_docstring,
	)
	from ..pipeline_utils import DiffusionPipeline
	from . import StableDiffusionPipelineOutput
	from .safety_checker import StableDiffusionSafetyChecker


	# Module-level logger, created through the project's `logging` helper and keyed by this module's name.
	logger = logging.get_logger(__name__) # pylint: disable=invalid-name


	@dataclass
	class Pix2PixInversionPipelineOutput(BaseOutput):
	    """
	    Output container that carries inverted latents together with the corresponding images.

	    NOTE(review): presumably returned by the pipeline's inversion entry point, which is not part of this
	    excerpt -- confirm against the rest of the file.

	    Args:
	        latents (`torch.FloatTensor`)
	            inverted latents tensor
	        images (`List[PIL.Image.Image]` or `np.ndarray`)
	            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
	            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
	    """

	    latents: torch.FloatTensor
	    images: Union[List[PIL.Image.Image], np.ndarray]


	# Usage example for editing with pre-computed source/target concept embeddings.
	# It is handed to `replace_example_docstring` on the pipeline's `__call__`, so the text below is
	# runtime data and must not be edited casually.
	EXAMPLE_DOC_STRING = """
	Examples:
	```py
	>>> import requests
	>>> import torch

	>>> from diffusers import DDIMScheduler, StableDiffusionPix2PixZeroPipeline


	>>> def download(embedding_url, local_filepath):
	... r = requests.get(embedding_url)
	... with open(local_filepath, "wb") as f:
	... f.write(r.content)


	>>> model_ckpt = "CompVis/stable-diffusion-v1-4"
	>>> pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(model_ckpt, torch_dtype=torch.float16)
	>>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
	>>> pipeline.to("cuda")

	>>> prompt = "a high resolution painting of a cat in the style of van gough"
	>>> source_emb_url = "https://hf.co/datasets/sayakpaul/sample-datasets/resolve/main/cat.pt"
	>>> target_emb_url = "https://hf.co/datasets/sayakpaul/sample-datasets/resolve/main/dog.pt"

	>>> for url in [source_emb_url, target_emb_url]:
	... download(url, url.split("/")[-1])

	>>> src_embeds = torch.load(source_emb_url.split("/")[-1])
	>>> target_embeds = torch.load(target_emb_url.split("/")[-1])
	>>> images = pipeline(
	... prompt,
	... source_embeds=src_embeds,
	... target_embeds=target_embeds,
	... num_inference_steps=50,
	... cross_attention_guidance_amount=0.15,
	... ).images

	>>> images[0].save("edited_image_dog.png")
	```
	"""

	# Usage example for the caption -> invert -> edit workflow on a real image.
	# NOTE(review): presumably attached to the inversion method further down the file -- confirm.
	# The example now defines `generator` before passing it to the pipeline; previously the snippet
	# referenced an undefined name and failed with NameError when copied verbatim.
	EXAMPLE_INVERT_DOC_STRING = """
	Examples:
	```py
	>>> import torch
	>>> from transformers import BlipForConditionalGeneration, BlipProcessor
	>>> from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionPix2PixZeroPipeline

	>>> import requests
	>>> from PIL import Image

	>>> captioner_id = "Salesforce/blip-image-captioning-base"
	>>> processor = BlipProcessor.from_pretrained(captioner_id)
	>>> model = BlipForConditionalGeneration.from_pretrained(
	... captioner_id, torch_dtype=torch.float16, low_cpu_mem_usage=True
	... )

	>>> sd_model_ckpt = "CompVis/stable-diffusion-v1-4"
	>>> pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(
	... sd_model_ckpt,
	... caption_generator=model,
	... caption_processor=processor,
	... torch_dtype=torch.float16,
	... safety_checker=None,
	... )

	>>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
	>>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)
	>>> pipeline.enable_model_cpu_offload()

	>>> img_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/test_images/cats/cat_6.png"

	>>> raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB").resize((512, 512))
	>>> # generate caption
	>>> caption = pipeline.generate_caption(raw_image)

	>>> # "a photography of a cat with flowers and dai dai daie - daie - daie kasaii"
	>>> inv_latents = pipeline.invert(caption, image=raw_image).latents
	>>> # we need to generate source and target embeds

	>>> source_prompts = ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"]

	>>> target_prompts = ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"]

	>>> source_embeds = pipeline.get_embeds(source_prompts)
	>>> target_embeds = pipeline.get_embeds(target_prompts)
	>>> # the latents can then be used to edit a real image

	>>> generator = torch.manual_seed(0)
	>>> image = pipeline(
	... caption,
	... source_embeds=source_embeds,
	... target_embeds=target_embeds,
	... num_inference_steps=50,
	... cross_attention_guidance_amount=0.15,
	... generator=generator,
	... latents=inv_latents,
	... negative_prompt=caption,
	... ).images[0]
	>>> image.save("edited_image.png")
	```
	"""


	# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
	def preprocess(image):
	    """Convert `image` into a torch tensor batch.

	    * A `torch.Tensor` is returned untouched.
	    * A single `PIL.Image.Image` is treated as a one-element list.
	    * A list of PIL images is resized so that width and height (taken from the first image) are
	      multiples of 8, stacked, scaled from `[0, 255]` to `[-1, 1]` and laid out channel-first.
	    * A list of tensors is concatenated along dim 0.

	    Any other list content is returned as-is.
	    """
	    if isinstance(image, torch.Tensor):
	        return image
	    if isinstance(image, PIL.Image.Image):
	        image = [image]

	    first = image[0]
	    if isinstance(first, PIL.Image.Image):
	        width, height = first.size
	        # round both sides down to an integer multiple of 8
	        width, height = width - width % 8, height - height % 8

	        frames = []
	        for pil_image in image:
	            resized = pil_image.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
	            frames.append(np.array(resized)[None, :])
	        batch = np.concatenate(frames, axis=0)
	        batch = np.array(batch).astype(np.float32) / 255.0
	        batch = batch.transpose(0, 3, 1, 2)
	        batch = 2.0 * batch - 1.0
	        image = torch.from_numpy(batch)
	    elif isinstance(first, torch.Tensor):
	        image = torch.cat(image, dim=0)
	    return image


	def prepare_unet(unet: UNet2DConditionModel):
	    """Install Pix2Pix Zero attention processors on `unet` and return it.

	    Every entry of `unet.attn_processors` gets a `Pix2PixZeroCrossAttnProcessor`. Processors whose
	    name contains `"attn2"` are created with `is_pix2pix_zero=True` and their owning module has
	    gradients enabled; all others are created with `is_pix2pix_zero=False` and gradients disabled.
	    """
	    processors = {}
	    for processor_name in unet.attn_processors:
	        # the owning attention module is addressed by the processor name minus the ".processor" part
	        owner = unet.get_submodule(processor_name.replace(".processor", ""))
	        tracks_attention = "attn2" in processor_name
	        processors[processor_name] = Pix2PixZeroCrossAttnProcessor(is_pix2pix_zero=tracks_attention)
	        owner.requires_grad_(tracks_attention)

	    unet.set_attn_processor(processors)
	    return unet


	class Pix2PixZeroL2Loss:
	    """Running accumulator of a squared-error loss between two batches of 3-D tensors."""

	    def __init__(self):
	        # starts as a plain float; becomes a tensor after the first `compute_loss` call
	        self.loss = 0.0

	    def compute_loss(self, predictions, targets):
	        """Add the squared error, summed over dims 1 and 2 and averaged over dim 0, to `self.loss`."""
	        squared_error = (predictions - targets).pow(2)
	        self.loss += squared_error.sum(dim=(1, 2)).mean(dim=0)


	class Pix2PixZeroCrossAttnProcessor:
	    """Attention processor that can record attention probabilities per timestep.

	    When built with `is_pix2pix_zero=True` it keeps a `reference_cross_attn_map` dict keyed by
	    `timestep.item()`. A call without `loss` stores the current attention probabilities there; a call
	    with `loss` pops the stored map for that timestep and feeds both maps to `loss.compute_loss`."""

	    def __init__(self, is_pix2pix_zero=False):
	        self.is_pix2pix_zero = is_pix2pix_zero
	        if is_pix2pix_zero:
	            # only tracking processors own a reference store
	            self.reference_cross_attn_map = {}

	    def __call__(
	        self,
	        attn: CrossAttention,
	        hidden_states,
	        encoder_hidden_states=None,
	        attention_mask=None,
	        timestep=None,
	        loss=None,
	    ):
	        batch_size, sequence_length, _ = hidden_states.shape
	        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
	        query = attn.to_q(hidden_states)

	        # without an external context the layer attends over its own hidden states
	        if encoder_hidden_states is None:
	            context = hidden_states
	        elif attn.cross_attention_norm:
	            context = attn.norm_cross(encoder_hidden_states)
	        else:
	            context = encoder_hidden_states

	        key = attn.to_k(context)
	        value = attn.to_v(context)

	        query = attn.head_to_batch_dim(query)
	        key = attn.head_to_batch_dim(key)
	        value = attn.head_to_batch_dim(value)

	        attention_probs = attn.get_attention_scores(query, key, attention_mask)

	        if self.is_pix2pix_zero and timestep is not None:
	            step_key = timestep.item()
	            if loss is None:
	                # recording pass: keep a detached CPU copy for the later guided pass
	                self.reference_cross_attn_map[step_key] = attention_probs.detach().cpu()
	            else:
	                # guided pass: compare against (and discard) the recorded map
	                reference_probs = self.reference_cross_attn_map.pop(step_key)
	                loss.compute_loss(attention_probs, reference_probs.to(attention_probs.device))

	        attended = torch.bmm(attention_probs, value)
	        attended = attn.batch_to_head_dim(attended)

	        # to_out[0] is the linear projection, to_out[1] the dropout
	        projected = attn.to_out[0](attended)
	        return attn.to_out[1](projected)


	class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline):
	r"""
	Pipeline for pixel-level image editing using Pix2Pix Zero. Based on Stable Diffusion.

	This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
	library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

	Args:
	vae ([`AutoencoderKL`]):
	Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
	text_encoder ([`CLIPTextModel`]):
	Frozen text-encoder. Stable Diffusion uses the text portion of
	[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
	the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
	tokenizer (`CLIPTokenizer`):
	Tokenizer of class
	[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
	unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
	scheduler ([`SchedulerMixin`]):
	A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
	[`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`], or [`DDPMScheduler`].
	safety_checker ([`StableDiffusionSafetyChecker`]):
	Classification module that estimates whether generated images could be considered offensive or harmful.
	Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
	feature_extractor ([`CLIPFeatureExtractor`]):
	Model that extracts features from generated images to be used as inputs for the `safety_checker`.
	requires_safety_checker (bool):
	Whether the pipeline requires a safety checker. We recommend setting it to True if you're using the
	pipeline publicly.
	"""
	# Names of registered modules treated as optional.
	# NOTE(review): presumably consumed by `DiffusionPipeline` so these may be omitted / set to None when
	# building or loading the pipeline -- confirm against the base class.
	_optional_components = [
	    "safety_checker",
	    "feature_extractor",
	    "caption_generator",
	    "caption_processor",
	    "inverse_scheduler",
	]

	def __init__(
	    self,
	    vae: AutoencoderKL,
	    text_encoder: CLIPTextModel,
	    tokenizer: CLIPTokenizer,
	    unet: UNet2DConditionModel,
	    scheduler: Union[DDPMScheduler, DDIMScheduler, EulerAncestralDiscreteScheduler, LMSDiscreteScheduler],
	    feature_extractor: CLIPFeatureExtractor,
	    safety_checker: StableDiffusionSafetyChecker,
	    inverse_scheduler: DDIMInverseScheduler,
	    caption_generator: BlipForConditionalGeneration,
	    caption_processor: BlipProcessor,
	    requires_safety_checker: bool = True,
	):
	    """Register all sub-modules and derive `vae_scale_factor`.

	    Logs a warning when `safety_checker` is None while `requires_safety_checker` is True.

	    Raises:
	        ValueError: if a `safety_checker` is given without a `feature_extractor`.
	    """
	    super().__init__()

	    if safety_checker is None and requires_safety_checker:
	        logger.warning(
	            f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
	            " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
	            " results in services or applications open to the public. Both the diffusers team and Hugging Face"
	            " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
	            " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
	            " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
	        )

	    if safety_checker is not None and feature_extractor is None:
	        # f-prefix added: the class name used to be emitted as the literal text "{self.__class__}"
	        raise ValueError(
	            f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
	            " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
	        )

	    self.register_modules(
	        vae=vae,
	        text_encoder=text_encoder,
	        tokenizer=tokenizer,
	        unet=unet,
	        scheduler=scheduler,
	        safety_checker=safety_checker,
	        feature_extractor=feature_extractor,
	        caption_processor=caption_processor,
	        caption_generator=caption_generator,
	        inverse_scheduler=inverse_scheduler,
	    )
	    # one factor of two for every VAE block after the first
	    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
	    self.register_to_config(requires_safety_checker=requires_safety_checker)

	# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
	def enable_sequential_cpu_offload(self, gpu_id=0):
	    r"""
	    Offload `unet`, `text_encoder`, `vae` and, when present, the safety checker through accelerate's
	    `cpu_offload`, using `cuda:{gpu_id}` as the execution device. If the pipeline is not already on CPU
	    it is moved there first and the CUDA cache is emptied.

	    Raises:
	        ImportError: if accelerate is unavailable or older than 0.14.0.
	    """
	    accelerate_ready = is_accelerate_available() and is_accelerate_version(">=", "0.14.0")
	    if not accelerate_ready:
	        raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
	    from accelerate import cpu_offload

	    execution_device = torch.device(f"cuda:{gpu_id}")

	    if self.device.type != "cpu":
	        self.to("cpu", silence_dtype_warnings=True)
	        # otherwise we don't see the memory savings (but they probably exist)
	        torch.cuda.empty_cache()

	    for offloaded in (self.unet, self.text_encoder, self.vae):
	        cpu_offload(offloaded, execution_device)

	    checker = self.safety_checker
	    if checker is not None:
	        cpu_offload(checker, execution_device=execution_device, offload_buffers=True)

	def enable_model_cpu_offload(self, gpu_id=0):
	    r"""
	    Chain the pipeline's models with accelerate's `cpu_offload_with_hook`, using `cuda:{gpu_id}` as the
	    execution device. Each call receives the hook returned for the previous model; the hook of the last
	    model in the chain is stored on `self.final_offload_hook` so it can be offloaded manually.

	    Raises:
	        ImportError: if accelerate is unavailable or older than 0.17.0.
	    """
	    if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
	        from accelerate import cpu_offload_with_hook
	    else:
	        # message now names this method (it used to say `enable_model_offload`, which does not exist)
	        raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")

	    device = torch.device(f"cuda:{gpu_id}")

	    hook = None
	    # NOTE(review): `self.vae` is listed twice on purpose in the original; presumably once for encoding
	    # (inversion) and once for decoding -- confirm before deduplicating.
	    for cpu_offloaded_model in [self.vae, self.text_encoder, self.unet, self.vae]:
	        _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)

	    if self.safety_checker is not None:
	        _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)

	    # We'll offload the last model manually.
	    self.final_offload_hook = hook

	@property
	# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
	def _execution_device(self):
	r"""
	Returns the device on which the pipeline's models will be executed. After calling
	`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
	hooks.
	"""
	if not hasattr(self.unet, "_hf_hook"):
	return self.device
	for module in self.unet.modules():
	if (
	hasattr(module, "_hf_hook")
	and hasattr(module._hf_hook, "execution_device")
	and module._hf_hook.execution_device is not None
	):
	return torch.device(module._hf_hook.execution_device)
	return self.device

	# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
	def _encode_prompt(
	self,
	prompt,
	device,
	num_images_per_prompt,
	do_classifier_free_guidance,
	negative_prompt=None,
	prompt_embeds: Optional[torch.FloatTensor] = None,
	negative_prompt_embeds: Optional[torch.FloatTensor] = None,
	):
	r"""
	Encodes the prompt into text encoder hidden states.

	Args:
	prompt (`str` or `List[str]`, optional):
	prompt to be encoded
	device: (`torch.device`):
	torch device
	num_images_per_prompt (`int`):
	number of images that should be generated per prompt
	do_classifier_free_guidance (`bool`):
	whether to use classifier free guidance or not
	negative_prompt (`str` or `List[str]`, optional):
	The prompt or prompts not to guide the image generation. If not defined, one has to pass
	`negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
	Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
	prompt_embeds (`torch.FloatTensor`, optional):
	Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not
	provided, text embeddings will be generated from `prompt` input argument.
	negative_prompt_embeds (`torch.FloatTensor`, optional):
	Pre-generated negative text embeddings. Can be used to easily tweak text inputs, e.g. prompt
	weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
	argument.
	"""
	if prompt is not None and isinstance(prompt, str):
	batch_size = 1
	elif prompt is not None and isinstance(prompt, list):
	batch_size = len(prompt)
	else:
	batch_size = prompt_embeds.shape[0]

	if prompt_embeds is None:
	text_inputs = self.tokenizer(
	prompt,
	padding="max_length",
	max_length=self.tokenizer.model_max_length,
	truncation=True,
	return_tensors="pt",
	)
	text_input_ids = text_inputs.input_ids
	untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

	if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
	text_input_ids, untruncated_ids
	):
	removed_text = self.tokenizer.batch_decode(
	untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
	)
	logger.warning(
	"The following part of your input was truncated because CLIP can only handle sequences up to"
	f" {self.tokenizer.model_max_length} tokens: {removed_text}"
	)

	if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
	attention_mask = text_inputs.attention_mask.to(device)
	else:
	attention_mask = None

	prompt_embeds = self.text_encoder(
	text_input_ids.to(device),
	attention_mask=attention_mask,
	)
	prompt_embeds = prompt_embeds[0]

	prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)

	bs_embed, seq_len, _ = prompt_embeds.shape
	# duplicate text embeddings for each generation per prompt, using mps friendly method
	prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
	prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

	# get unconditional embeddings for classifier free guidance
	if do_classifier_free_guidance and negative_prompt_embeds is None:
	uncond_tokens: List[str]
	if negative_prompt is None:
	uncond_tokens = [""] * batch_size
	elif type(prompt) is not type(negative_prompt):
	raise TypeError(
	f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
	f" {type(prompt)}."
	)
	elif isinstance(negative_prompt, str):
	uncond_tokens = [negative_prompt]
	elif batch_size != len(negative_prompt):
	raise ValueError(
	f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
	f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
	" the batch size of `prompt`."
	)
	else:
	uncond_tokens = negative_prompt

	max_length = prompt_embeds.shape[1]
	uncond_input = self.tokenizer(
	uncond_tokens,
	padding="max_length",
	max_length=max_length,
	truncation=True,
	return_tensors="pt",
	)

	if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
	attention_mask = uncond_input.attention_mask.to(device)
	else:
	attention_mask = None

	negative_prompt_embeds = self.text_encoder(
	uncond_input.input_ids.to(device),
	attention_mask=attention_mask,
	)
	negative_prompt_embeds = negative_prompt_embeds[0]

	if do_classifier_free_guidance:
	# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
	seq_len = negative_prompt_embeds.shape[1]

	negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)

	negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
	negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

	# For classifier free guidance, we need to do two forward passes.
	# Here we concatenate the unconditional and text embeddings into a single batch
	# to avoid doing two forward passes
	prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

	return prompt_embeds

	# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
	def run_safety_checker(self, image, device, dtype):
	    """Run the safety checker on `image` if one is configured.

	    Returns `(image, has_nsfw_concept)`. Without a safety checker the image is returned unchanged and
	    `has_nsfw_concept` is None; otherwise both values come from the safety checker, which receives the
	    feature extractor's `pixel_values` cast to `dtype`.
	    """
	    if self.safety_checker is None:
	        return image, None

	    checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
	    checked_image, has_nsfw_concept = self.safety_checker(
	        images=image, clip_input=checker_input.pixel_values.to(dtype)
	    )
	    return checked_image, has_nsfw_concept

	# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
	def decode_latents(self, latents):
	    """Decode `latents` with the VAE and return a channel-last float32 numpy array in `[0, 1]`."""
	    inverse_scale = 1 / self.vae.config.scaling_factor
	    decoded = self.vae.decode(inverse_scale * latents).sample
	    # map from [-1, 1] to [0, 1]
	    decoded = (decoded / 2 + 0.5).clamp(0, 1)
	    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
	    return decoded.cpu().permute(0, 2, 3, 1).float().numpy()

	# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
	def prepare_extra_step_kwargs(self, generator, eta):
	    """Build the optional keyword arguments for `self.scheduler.step`.

	    Schedulers differ in their `step` signature, so `eta` and `generator` are only included when the
	    signature names them. eta corresponds to η in the DDIM paper (https://arxiv.org/abs/2010.02502)
	    and should be between [0, 1].
	    """
	    step_parameters = inspect.signature(self.scheduler.step).parameters

	    extra_step_kwargs = {}
	    if "eta" in step_parameters:
	        extra_step_kwargs["eta"] = eta
	    if "generator" in step_parameters:
	        extra_step_kwargs["generator"] = generator
	    return extra_step_kwargs

	def check_inputs(
	    self,
	    prompt,
	    image,
	    source_embeds,
	    target_embeds,
	    callback_steps,
	    prompt_embeds=None,
	):
	    """Validate the arguments of a pipeline call.

	    Args:
	        prompt: `str`, `list` or None; exactly one of `prompt` / `prompt_embeds` must be given.
	        image: accepted for signature compatibility; not inspected here.
	        source_embeds: source concept embeddings, must not be None.
	        target_embeds: target concept embeddings, must not be None.
	        callback_steps: must be a positive `int`.
	        prompt_embeds: pre-computed prompt embeddings or None.

	    Raises:
	        ValueError: when any of the constraints above is violated.
	    """
	    if callback_steps is None or not isinstance(callback_steps, int) or callback_steps <= 0:
	        raise ValueError(
	            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
	            f" {type(callback_steps)}."
	        )
	    # Both embeddings are needed to build the edit direction, so a single missing one is rejected
	    # here instead of surfacing later as an attribute error on None.
	    if source_embeds is None or target_embeds is None:
	        raise ValueError("`source_embeds` and `target_embeds` cannot be undefined.")

	    if prompt is not None and prompt_embeds is not None:
	        raise ValueError(
	            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
	            " only forward one of the two."
	        )
	    elif prompt is None and prompt_embeds is None:
	        raise ValueError(
	            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
	        )
	    elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
	        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

	# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
	def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator=None, latents=None):
	    """Return the initial latents, scaled by the scheduler's `init_noise_sigma`.

	    When `latents` is None, noise of shape
	    `(batch_size, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)` is drawn
	    with `randn_tensor`; otherwise the given latents are moved to `device`.

	    Raises:
	        ValueError: if `generator` is a list whose length differs from `batch_size`.
	    """
	    latent_height = height // self.vae_scale_factor
	    latent_width = width // self.vae_scale_factor
	    shape = (batch_size, num_channels_latents, latent_height, latent_width)

	    generator_count_mismatch = isinstance(generator, list) and len(generator) != batch_size
	    if generator_count_mismatch:
	        raise ValueError(
	            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
	            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
	        )

	    if latents is not None:
	        latents = latents.to(device)
	    else:
	        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)

	    # scale the initial noise by the standard deviation required by the scheduler
	    return latents * self.scheduler.init_noise_sigma

	@torch.no_grad()
	def generate_caption(self, images):
	    """Generate a caption for `images` with the caption generator.

	    The processor is prompted with the fixed prefix "a photography of". The caption generator is moved
	    to the execution device for the `generate` call and moved back to the device it was on before,
	    also when generation raises. Only the first decoded caption is returned.
	    """
	    text = "a photography of"

	    prev_device = self.caption_generator.device

	    device = self._execution_device
	    inputs = self.caption_processor(images, text, return_tensors="pt").to(
	        device=device, dtype=self.caption_generator.dtype
	    )
	    self.caption_generator.to(device)
	    try:
	        outputs = self.caption_generator.generate(**inputs, max_new_tokens=128)
	    finally:
	        # offload caption generator, even if generation failed, so it never lingers on `device`
	        self.caption_generator.to(prev_device)

	    caption = self.caption_processor.batch_decode(outputs, skip_special_tokens=True)[0]
	    return caption

	def construct_direction(self, embs_source: torch.Tensor, embs_target: torch.Tensor):
	    """Return the edit direction: mean of `embs_target` minus mean of `embs_source` (both over dim 0),
	    with a new leading dimension of size 1."""
	    target_mean = embs_target.mean(0)
	    source_mean = embs_source.mean(0)
	    return torch.unsqueeze(target_mean - source_mean, 0)

	@torch.no_grad()
	def get_embeds(self, prompt: List[str], batch_size: int = 16) -> torch.FloatTensor:
	    """Encode `prompt` in slices of `batch_size` and return the mean embedding.

	    Each slice is tokenized with max-length padding and truncation, moved to the text encoder's
	    device and encoded; the first element of the encoder output is kept. All results are concatenated
	    along dim 0, averaged over that dim, and returned with a leading dimension of size 1.
	    """
	    encoded_chunks = []
	    for start in range(0, len(prompt), batch_size):
	        tokenized = self.tokenizer(
	            prompt[start : start + batch_size],
	            padding="max_length",
	            max_length=self.tokenizer.model_max_length,
	            truncation=True,
	            return_tensors="pt",
	        )
	        token_ids = tokenized.input_ids.to(self.text_encoder.device)
	        encoded_chunks.append(self.text_encoder(token_ids)[0])

	    all_embeds = torch.cat(encoded_chunks, dim=0)
	    return all_embeds.mean(0)[None]

	def prepare_image_latents(self, image, batch_size, dtype, device, generator=None):
	    """Encode `image` with the VAE into scaled latents.

	    Args:
	        image: image batch. The type check also admits `PIL.Image.Image` and `list`, but the body calls
	            `image.to(...)`, so callers are expected to pass a tensor (NOTE(review): presumably after
	            `preprocess` -- confirm at the call site).
	        batch_size: requested number of latents.
	        dtype, device: where the image is moved before encoding.
	        generator: a single generator, or a list with one generator per batch item; with a list each
	            image slice is encoded and sampled separately.

	    Returns:
	        Latents multiplied by `vae.config.scaling_factor`. When `batch_size` is a larger multiple of
	        the number of encoded latents, the latents are repeated to reach `batch_size`.

	    Raises:
	        ValueError: on an unsupported `image` type, a generator list of the wrong length, or a
	            `batch_size` larger than, but not a multiple of, the number of encoded latents.
	    """
	    # tensors (the common case) are accepted without consulting the other admitted types
	    if not isinstance(image, torch.Tensor) and not isinstance(image, (PIL.Image.Image, list)):
	        raise ValueError(
	            f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
	        )

	    image = image.to(device=device, dtype=dtype)

	    if isinstance(generator, list) and len(generator) != batch_size:
	        raise ValueError(
	            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
	            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
	        )

	    if isinstance(generator, list):
	        init_latents = [
	            self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
	        ]
	        init_latents = torch.cat(init_latents, dim=0)
	    else:
	        init_latents = self.vae.encode(image).latent_dist.sample(generator)

	    init_latents = self.vae.config.scaling_factor * init_latents

	    num_latents = init_latents.shape[0]
	    if batch_size > num_latents:
	        if batch_size % num_latents != 0:
	            raise ValueError(
	                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
	            )
	        # Previously this branch was a no-op (`torch.cat([init_latents])`), leaving the batch too
	        # small; repeat the latents so the result really has `batch_size` entries.
	        init_latents = torch.cat([init_latents] * (batch_size // num_latents), dim=0)

	    return init_latents

	def auto_corr_loss(self, hidden_states, generator=None):
	    """Auto-correlation regulariser over a single 4-D sample.

	    For every channel, at each resolution level, one random shift smaller than half the current
	    height is drawn and the squared mean of the product between the map and its rolled copy is added
	    for both spatial axes. The map is then halved with 2x2 average pooling until its height is at
	    most 8.

	    Raises:
	        ValueError: if the batch dimension is larger than 1.
	    """
	    batch_size, _, _, _ = hidden_states.shape
	    if batch_size > 1:
	        raise ValueError("Only batch_size 1 is supported for now")

	    reg_loss = 0.0
	    # after squeezing the batch dim each item is one [H, W] channel map
	    for channel_map in hidden_states.squeeze(0):
	        noise = channel_map[None, None, :, :]
	        while True:
	            shift = torch.randint(noise.shape[2] // 2, (1,), generator=generator).item()
	            for axis in (2, 3):
	                reg_loss += (noise * torch.roll(noise, shifts=shift, dims=axis)).mean() ** 2

	            if noise.shape[2] <= 8:
	                break
	            noise = F.avg_pool2d(noise, kernel_size=2)
	    return reg_loss

	def kl_divergence(self, hidden_states):
	    """Return `var + mean**2 - 1 - log(var + 1e-7)` computed over all elements of `hidden_states`."""
	    mu = hidden_states.mean()
	    sigma_sq = hidden_states.var()
	    # the small constant keeps the logarithm finite when the variance is zero
	    log_term = torch.log(sigma_sq + 1e-7)
	    return sigma_sq + mu**2 - 1 - log_term

	@torch.no_grad()
	@replace_example_docstring(EXAMPLE_DOC_STRING)
	def __call__(
	self,
	prompt: Optional[Union[str, List[str]]] = None,
	image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None,
	source_embeds: torch.Tensor = None,
	target_embeds: torch.Tensor = None,
	height: Optional[int] = None,
	width: Optional[int] = None,
	num_inference_steps: int = 50,
	guidance_scale: float = 7.5,
	negative_prompt: Optional[Union[str, List[str]]] = None,
	num_images_per_prompt: Optional[int] = 1,
	eta: float = 0.0,
	generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
	latents: Optional[torch.FloatTensor] = None,
	prompt_embeds: Optional[torch.FloatTensor] = None,
	negative_prompt_embeds: Optional[torch.FloatTensor] = None,
	cross_attention_guidance_amount: float = 0.1,
	output_type: Optional[str] = "pil",
	return_dict: bool = True,
	callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
	callback_steps: Optional[int] = 1,
	cross_attention_kwargs: Optional[Dict[str, Any]] = None,
	):
	r"""
	Function invoked when calling the pipeline for generation.

	Args:
	prompt (`str` or `List[str]`, optional):
	The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
	instead.
	source_embeds (`torch.Tensor`):
	Source concept embeddings. Generation of the embeddings as per the [original
	paper](https://arxiv.org/abs/2302.03027). Used in discovering the edit direction.
	target_embeds (`torch.Tensor`):
	Target concept embeddings. Generation of the embeddings as per the [original
	paper](https://arxiv.org/abs/2302.03027). Used in discovering the edit direction.
	height (`int`, optional, defaults to self.unet.config.sample_size * self.vae_scale_factor):
	The height in pixels of the generated image.
	width (`int`, optional, defaults to self.unet.config.sample_size * self.vae_scale_factor):
	The width in pixels of the generated image.
	num_inference_steps (`int`, optional, defaults to 50):
	The number of denoising steps. More denoising steps usually lead to a higher quality image at the
	expense of slower inference.
	guidance_scale (`float`, optional, defaults to 7.5):
	Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
	`guidance_scale` is defined as `w` of equation 2. of [Imagen
	Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
	1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
	usually at the expense of lower image quality.
	negative_prompt (`str` or `List[str]`, optional):
	The prompt or prompts not to guide the image generation. If not defined, one has to pass
	`negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
	Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
	num_images_per_prompt (`int`, optional, defaults to 1):
	The number of images to generate per prompt.
	eta (`float`, optional, defaults to 0.0):
	Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
	[`schedulers.DDIMScheduler`], will be ignored for others.
	generator (`torch.Generator` or `List[torch.Generator]`, optional):
	One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
	to make generation deterministic.
	latents (`torch.FloatTensor`, optional):
	Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
	generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
	tensor will ge generated by sampling using the supplied random `generator`.
	prompt_embeds (`torch.FloatTensor`, optional):
	Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not
	provided, text embeddings will be generated from `prompt` input argument.
	negative_prompt_embeds (`torch.FloatTensor`, optional):
	Pre-generated negative text embeddings. Can be used to easily tweak text inputs, e.g. prompt
	weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
	argument.
	cross_attention_guidance_amount (`float`, defaults to 0.1):
	Amount of guidance needed from the reference cross-attention maps.
	output_type (`str`, optional, defaults to `"pil"`):
	The output format of the generate image. Choose between
	[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
	return_dict (`bool`, optional, defaults to `True`):
	Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
	plain tuple.
	callback (`Callable`, optional):
	A function that will be called every `callback_steps` steps during inference. The function will be
	called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
	callback_steps (`int`, optional, defaults to 1):
	The frequency at which the `callback` function will be called. If not specified, the callback will be
	called at every step.

	Examples:

	Returns:
	[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
	[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
	When returning a tuple, the first element is a list with the generated images, and the second element is a
	list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
	(nsfw) content, according to the `safety_checker`.
	"""
	# 0. Define the spatial resolutions.
	height = height or self.unet.config.sample_size * self.vae_scale_factor
	width = width or self.unet.config.sample_size * self.vae_scale_factor

	# 1. Check inputs. Raise error if not correct
	self.check_inputs(
	prompt,
	image,
	source_embeds,
	target_embeds,
	callback_steps,
	prompt_embeds,
	)

	# 3. Define call parameters
	if prompt is not None and isinstance(prompt, str):
	batch_size = 1
	elif prompt is not None and isinstance(prompt, list):
	batch_size = len(prompt)
	else:
	batch_size = prompt_embeds.shape[0]
	if cross_attention_kwargs is None:
	cross_attention_kwargs = {}

	device = self._execution_device
	# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
	# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
	# corresponds to doing no classifier free guidance.
	do_classifier_free_guidance = guidance_scale > 1.0

	# 3. Encode input prompt
	prompt_embeds = self._encode_prompt(
	prompt,
	device,
	num_images_per_prompt,
	do_classifier_free_guidance,
	negative_prompt,
	prompt_embeds=prompt_embeds,
	negative_prompt_embeds=negative_prompt_embeds,
	)

	# 4. Prepare timesteps
	self.scheduler.set_timesteps(num_inference_steps, device=device)
	timesteps = self.scheduler.timesteps

	# 5. Generate the inverted noise from the input image or any other image
	# generated from the input prompt.
	num_channels_latents = self.unet.in_channels
	latents = self.prepare_latents(
	batch_size * num_images_per_prompt,
	num_channels_latents,
	height,
	width,
	prompt_embeds.dtype,
	device,
	generator,
	latents,
	)
	latents_init = latents.clone()

	# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
	extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

	# 8. Rejig the UNet so that we can obtain the cross-attenion maps and
	# use them for guiding the subsequent image generation.
	self.unet = prepare_unet(self.unet)

	# 7. Denoising loop where we obtain the cross-attention maps.
	num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
	with self.progress_bar(total=num_inference_steps) as progress_bar:
	for i, t in enumerate(timesteps):
	# expand the latents if we are doing classifier free guidance
	latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
	latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

	# predict the noise residual
	noise_pred = self.unet(
	latent_model_input,
	t,
	encoder_hidden_states=prompt_embeds,
	cross_attention_kwargs={"timestep": t},
	).sample

	# perform guidance
	if do_classifier_free_guidance:
	noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
	noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

	# compute the previous noisy sample x_t -> x_t-1
	latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

	# call the callback, if provided
	if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
	progress_bar.update()
	if callback is not None and i % callback_steps == 0:
	callback(i, t, latents)

	# 8. Compute the edit directions.
	edit_direction = self.construct_direction(source_embeds, target_embeds).to(prompt_embeds.device)

	# 9. Edit the prompt embeddings as per the edit directions discovered.
	prompt_embeds_edit = prompt_embeds.clone()
	prompt_embeds_edit[1:2] += edit_direction

	# 10. Second denoising loop to generate the edited image.
	latents = latents_init
	num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
	with self.progress_bar(total=num_inference_steps) as progress_bar:
	for i, t in enumerate(timesteps):
	# expand the latents if we are doing classifier free guidance
	latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
	latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

	# we want to learn the latent such that it steers the generation
	# process towards the edited direction, so make the make initial
	# noise learnable
	x_in = latent_model_input.detach().clone()
	x_in.requires_grad = True

	# optimizer
	opt = torch.optim.SGD([x_in], lr=cross_attention_guidance_amount)

	with torch.enable_grad():
	# initialize loss
	loss = Pix2PixZeroL2Loss()

	# predict the noise residual
	noise_pred = self.unet(
	x_in,
	t,
	encoder_hidden_states=prompt_embeds_edit.detach(),
	cross_attention_kwargs={"timestep": t, "loss": loss},
	).sample

	loss.loss.backward(retain_graph=False)
	opt.step()

	# recompute the noise
	noise_pred = self.unet(
	x_in.detach(),
	t,
	encoder_hidden_states=prompt_embeds_edit,
	cross_attention_kwargs={"timestep": None},
	).sample

	latents = x_in.detach().chunk(2)[0]

	# perform guidance
	if do_classifier_free_guidance:
	noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
	noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

	# compute the previous noisy sample x_t -> x_t-1
	latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

	# call the callback, if provided
	if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
	progress_bar.update()

	# 11. Post-process the latents.
	edited_image = self.decode_latents(latents)

	# 12. Run the safety checker.
	edited_image, has_nsfw_concept = self.run_safety_checker(edited_image, device, prompt_embeds.dtype)

	# 13. Convert to PIL.
	if output_type == "pil":
	edited_image = self.numpy_to_pil(edited_image)

	# Offload last model to CPU
	if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
	self.final_offload_hook.offload()

	if not return_dict:
	return (edited_image, has_nsfw_concept)

	return StableDiffusionPipelineOutput(images=edited_image, nsfw_content_detected=has_nsfw_concept)

	@torch.no_grad()
	@replace_example_docstring(EXAMPLE_INVERT_DOC_STRING)
	def invert(
	    self,
	    prompt: Optional[str] = None,
	    image: Union[torch.FloatTensor, PIL.Image.Image] = None,
	    num_inference_steps: int = 50,
	    guidance_scale: float = 1,
	    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
	    latents: Optional[torch.FloatTensor] = None,
	    prompt_embeds: Optional[torch.FloatTensor] = None,
	    cross_attention_guidance_amount: float = 0.1,
	    output_type: Optional[str] = "pil",
	    return_dict: bool = True,
	    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
	    callback_steps: Optional[int] = 1,
	    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
	    lambda_auto_corr: float = 20.0,
	    lambda_kl: float = 20.0,
	    num_reg_steps: int = 5,
	    num_auto_corr_rolls: int = 5,
	):
	    r"""
	    Function used to generate inverted latents given a prompt and image.

	    The image is preprocessed and turned into latents by `self.prepare_image_latents`; those latents are then
	    stepped through `self.inverse_scheduler`. At every step the predicted noise is regularized with the
	    gradients of `self.auto_corr_loss` and `self.kl_divergence` before it is handed to the scheduler. Note
	    that this method reassigns `self.unet` to the result of `prepare_unet(self.unet)`.

	    Args:
	        prompt (`str` or `List[str]`, optional):
	            The prompt or prompts to guide the image generation. If not defined, one has to pass
	            `prompt_embeds` instead.
	        image (`PIL.Image.Image` or `torch.FloatTensor`):
	            `Image`, or tensor representing an image batch. It is passed through `preprocess` and then
	            converted into the latents that get inverted.
	        num_inference_steps (`int`, optional, defaults to 50):
	            The number of steps requested from `self.inverse_scheduler`. The first and the last of the
	            resulting timesteps are skipped by the inversion loop.
	        guidance_scale (`float`, optional, defaults to 1):
	            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
	            `guidance_scale` is defined as `w` of equation 2. of [Imagen
	            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance is enabled only when `guidance_scale > 1`,
	            so it is disabled with the default value.
	        generator (`torch.Generator` or `List[torch.Generator]`, optional):
	            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html).
	            Forwarded to `self.prepare_image_latents` and `self.auto_corr_loss`.
	        latents (`torch.FloatTensor`, optional):
	            Not read by this method: the starting latents are always computed from `image`.
	        prompt_embeds (`torch.FloatTensor`, optional):
	            Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If
	            not provided, text embeddings will be generated from the `prompt` input argument.
	        cross_attention_guidance_amount (`float`, defaults to 0.1):
	            Amount of guidance needed from the reference cross-attention maps. Not read by this method.
	        output_type (`str`, optional, defaults to `"pil"`):
	            The output format of the decoded image. With `"pil"` the output of `self.decode_latents` is
	            converted with `self.numpy_to_pil`; with any other value it is returned unconverted.
	        return_dict (`bool`, optional, defaults to `True`):
	            Whether or not to return a
	            [`~pipelines.stable_diffusion.pipeline_stable_diffusion_pix2pix_zero.Pix2PixInversionPipelineOutput`]
	            instead of a plain tuple.
	        callback (`Callable`, optional):
	            A function that is called as `callback(step: int, timestep: int, latents: torch.FloatTensor)`
	            during inversion, where `step` is the index within the iterated timesteps.
	        callback_steps (`int`, optional, defaults to 1):
	            The frequency at which the `callback` function will be called (every step whose index is a
	            multiple of `callback_steps`).
	        cross_attention_kwargs (`dict`, optional):
	            Not read by this method: the UNet is always called with `cross_attention_kwargs={"timestep": t}`.
	        lambda_auto_corr (`float`, optional, defaults to 20.0):
	            Weight applied to the gradient of `self.auto_corr_loss` when updating the noise prediction. The
	            update is skipped when the value is not greater than 0.
	        lambda_kl (`float`, optional, defaults to 20.0):
	            Weight applied to the gradient of `self.kl_divergence` when updating the noise prediction. The
	            update is skipped when the value is not greater than 0.
	        num_reg_steps (`int`, optional, defaults to 5):
	            Number of regularization passes applied to the noise prediction at every timestep.
	        num_auto_corr_rolls (`int`, optional, defaults to 5):
	            Number of `self.auto_corr_loss` gradient updates per regularization pass. Each gradient is
	            divided by this number before it is applied.

	    Raises:
	        ValueError: If `prompt` is neither a `str` nor a `list` and `prompt_embeds` is not provided.

	    Examples:

	    Returns:
	        [`~pipelines.stable_diffusion.pipeline_stable_diffusion_pix2pix_zero.Pix2PixInversionPipelineOutput`] or
	        `tuple`:
	        [`~pipelines.stable_diffusion.pipeline_stable_diffusion_pix2pix_zero.Pix2PixInversionPipelineOutput`] if
	        `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the inverted
	        latents tensor and the second is the corresponding decoded image.
	    """
	    # 1. Define call parameters
	    if isinstance(prompt, str):
	        batch_size = 1
	    elif isinstance(prompt, list):
	        batch_size = len(prompt)
	    elif prompt_embeds is not None:
	        batch_size = prompt_embeds.shape[0]
	    else:
	        # Fail with an explicit message instead of an AttributeError on `None.shape`.
	        raise ValueError(
	            "`invert` needs either `prompt` (a `str` or a list of `str`) or `prompt_embeds`, but got"
	            f" `prompt` of type {type(prompt)} and `prompt_embeds=None`."
	        )

	    device = self._execution_device
	    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
	    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
	    # corresponds to doing no classifier free guidance.
	    do_classifier_free_guidance = guidance_scale > 1.0

	    # 2. Preprocess image
	    image = preprocess(image)

	    # 3. Prepare latent variables (any `latents` argument is replaced here)
	    latents = self.prepare_image_latents(image, batch_size, self.vae.dtype, device, generator)

	    # 4. Encode input prompt
	    num_images_per_prompt = 1
	    prompt_embeds = self._encode_prompt(
	        prompt,
	        device,
	        num_images_per_prompt,
	        do_classifier_free_guidance,
	        prompt_embeds=prompt_embeds,
	    )

	    # 5. Prepare timesteps; the loop below skips the first and the last one
	    self.inverse_scheduler.set_timesteps(num_inference_steps, device=device)
	    timesteps = self.inverse_scheduler.timesteps
	    inversion_timesteps = timesteps[1:-1]

	    # 6. Rejig the UNet so that we can obtain the cross-attention maps and
	    # use them for guiding the subsequent image generation.
	    self.unet = prepare_unet(self.unet)

	    # 7. Inversion loop where we obtain the cross-attention maps.
	    num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order
	    # Index of the final iteration of the sliced loop, so the last step is always reported.
	    last_step_index = len(inversion_timesteps) - 1
	    with self.progress_bar(total=num_inference_steps - 2) as progress_bar:
	        for i, t in enumerate(inversion_timesteps):
	            # expand the latents if we are doing classifier free guidance
	            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
	            latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t)

	            # predict the noise residual
	            noise_pred = self.unet(
	                latent_model_input,
	                t,
	                encoder_hidden_states=prompt_embeds,
	                cross_attention_kwargs={"timestep": t},
	            ).sample

	            # perform guidance
	            if do_classifier_free_guidance:
	                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
	                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

	            # regularization of the noise prediction; the method runs under
	            # `torch.no_grad()`, so autograd is re-enabled locally for the losses
	            with torch.enable_grad():
	                for _ in range(num_reg_steps):
	                    if lambda_auto_corr > 0:
	                        for _ in range(num_auto_corr_rolls):
	                            # differentiate w.r.t. a detached copy of the current prediction
	                            var = noise_pred.detach().clone().requires_grad_(True)
	                            l_ac = self.auto_corr_loss(var, generator=generator)
	                            l_ac.backward()

	                            grad = var.grad.detach() / num_auto_corr_rolls
	                            noise_pred = noise_pred - lambda_auto_corr * grad

	                    if lambda_kl > 0:
	                        var = noise_pred.detach().clone().requires_grad_(True)

	                        l_kld = self.kl_divergence(var)
	                        l_kld.backward()

	                        grad = var.grad.detach()
	                        noise_pred = noise_pred - lambda_kl * grad

	                    noise_pred = noise_pred.detach()

	            # advance the latents by one step of the inverse scheduler
	            latents = self.inverse_scheduler.step(noise_pred, t, latents).prev_sample

	            # update the progress bar and call the callback, if provided
	            if i == last_step_index or (
	                (i + 1) > num_warmup_steps and (i + 1) % self.inverse_scheduler.order == 0
	            ):
	                progress_bar.update()
	                if callback is not None and i % callback_steps == 0:
	                    callback(i, t, latents)

	    inverted_latents = latents.detach().clone()

	    # 8. Post-processing
	    image = self.decode_latents(latents.detach())

	    # Offload last model to CPU
	    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
	        self.final_offload_hook.offload()

	    # 9. Convert to PIL.
	    if output_type == "pil":
	        image = self.numpy_to_pil(image)

	    if not return_dict:
	        return (inverted_latents, image)

	    return Pix2PixInversionPipelineOutput(latents=inverted_latents, images=image)