ImagineV4.1 / stable_diffusion_custom_v4_1.py

Upload 5 files

d8e2f70 about 1 year ago

38.4 kB

	import random
	from diffusers import StableDiffusionPipeline
	# from diffusers.schedulers.scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
	from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput, AutoencoderKL, CLIPTextModel, CLIPTokenizer, UNet2DConditionModel, KarrasDiffusionSchedulers, StableDiffusionSafetyChecker, CLIPImageProcessor
	from compel import Compel
	from tokenizer_util import TextualInversionLoaderMixin, MultiTokenCLIPTokenizer
	import torch
	from typing import Any, Callable, Dict, List, Optional, Union
	from dynamicprompts.generators import RandomPromptGenerator
	import time
	from compel import Compel
	from prompt_parser import ScheduledPromptConditioning
	from prompt_parser import get_learned_conditioning_prompt_schedules
	from dynamicprompts.generators import RandomPromptGenerator
	import tqdm
	from cachetools import LRUCache
	from image_processor import VaeImageProcessor


	class CustomStableDiffusionPipeline4_1(TextualInversionLoaderMixin, StableDiffusionPipeline):
	def __init__(
	    self,
	    vae: AutoencoderKL,
	    text_encoder: CLIPTextModel,
	    tokenizer: CLIPTokenizer,
	    unet: UNet2DConditionModel,
	    scheduler: KarrasDiffusionSchedulers,
	    safety_checker: StableDiffusionSafetyChecker,
	    feature_extractor: CLIPImageProcessor,
	    requires_safety_checker: bool = True,
	    prompt_cache_size: int = 1024,
	    prompt_cache_ttl: int = 60 * 2,
	) -> None:
	    """Build the pipeline and its prompt-conditioning helpers.

	    The model components are forwarded unchanged to the parent pipeline
	    constructor. On top of that this sets up a ``VaeImageProcessor``, a
	    ``Compel`` instance that does not truncate long prompts, and an LRU
	    cache holding at most ``prompt_cache_size`` entries.

	    ``prompt_cache_ttl`` is accepted but not used by this constructor.
	    """
	    super().__init__(
	        vae=vae,
	        text_encoder=text_encoder,
	        tokenizer=tokenizer,
	        unet=unet,
	        scheduler=scheduler,
	        safety_checker=safety_checker,
	        feature_extractor=feature_extractor,
	        requires_safety_checker=requires_safety_checker,
	    )

	    # Scale factor is 2 ** (number of VAE block stages - 1).
	    vae_block_count = len(self.vae.config.block_out_channels)
	    self.vae_scale_factor = 2 ** (vae_block_count - 1)
	    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
	    self.register_to_config(requires_safety_checker=requires_safety_checker)

	    # Prompt-weighting encoder; long prompts are kept whole rather than cut.
	    self.compel = Compel(
	        tokenizer=self.tokenizer,
	        text_encoder=self.text_encoder,
	        truncate_long_prompts=False,
	    )
	    # Cache read and written by build_scheduled_cond.
	    self.cache = LRUCache(maxsize=prompt_cache_size)

	    self.cached_uc = [None, None]
	    self.cached_c = [None, None]

	    self.prompt_handler = None

	def build_scheduled_cond(self, prompt, steps, key):
	    """Return the scheduled conditioning list for ``prompt``, using the cache.

	    Args:
	        prompt: Prompt text handed to
	            ``get_learned_conditioning_prompt_schedules``.
	        steps: Number of sampling steps the schedule is built for.
	        key: Hashable cache key under which the result is stored in
	            ``self.cache``.

	    Returns:
	        A list of ``ScheduledPromptConditioning`` entries, one per schedule
	        segment, each holding that segment's first element and a
	        conditioning tensor that has been moved to the CPU.
	    """
	    # Check the cache first so a hit does not pay for parsing the prompt.
	    cached = self.cache.get(key, None)
	    if cached is not None:
	        return cached

	    prompt_schedule = get_learned_conditioning_prompt_schedules([prompt], steps)[0]

	    # Each schedule entry is indexed as entry[0] (step marker) and entry[1] (text).
	    cond_schedule = [
	        ScheduledPromptConditioning(
	            entry[0],
	            self.compel.build_conditioning_tensor(entry[1]).to('cpu'),
	        )
	        for entry in prompt_schedule
	    ]

	    self.cache[key] = cond_schedule
	    return cond_schedule

	def initialize_magic_prompt_cache(self, pos_prompt_template: str, plain_prompt_template: str, neg_prompt_template: str, num_to_generate: int, steps: int):
	    r"""
	    Initializes the magic prompt cache for the forward pass.
	    Must be called immediately after Compel is loaded and embeds are initialized.

	    Args:
	        pos_prompt_template: Template expanded by ``RandomPromptGenerator``
	            (fixed seed 555) into ``num_to_generate`` positive prompts.
	        plain_prompt_template: Prompt encoded as-is and stored in
	            ``self.plain_scheduled_cond``.
	        neg_prompt_template: Prompt encoded as-is and stored in
	            ``self.scheduled_uncond``.
	        num_to_generate: Number of positive prompts to generate.
	        steps: Number of sampling steps the schedules are built for.
	    """
	    rpg = RandomPromptGenerator(ignore_whitespace=True, seed=555)
	    positive_prompts = rpg.generate(
	        template=pos_prompt_template, num_images=num_to_generate)

	    def cache_key(text):
	        # The key must be hashable: it is used to index the LRU cache inside
	        # build_scheduled_cond. The step count is part of the key because
	        # the schedule is built for a specific number of steps.
	        return ('magic', text, steps)

	    with torch.no_grad():
	        scheduled_conds = [
	            self.build_scheduled_cond(text, steps, cache_key(text))
	            for text in tqdm.tqdm(positive_prompts)
	        ]

	        plain_scheduled_cond = self.build_scheduled_cond(
	            plain_prompt_template, steps, cache_key(plain_prompt_template))

	        scheduled_uncond = self.build_scheduled_cond(
	            neg_prompt_template, steps, cache_key(neg_prompt_template))

	    self.scheduled_conds = scheduled_conds
	    self.plain_scheduled_cond = plain_scheduled_cond
	    self.scheduled_uncond = scheduled_uncond

	def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
	    r"""
	    Encodes the prompt into text encoder hidden states.

	    Args:
	        prompt (`str` or `list(int)`):
	            prompt to be encoded
	        device: (`torch.device`):
	            torch device
	        num_images_per_prompt (`int`):
	            number of images that should be generated per prompt
	        do_classifier_free_guidance (`bool`):
	            whether to use classifier free guidance or not
	        negative_prompt (`str` or `List[str]`):
	            The prompt or prompts not to guide the image generation. Ignored when
	            `do_classifier_free_guidance` is false.

	    Returns:
	        The text embeddings repeated `num_images_per_prompt` times. With
	        classifier free guidance the unconditional embeddings are concatenated
	        in front of them along the batch dimension.

	    Raises:
	        TypeError: if `negative_prompt` is given and its type differs from `prompt`.
	        ValueError: if `negative_prompt` is a list whose length differs from the
	            prompt batch size.
	    """
	    # Imported locally: the module does not define a logger.
	    import logging

	    batch_size = len(prompt) if isinstance(prompt, list) else 1
	    use_attention_mask = bool(
	        getattr(self.text_encoder.config, "use_attention_mask", False))

	    text_inputs = self.tokenizer(
	        prompt,
	        padding="max_length",
	        max_length=self.tokenizer.model_max_length,
	        truncation=True,
	        return_tensors="np",
	    )
	    text_input_ids = torch.from_numpy(text_inputs.input_ids)
	    # Tokenize again without truncation to detect text that was cut off.
	    untruncated_ids = torch.from_numpy(
	        self.tokenizer(prompt, padding="longest", return_tensors="np").input_ids)

	    # torch.equal returns False for tensors of different shape, so a longer
	    # untruncated sequence is reported as truncated.
	    if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
	        text_input_ids, untruncated_ids
	    ):
	        removed_text = self.tokenizer.batch_decode(
	            untruncated_ids[:, self.tokenizer.model_max_length - 1: -1])
	        logging.getLogger(__name__).warning(
	            "The following part of your input was truncated because CLIP can only handle sequences up to"
	            f" {self.tokenizer.model_max_length} tokens: {removed_text}"
	        )

	    if use_attention_mask:
	        # The tokenizer returned NumPy arrays, so convert before moving to the device.
	        attention_mask = torch.from_numpy(
	            text_inputs.attention_mask).to(device)
	    else:
	        attention_mask = None

	    text_embeddings = self.text_encoder(
	        text_input_ids.to(device), attention_mask=attention_mask)
	    text_embeddings = text_embeddings[0]

	    # duplicate text embeddings for each generation per prompt, using mps friendly method
	    bs_embed, seq_len, _ = text_embeddings.shape
	    text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
	    text_embeddings = text_embeddings.view(
	        bs_embed * num_images_per_prompt, seq_len, -1)

	    # get unconditional embeddings for classifier free guidance
	    if do_classifier_free_guidance:
	        uncond_tokens: List[str]
	        if negative_prompt is None:
	            uncond_tokens = [""] * batch_size
	        elif type(prompt) is not type(negative_prompt):
	            raise TypeError(
	                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
	                f" {type(prompt)}."
	            )
	        elif isinstance(negative_prompt, str):
	            uncond_tokens = [negative_prompt]
	        elif batch_size != len(negative_prompt):
	            raise ValueError(
	                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
	                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
	                " the batch size of `prompt`."
	            )
	        else:
	            uncond_tokens = negative_prompt

	        # Pad/truncate the negative prompt to the same token length as the prompt.
	        max_length = text_input_ids.shape[-1]
	        uncond_input = self.tokenizer(
	            uncond_tokens, padding="max_length", max_length=max_length, truncation=True, return_tensors="np",
	        )

	        if use_attention_mask:
	            attention_mask = torch.from_numpy(
	                uncond_input.attention_mask).to(device)
	        else:
	            attention_mask = None

	        uncond_embeddings = self.text_encoder(
	            torch.from_numpy(uncond_input.input_ids).to(device), attention_mask=attention_mask,
	        )
	        uncond_embeddings = uncond_embeddings[0]

	        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
	        seq_len = uncond_embeddings.shape[1]
	        uncond_embeddings = uncond_embeddings.repeat(
	            1, num_images_per_prompt, 1)
	        uncond_embeddings = uncond_embeddings.view(
	            batch_size * num_images_per_prompt, seq_len, -1)

	        # For classifier free guidance, we need to do two forward passes.
	        # Here we concatenate the unconditional and text embeddings into a single batch
	        # to avoid doing two forward passes
	        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

	    return text_embeddings

	def _encode_promptv2(
	    self,
	    prompt,
	    device,
	    num_images_per_prompt,
	    do_classifier_free_guidance,
	    negative_prompt=None,
	    prompt_embeds: Optional[torch.FloatTensor] = None,
	    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
	):
	    r"""
	    Encodes the prompt into text encoder hidden states, optionally starting
	    from pre-computed embeddings.

	    Args:
	        prompt (`str` or `List[str]`, optional):
	            prompt to be encoded; may be `None` when `prompt_embeds` is given
	        device: (`torch.device`):
	            torch device
	        num_images_per_prompt (`int`):
	            number of images that should be generated per prompt
	        do_classifier_free_guidance (`bool`):
	            whether to use classifier free guidance or not
	        negative_prompt (`str` or `List[str]`, optional):
	            negative prompt text; only encoded when guidance is on and
	            `negative_prompt_embeds` is not given
	        prompt_embeds (`torch.FloatTensor`, optional):
	            pre-computed prompt embeddings; when given, `prompt` is not encoded
	        negative_prompt_embeds (`torch.FloatTensor`, optional):
	            pre-computed negative embeddings

	    Returns:
	        The prompt embeddings repeated `num_images_per_prompt` times. With
	        classifier free guidance, negative and positive embeddings are padded
	        to a common length and concatenated (negative first) along the batch
	        dimension.

	    Raises:
	        TypeError: if `prompt` and `negative_prompt` are both given with different types.
	        ValueError: if `negative_prompt` is a list whose length differs from the batch size.
	    """
	    # Imported locally: the module does not define a logger.
	    import logging

	    if prompt is not None and isinstance(prompt, str):
	        batch_size = 1
	    elif prompt is not None and isinstance(prompt, list):
	        batch_size = len(prompt)
	    else:
	        batch_size = prompt_embeds.shape[0]

	    use_attention_mask = bool(
	        getattr(self.text_encoder.config, "use_attention_mask", False))

	    if prompt_embeds is None:
	        text_inputs = self.tokenizer(
	            prompt,
	            padding="max_length",
	            max_length=self.tokenizer.model_max_length,
	            truncation=True,
	            return_tensors="pt",
	        )
	        text_input_ids = text_inputs.input_ids
	        # Tokenize again without truncation to detect text that was cut off.
	        untruncated_ids = self.tokenizer(
	            prompt, padding="longest", return_tensors="pt").input_ids

	        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
	            text_input_ids, untruncated_ids
	        ):
	            removed_text = self.tokenizer.batch_decode(
	                untruncated_ids[:, self.tokenizer.model_max_length - 1: -1]
	            )
	            # Report the dropped text instead of silently discarding it.
	            logging.getLogger(__name__).warning(
	                "The following part of your input was truncated because CLIP can only handle sequences up to"
	                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
	            )

	        if use_attention_mask:
	            attention_mask = text_inputs.attention_mask.to(device)
	        else:
	            attention_mask = None

	        prompt_embeds = self.text_encoder(
	            text_input_ids.to(device),
	            attention_mask=attention_mask,
	        )
	        prompt_embeds = prompt_embeds[0]

	    prompt_embeds = prompt_embeds.to(
	        dtype=self.text_encoder.dtype, device=device)

	    bs_embed, seq_len, _ = prompt_embeds.shape
	    # duplicate text embeddings for each generation per prompt, using mps friendly method
	    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
	    prompt_embeds = prompt_embeds.view(
	        bs_embed * num_images_per_prompt, seq_len, -1)

	    # get unconditional embeddings for classifier free guidance
	    if do_classifier_free_guidance and negative_prompt_embeds is None:
	        uncond_tokens: List[str]
	        if negative_prompt is None:
	            uncond_tokens = [""] * batch_size
	        # Only compare types when a text prompt exists; with `prompt_embeds`
	        # alone, `prompt` is None and any negative prompt type is acceptable.
	        elif prompt is not None and type(prompt) is not type(negative_prompt):
	            raise TypeError(
	                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
	                f" {type(prompt)}."
	            )
	        elif isinstance(negative_prompt, str):
	            uncond_tokens = [negative_prompt]
	        elif batch_size != len(negative_prompt):
	            raise ValueError(
	                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
	                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
	                " the batch size of `prompt`."
	            )
	        else:
	            uncond_tokens = negative_prompt

	        # Tokenize the negative prompt to the prompt embeddings' token length.
	        max_length = prompt_embeds.shape[1]
	        uncond_input = self.tokenizer(
	            uncond_tokens,
	            padding="max_length",
	            max_length=max_length,
	            truncation=True,
	            return_tensors="pt",
	        )

	        if use_attention_mask:
	            attention_mask = uncond_input.attention_mask.to(device)
	        else:
	            attention_mask = None

	        negative_prompt_embeds = self.text_encoder(
	            uncond_input.input_ids.to(device),
	            attention_mask=attention_mask,
	        )
	        negative_prompt_embeds = negative_prompt_embeds[0]

	    if do_classifier_free_guidance:
	        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
	        seq_len = negative_prompt_embeds.shape[1]

	        negative_prompt_embeds = negative_prompt_embeds.to(
	            dtype=self.text_encoder.dtype, device=device)

	        negative_prompt_embeds = negative_prompt_embeds.repeat(
	            1, num_images_per_prompt, 1)
	        negative_prompt_embeds = negative_prompt_embeds.view(
	            batch_size * num_images_per_prompt, seq_len, -1)

	        # Pre-computed embeddings may differ in token length; equalize before concatenating.
	        negative_prompt_embeds, prompt_embeds = self.compel.pad_conditioning_tensors_to_same_length(
	            [negative_prompt_embeds, prompt_embeds])
	        # For classifier free guidance, we need to do two forward passes.
	        # Here we concatenate the unconditional and text embeddings into a single batch
	        # to avoid doing two forward passes
	        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

	    return prompt_embeds

	def _pyramid_noise_like(self, noise, device, seed, iterations=6, discount=0.4):
	gen = torch.manual_seed(seed)
	# EDIT: w and h get over-written, rename for a different variant!
	b, c, w, h = noise.shape
	u = torch.nn.Upsample(size=(w, h), mode="bilinear").to(device)
	for i in range(iterations):
	r = random.random() * 2 + 2 # Rather than always going 2x,
	wn, hn = max(1, int(w / (ri))), max(1, int(h / (ri)))
	noise += u(torch.randn(b, c, wn, hn,
	generator=gen).to(device)) * discount**i
	if wn == 1 or hn == 1:
	break # Lowest resolution is 1x1
	return noise / noise.std() # Scaled back to roughly unit variance

	@torch.no_grad()
	def inferV4(
	    self,
	    prompt: Union[str, List[str]],
	    height: Optional[int] = None,
	    width: Optional[int] = None,
	    num_inference_steps: int = 50,
	    guidance_scale: float = 7.5,
	    negative_prompt: Optional[Union[str, List[str]]] = None,
	    num_images_per_prompt: Optional[int] = 1,
	    eta: float = 0.0,
	    generator: Optional[torch.Generator] = None,
	    latents: Optional[torch.FloatTensor] = None,
	    output_type: Optional[str] = "pil",
	    return_dict: bool = True,
	    callback: Optional[Callable[[
	        int, int, torch.FloatTensor], None]] = None,
	    callback_steps: Optional[int] = 1,
	    compile_unet: bool = True,
	    compile_vae: bool = True,
	    compile_tenc: bool = True,
	    max_tokens=0,
	    seed=-1,
	    flags=None,
	    og_prompt=None,
	    og_neg_prompt=None,
	    disc=0.4,
	    iter=6,
	    pyramid=0,  # disabled by default unless specified
	):
	    r"""
	    Generate images using scheduled (per-step) prompt conditioning.

	    Only the first prompt and the first negative prompt are encoded; their
	    schedules are built (and cached) with `build_scheduled_cond`.

	    Args:
	        prompt (`str` or `List[str]`):
	            The prompt or prompts to guide the image generation.
	        height (`int`, optional, defaults to self.unet.config.sample_size * self.vae_scale_factor):
	            The height in pixels of the generated image.
	        width (`int`, optional, defaults to self.unet.config.sample_size * self.vae_scale_factor):
	            The width in pixels of the generated image.
	        num_inference_steps (`int`, optional, defaults to 50):
	            The number of denoising steps.
	        guidance_scale (`float`, optional, defaults to 7.5):
	            Classifier-free guidance weight. Guidance is enabled when `guidance_scale > 1`.
	        negative_prompt (`str` or `List[str]`, optional):
	            The prompt or prompts not to guide the image generation. Defaults to an empty prompt.
	        num_images_per_prompt (`int`, optional, defaults to 1):
	            The number of images to generate per prompt.
	        eta (`float`, optional, defaults to 0.0):
	            Forwarded to `prepare_extra_step_kwargs`.
	        generator (`torch.Generator`, optional):
	            Forwarded to `prepare_latents` and `prepare_extra_step_kwargs`.
	        latents (`torch.FloatTensor`, optional):
	            Pre-generated latents forwarded to `prepare_latents`.
	        output_type (`str`, optional, defaults to `"pil"`):
	            Output format; `"latent"` skips VAE decoding and the safety checker.
	        return_dict (`bool`, optional, defaults to `True`):
	            Whether to return a `StableDiffusionPipelineOutput` instead of a plain tuple.
	        callback (`Callable`, optional):
	            Called as `callback(step, timestep, latents)` every `callback_steps` steps.
	        callback_steps (`int`, optional, defaults to 1):
	            The frequency at which `callback` is called.
	        compile_unet, compile_vae, compile_tenc, max_tokens, og_neg_prompt:
	            Accepted for interface compatibility; not used by this method.
	        seed:
	            Part of the positive conditioning cache key and the seed for pyramid noise.
	        flags (list of `str`, optional):
	            `"plain"` and the first flag containing `"flair"` become part of the
	            positive conditioning cache key.
	        og_prompt:
	            Text used in the positive cache key; falls back to the encoded prompt when `None`.
	        disc, iter:
	            `discount` and `iterations` passed to `_pyramid_noise_like`.
	        pyramid:
	            Fraction of `num_inference_steps` (counted from the start) during which
	            pyramid noise is applied to the noise prediction. `0` disables it.

	    Returns:
	        `StableDiffusionPipelineOutput` if `return_dict` is True, otherwise a tuple
	        `(images, nsfw_content_detected)`.
	    """
	    # 0. Default height and width to unet
	    height = height or self.unet.config.sample_size * self.vae_scale_factor
	    width = width or self.unet.config.sample_size * self.vae_scale_factor

	    # 1. Check inputs
	    self.check_inputs(prompt, height, width, callback_steps)

	    # Normalize to lists so that `[0]` selects a whole prompt rather than the
	    # first character of a string.
	    prompts = [prompt] if isinstance(prompt, str) else prompt
	    if negative_prompt is None:
	        negative_prompts = ['']
	    elif isinstance(negative_prompt, str):
	        negative_prompts = [negative_prompt]
	    else:
	        negative_prompts = negative_prompt
	    flags = [] if flags is None else flags

	    # 2. Define call parameters
	    batch_size = len(prompts)
	    device = self._execution_device
	    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
	    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
	    # corresponds to doing no classifier free guidance.
	    do_classifier_free_guidance = guidance_scale > 1.0

	    # 3. Prepare timesteps
	    self.scheduler.set_timesteps(num_inference_steps, device=device)
	    timesteps = self.scheduler.timesteps

	    # 4. Build (or fetch) the scheduled conditionings.
	    plain = "plain" in flags
	    flair = next((flag for flag in flags if "flair" in flag), None)

	    pos_text = prompts[0]
	    neg_text = negative_prompts[0]
	    # Without `og_prompt` the key would be the same for every prompt, so fall
	    # back to the encoded text. The step count is part of the key because a
	    # schedule is only valid for the number of steps it was built for.
	    pos_key_text = og_prompt if og_prompt is not None else pos_text
	    user_cond = self.build_scheduled_cond(
	        pos_text, num_inference_steps,
	        ('pos', pos_key_text, seed, plain, flair, num_inference_steps))
	    user_uncond = self.build_scheduled_cond(
	        neg_text, num_inference_steps,
	        ('neg', neg_text, 0, num_inference_steps))

	    def as_batched(conditionings: List[ScheduledPromptConditioning]) -> List[ScheduledPromptConditioning]:
	        """Validate shapes and return entries whose tensors are 3-D.

	        New entries are created rather than modifying the given ones, because
	        those objects also live in the cache.
	        """
	        first_shape = conditionings[0].cond.shape
	        if not all(len(entry.cond.shape) == len(first_shape) for entry in conditionings):
	            raise ValueError(
	                "Conditioning tensors must all have either 2 dimensions (unbatched) or 3 dimensions (batched)")

	        if len(first_shape) == 2:
	            # need to be unsqueezed
	            conditionings = [
	                ScheduledPromptConditioning(
	                    entry.end_at_step, entry.cond.unsqueeze(0))
	                for entry in conditionings
	            ]
	            first_shape = conditionings[0].cond.shape
	        if len(first_shape) != 3:
	            raise ValueError(
	                "All conditioning tensors must have the same number of dimensions (2 or 3)")

	        if not all(
	            entry.cond.shape[0] == first_shape[0] and entry.cond.shape[2] == first_shape[2]
	            for entry in conditionings
	        ):
	            raise ValueError(
	                f"All conditioning tensors must have the same batch size ({first_shape[0]}) and number of embeddings per token ({first_shape[2]})")
	        return list(conditionings)

	    uc = as_batched(user_uncond)
	    c = as_batched(user_cond)
	    max_token_count = max(entry.cond.shape[1] for entry in uc + c)

	    def pad_tensor(conditionings: List[ScheduledPromptConditioning]) -> List[ScheduledPromptConditioning]:
	        """Pad every tensor along the token axis up to `max_token_count`."""
	        empty_z = None
	        padded = []
	        for entry in conditionings:
	            cond = entry.cond.to(self.device)
	            # if necessary, pad shorter tensors out with an emptystring tensor
	            while cond.shape[1] < max_token_count:
	                if empty_z is None:
	                    # Encode the empty prompt only when padding is actually needed.
	                    empty_z = torch.cat(
	                        [self.compel.build_conditioning_tensor("")] * cond.shape[0])
	                    empty_z = empty_z.to(device=cond.device, dtype=cond.dtype)
	                cond = torch.cat([cond, empty_z], dim=1)
	            padded.append(ScheduledPromptConditioning(entry.end_at_step, cond))
	        return padded

	    uc = pad_tensor(uc)
	    c = pad_tensor(c)

	    # Walk both schedules and record the embeddings to use at every step.
	    next_uc = uc.pop(0)
	    next_c = c.pop(0)
	    prompt_embeds = None
	    new_embeds = True
	    embed_per_step = []
	    for i in range(len(timesteps)):
	        # When a schedule is exhausted keep its last entry instead of popping
	        # from an empty list (there can be more timesteps than schedule steps).
	        if i > next_uc.end_at_step and uc:
	            next_uc = uc.pop(0)
	            new_embeds = True
	        if i > next_c.end_at_step and c:
	            next_c = c.pop(0)
	            new_embeds = True

	        if new_embeds:
	            negative_prompt_embeds, positive_prompt_embeds = self.compel.pad_conditioning_tensors_to_same_length([
	                next_uc.cond, next_c.cond])
	            # Without guidance the UNet receives un-doubled latents, so only
	            # the positive embeddings may be passed.
	            if do_classifier_free_guidance:
	                prompt_embeds = torch.cat(
	                    [negative_prompt_embeds, positive_prompt_embeds])
	            else:
	                prompt_embeds = positive_prompt_embeds
	            new_embeds = False

	        embed_per_step.append(prompt_embeds)

	    # 5. Prepare latent variables
	    # NOTE(review): the embeddings come from a single prompt and are not repeated
	    # for `batch_size * num_images_per_prompt` — confirm callers only use 1 image.
	    num_channels_latents = self.unet.in_channels
	    latents = self.prepare_latents(
	        batch_size * num_images_per_prompt,
	        num_channels_latents,
	        height,
	        width,
	        prompt_embeds.dtype,
	        device,
	        generator,
	        latents,
	    )

	    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
	    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

	    # 7. Denoising loop
	    num_warmup_steps = len(timesteps) - \
	        num_inference_steps * self.scheduler.order
	    with self.progress_bar(total=num_inference_steps) as progress_bar:
	        for i, t in enumerate(timesteps):
	            # expand the latents if we are doing classifier free guidance
	            latent_model_input = torch.cat(
	                [latents] * 2) if do_classifier_free_guidance else latents
	            latent_model_input = self.scheduler.scale_model_input(
	                latent_model_input, t)

	            prompt_embeds = embed_per_step[i]

	            # predict the noise residual
	            noise_pred = self.unet(
	                latent_model_input, t, encoder_hidden_states=prompt_embeds).sample

	            # perform guidance
	            if do_classifier_free_guidance:
	                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
	                noise_pred = noise_pred_uncond + guidance_scale * \
	                    (noise_pred_text - noise_pred_uncond)

	            # Pyramid noise is only applied during the first `pyramid` fraction of steps.
	            if i < pyramid * num_inference_steps:
	                noise_pred = self._pyramid_noise_like(
	                    noise_pred, device, seed, iterations=iter, discount=disc)

	            # compute the previous noisy sample x_t -> x_t-1
	            latents = self.scheduler.step(
	                noise_pred, t, latents, **extra_step_kwargs).prev_sample

	            # call the callback, if provided
	            if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
	                progress_bar.update()
	                if callback is not None and i % callback_steps == 0:
	                    callback(i, t, latents)

	    # 8. Decode and post-process
	    if output_type != "latent":
	        image = self.vae.decode(
	            latents / self.vae.config.scaling_factor, return_dict=False)[0]
	        image, has_nsfw_concept = self.run_safety_checker(
	            image, device, prompt_embeds.dtype)
	    else:
	        image = latents
	        has_nsfw_concept = None

	    if has_nsfw_concept is None:
	        do_denormalize = [True] * image.shape[0]
	    else:
	        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

	    image = self.image_processor.postprocess(
	        image, output_type=output_type, do_denormalize=do_denormalize)

	    # Offload last model to CPU
	    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
	        self.final_offload_hook.offload()

	    if not return_dict:
	        return (image, has_nsfw_concept)

	    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

	@torch.no_grad()
	def inferPipe(
	    self,
	    prompt: Union[str, List[str]] = None,
	    height: Optional[int] = None,
	    width: Optional[int] = None,
	    num_inference_steps: int = 50,
	    guidance_scale: float = 7.5,
	    negative_prompt: Optional[Union[str, List[str]]] = None,
	    num_images_per_prompt: Optional[int] = 1,
	    eta: float = 0.0,
	    generator: Optional[Union[torch.Generator,
	                              List[torch.Generator]]] = None,
	    latents: Optional[torch.FloatTensor] = None,
	    prompt_embeds: Optional[torch.FloatTensor] = None,
	    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
	    output_type: Optional[str] = "pil",
	    return_dict: bool = True,
	    callback: Optional[Callable[[
	        int, int, torch.FloatTensor], None]] = None,
	    callback_steps: int = 1,
	    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
	):
	    r"""
	    Function invoked when calling the pipeline for generation.

	    Args:
	        prompt (`str` or `List[str]`, optional):
	            The prompt or prompts to guide the image generation. If not defined, one has to pass
	            `prompt_embeds` instead.
	        height (`int`, optional, defaults to self.unet.config.sample_size * self.vae_scale_factor):
	            The height in pixels of the generated image.
	        width (`int`, optional, defaults to self.unet.config.sample_size * self.vae_scale_factor):
	            The width in pixels of the generated image.
	        num_inference_steps (`int`, optional, defaults to 50):
	            The number of denoising steps.
	        guidance_scale (`float`, optional, defaults to 7.5):
	            Classifier-free guidance weight. Guidance is enabled when `guidance_scale > 1`.
	        negative_prompt (`str` or `List[str]`, optional):
	            The prompt or prompts not to guide the image generation. Ignored when not using guidance.
	        num_images_per_prompt (`int`, optional, defaults to 1):
	            The number of images to generate per prompt.
	        eta (`float`, optional, defaults to 0.0):
	            Forwarded to `prepare_extra_step_kwargs`.
	        generator (`torch.Generator` or `List[torch.Generator]`, optional):
	            Forwarded to `prepare_latents` and `prepare_extra_step_kwargs`.
	        latents (`torch.FloatTensor`, optional):
	            Pre-generated latents forwarded to `prepare_latents`.
	        prompt_embeds (`torch.FloatTensor`, optional):
	            Pre-generated text embeddings. When this or `negative_prompt_embeds` is given, encoding
	            goes through `_encode_promptv2`; otherwise `_encode_prompt` encodes `prompt`.
	        negative_prompt_embeds (`torch.FloatTensor`, optional):
	            Pre-generated negative text embeddings.
	        output_type (`str`, optional, defaults to `"pil"`):
	            Output format; `"latent"` skips VAE decoding and the safety checker.
	        return_dict (`bool`, optional, defaults to `True`):
	            Whether to return a `StableDiffusionPipelineOutput` instead of a plain tuple.
	        callback (`Callable`, optional):
	            Called as `callback(step, timestep, latents)` every `callback_steps` steps.
	        callback_steps (`int`, optional, defaults to 1):
	            The frequency at which `callback` is called.
	        cross_attention_kwargs (`dict`, optional):
	            When given, passed to the UNet call as `cross_attention_kwargs`.

	    Returns:
	        `StableDiffusionPipelineOutput` if `return_dict` is True, otherwise a tuple
	        `(images, nsfw_content_detected)`.
	    """
	    # 0. Default height and width to unet
	    height = height or self.unet.config.sample_size * self.vae_scale_factor
	    width = width or self.unet.config.sample_size * self.vae_scale_factor

	    # 1. Check inputs. Raise error if not correct
	    # The embeddings are passed along so that a call with `prompt=None` and
	    # `prompt_embeds` set is validated instead of rejected.
	    self.check_inputs(
	        prompt,
	        height,
	        width,
	        callback_steps,
	        negative_prompt=negative_prompt,
	        prompt_embeds=prompt_embeds,
	        negative_prompt_embeds=negative_prompt_embeds,
	    )

	    # 2. Define call parameters
	    if isinstance(prompt, str):
	        batch_size = 1
	    elif prompt is not None:
	        batch_size = len(prompt)
	    else:
	        batch_size = prompt_embeds.shape[0]
	    device = self._execution_device
	    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
	    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
	    # corresponds to doing no classifier free guidance.
	    do_classifier_free_guidance = guidance_scale > 1.0

	    # 3. Encode input prompt
	    if prompt_embeds is None and negative_prompt_embeds is None:
	        # Text-only call: keep the original encoding path.
	        text_embeddings = self._encode_prompt(
	            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
	        )
	    else:
	        # Honor caller-supplied embeddings instead of silently ignoring them.
	        text_embeddings = self._encode_promptv2(
	            prompt,
	            device,
	            num_images_per_prompt,
	            do_classifier_free_guidance,
	            negative_prompt,
	            prompt_embeds=prompt_embeds,
	            negative_prompt_embeds=negative_prompt_embeds,
	        )

	    # 4. Prepare timesteps (on the execution device, as inferV4 does)
	    self.scheduler.set_timesteps(num_inference_steps, device=device)
	    timesteps = self.scheduler.timesteps

	    # 5. Prepare latent variables
	    num_channels_latents = self.unet.in_channels
	    latents = self.prepare_latents(
	        batch_size * num_images_per_prompt,
	        num_channels_latents,
	        height,
	        width,
	        text_embeddings.dtype,
	        device,
	        generator,
	        latents,
	    )

	    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
	    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

	    # Only add the keyword when the caller supplied it, so the default UNet
	    # call is unchanged.
	    unet_kwargs = {}
	    if cross_attention_kwargs is not None:
	        unet_kwargs["cross_attention_kwargs"] = cross_attention_kwargs

	    # 7. Denoising loop
	    num_warmup_steps = len(timesteps) - \
	        num_inference_steps * self.scheduler.order
	    with self.progress_bar(total=num_inference_steps) as progress_bar:
	        for i, t in enumerate(timesteps):
	            # expand the latents if we are doing classifier free guidance
	            latent_model_input = torch.cat(
	                [latents] * 2) if do_classifier_free_guidance else latents
	            latent_model_input = self.scheduler.scale_model_input(
	                latent_model_input, t)

	            # predict the noise residual
	            noise_pred = self.unet(
	                latent_model_input, t, encoder_hidden_states=text_embeddings, **unet_kwargs).sample

	            # perform guidance
	            if do_classifier_free_guidance:
	                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
	                noise_pred = noise_pred_uncond + guidance_scale * \
	                    (noise_pred_text - noise_pred_uncond)

	            # compute the previous noisy sample x_t -> x_t-1
	            latents = self.scheduler.step(
	                noise_pred, t, latents, **extra_step_kwargs).prev_sample

	            # call the callback, if provided
	            if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
	                progress_bar.update()
	                if callback is not None and i % callback_steps == 0:
	                    callback(i, t, latents)

	    # 8. Decode and post-process
	    if output_type != "latent":
	        image = self.vae.decode(
	            latents / self.vae.config.scaling_factor, return_dict=False)[0]
	        image, has_nsfw_concept = self.run_safety_checker(
	            image, device, text_embeddings.dtype)
	    else:
	        image = latents
	        has_nsfw_concept = None

	    if has_nsfw_concept is None:
	        do_denormalize = [True] * image.shape[0]
	    else:
	        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

	    image = self.image_processor.postprocess(
	        image, output_type=output_type, do_denormalize=do_denormalize)

	    # Offload last model to CPU
	    if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
	        self.final_offload_hook.offload()

	    if not return_dict:
	        return (image, has_nsfw_concept)

	    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)