radames committed on
Commit f1466df · 2 Parent(s): a7d6e4d edcf6dc

Merge branch 'main' into text2igm
README.md CHANGED
@@ -16,10 +16,11 @@ You need a webcam to run this demo. 🤗
 
 ## Running Locally
 
-You need CUDA and Python 3.10 or a Mac with an M1/M2/M3 chip
-`TIMEOUT`: limit user session timeout
-`SAFETY_CHECKER`: disabled if you want NSFW filter off
-`MAX_QUEUE_SIZE`: limit number of users on current app instance
+You need CUDA and Python 3.10, Mac with an M1/M2/M3 chip or Intel Arc GPU
+
+`TIMEOUT`: limit user session timeout
+`SAFETY_CHECKER`: disabled if you want NSFW filter off
+`MAX_QUEUE_SIZE`: limit number of users on current app instance
 
 ### image to image
 
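The three environment variables documented above are read straight from the process environment by both app scripts. A minimal sketch of that pattern, with names and defaults taken from the app-txt2img.py hunk further down (the reading of 0 as "no limit"/"no timeout" is an assumption):

```python
import os

# Defaults mirror the app scripts in this commit; the NSFW safety checker is
# only enabled when SAFETY_CHECKER is exactly the string "True".
MAX_QUEUE_SIZE = int(os.environ.get("MAX_QUEUE_SIZE", 0))  # presumably 0 = no user limit
TIMEOUT = float(os.environ.get("TIMEOUT", 0))              # presumably 0 = no session timeout
SAFETY_CHECKER = os.environ.get("SAFETY_CHECKER", None)

print(f"queue limit: {MAX_QUEUE_SIZE}, timeout: {TIMEOUT}s, "
      f"safety checker: {SAFETY_CHECKER == 'True'}")
```
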
app-img2img.py CHANGED
@@ -9,9 +9,14 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, JSONResponse
 from fastapi.staticfiles import StaticFiles
 
-from diffusers import DiffusionPipeline, AutoencoderTiny
+from diffusers import AutoPipelineForImage2Image, AutoencoderTiny
 from compel import Compel
 import torch
+
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 from PIL import Image
 import numpy as np
 import gradio as gr
@@ -27,11 +32,14 @@ SAFETY_CHECKER = os.environ.get("SAFETY_CHECKER", None)
 WIDTH = 512
 HEIGHT = 512
 # disable tiny autoencoder for better quality speed tradeoff
-USE_TINY_AUTOENCODER=True
+USE_TINY_AUTOENCODER = True
 
 # check if MPS is available OSX only M1/M2/M3 chips
 mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+xpu_available = hasattr(torch, "xpu") and torch.xpu.is_available()
+device = torch.device(
+    "cuda" if torch.cuda.is_available() else "xpu" if xpu_available else "cpu"
+)
 torch_device = device
 
 # change to torch.float16 to save GPU memory
@@ -48,17 +56,13 @@ if mps_available:
     torch_dtype = torch.float32
 
 if SAFETY_CHECKER == "True":
-    pipe = DiffusionPipeline.from_pretrained(
+    pipe = AutoPipelineForImage2Image.from_pretrained(
         "SimianLuo/LCM_Dreamshaper_v7",
-        custom_pipeline="latent_consistency_img2img.py",
-        custom_revision="main",
     )
 else:
-    pipe = DiffusionPipeline.from_pretrained(
+    pipe = AutoPipelineForImage2Image.from_pretrained(
         "SimianLuo/LCM_Dreamshaper_v7",
         safety_checker=None,
-        custom_pipeline="latent_consistency_img2img.py",
-        custom_revision="main",
     )
 
 if USE_TINY_AUTOENCODER:
@@ -66,13 +70,13 @@ if USE_TINY_AUTOENCODER:
         "madebyollin/taesd", torch_dtype=torch_dtype, use_safetensors=True
     )
 pipe.set_progress_bar_config(disable=True)
-pipe.to(torch_device=torch_device, torch_dtype=torch_dtype).to(device)
+pipe.to(device=torch_device, dtype=torch_dtype).to(device)
 pipe.unet.to(memory_format=torch.channels_last)
 
 if psutil.virtual_memory().total < 64 * 1024**3:
     pipe.enable_attention_slicing()
 
-if not mps_available:
+if not mps_available and not xpu_available:
     pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
     pipe(prompt="warmup", image=[Image.new("RGB", (512, 512))])
 
@@ -93,7 +97,9 @@ class InputParams(BaseModel):
     height: int = HEIGHT
 
 
-def predict(input_image: Image.Image, params: InputParams, prompt_embeds: torch.Tensor = None):
+def predict(
+    input_image: Image.Image, params: InputParams, prompt_embeds: torch.Tensor = None
+):
     generator = torch.manual_seed(params.seed)
     # Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps.
     num_inference_steps = 3
@@ -106,7 +112,7 @@ def predict(input_image: Image.Image, params: InputParams, prompt_embeds: torch.
         guidance_scale=params.guidance_scale,
         width=params.width,
         height=params.height,
-        lcm_origin_steps=50,
+        original_inference_steps=50,
         output_type="pil",
     )
     nsfw_content_detected = (
@@ -176,6 +182,7 @@ async def stream(user_id: uuid.UUID):
     try:
         user_queue = user_queue_map[uid]
         queue = user_queue["queue"]
+
        async def generate():
            last_prompt: str = None
            prompt_embeds: torch.Tensor = None
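
The substantive change in app-img2img.py is Intel XPU support: the pipeline switches to AutoPipelineForImage2Image, intel_extension_for_pytorch is imported opportunistically, the device falls through CUDA → XPU → CPU, and torch.compile is skipped on MPS and XPU. A standalone sketch of that device-selection logic, assuming only that torch is installed (IPEX is optional):

```python
import torch

try:
    # Importing IPEX is what exposes torch.xpu for Intel Arc GPUs here; if it is
    # not installed we simply fall back to the other backends.
    import intel_extension_for_pytorch as ipex  # noqa: F401
except ImportError:
    ipex = None

xpu_available = hasattr(torch, "xpu") and torch.xpu.is_available()
mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()

device = torch.device(
    "cuda" if torch.cuda.is_available() else "xpu" if xpu_available else "cpu"
)

# The apps only compile the UNet when neither MPS nor XPU is in use.
use_torch_compile = not mps_available and not xpu_available
print(f"device: {device}, compile unet: {use_torch_compile}")
```
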
app-txt2img.py CHANGED
@@ -12,6 +12,11 @@ from fastapi.staticfiles import StaticFiles
 from diffusers import DiffusionPipeline, AutoencoderTiny
 from compel import Compel
 import torch
+
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 from PIL import Image
 import numpy as np
 import gradio as gr
@@ -25,14 +30,17 @@ import psutil
 MAX_QUEUE_SIZE = int(os.environ.get("MAX_QUEUE_SIZE", 0))
 TIMEOUT = float(os.environ.get("TIMEOUT", 0))
 SAFETY_CHECKER = os.environ.get("SAFETY_CHECKER", None)
-WIDTH = 512
-HEIGHT = 512
+WIDTH = 768
+HEIGHT = 768
 # disable tiny autoencoder for better quality speed tradeoff
-USE_TINY_AUTOENCODER=True
+USE_TINY_AUTOENCODER = False
 
 # check if MPS is available OSX only M1/M2/M3 chips
 mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+xpu_available = hasattr(torch, "xpu") and torch.xpu.is_available()
+device = torch.device(
+    "cuda" if torch.cuda.is_available() else "xpu" if xpu_available else "cpu"
+)
 torch_device = device
 # change to torch.float16 to save GPU memory
 torch_dtype = torch.float32
@@ -50,29 +58,25 @@ if mps_available:
 if SAFETY_CHECKER == "True":
     pipe = DiffusionPipeline.from_pretrained(
         "SimianLuo/LCM_Dreamshaper_v7",
-        custom_pipeline="latent_consistency_txt2img.py",
-        custom_revision="main",
     )
 else:
     pipe = DiffusionPipeline.from_pretrained(
         "SimianLuo/LCM_Dreamshaper_v7",
         safety_checker=None,
-        custom_pipeline="latent_consistency_txt2img.py",
-        custom_revision="main",
     )
 if USE_TINY_AUTOENCODER:
     pipe.vae = AutoencoderTiny.from_pretrained(
         "madebyollin/taesd", torch_dtype=torch_dtype, use_safetensors=True
     )
 pipe.set_progress_bar_config(disable=True)
-pipe.to(torch_device=torch_device, torch_dtype=torch_dtype).to(device)
+pipe.to(device=torch_device, dtype=torch_dtype).to(device)
 pipe.unet.to(memory_format=torch.channels_last)
 
 # check if computer has less than 64GB of RAM using sys or os
 if psutil.virtual_memory().total < 64 * 1024**3:
     pipe.enable_attention_slicing()
 
-if not mps_available:
+if not mps_available and not xpu_available:
     pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
     pipe(prompt="warmup", num_inference_steps=1, guidance_scale=8.0)
 
@@ -83,6 +87,7 @@ compel_proc = Compel(
 )
 user_queue_map = {}
 
+
 class InputParams(BaseModel):
     prompt: str
     seed: int = 2159232
@@ -90,6 +95,7 @@ class InputParams(BaseModel):
     width: int = WIDTH
     height: int = HEIGHT
 
+
 def predict(params: InputParams):
     generator = torch.manual_seed(params.seed)
     prompt_embeds = compel_proc(params.prompt)
@@ -102,7 +108,7 @@ def predict(params: InputParams):
         guidance_scale=params.guidance_scale,
         width=params.width,
         height=params.height,
-        lcm_origin_steps=50,
+        original_inference_steps=50,
         output_type="pil",
     )
     nsfw_content_detected = (
@@ -124,6 +130,7 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
     await websocket.accept()
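
Both apps also rename the pipeline argument lcm_origin_steps to original_inference_steps, the name used by the LCM pipelines that ship with diffusers, which lines up with the deletion of the custom latent_consistency_*.py pipelines below. A hedged standalone sketch of the equivalent text-to-image call, reusing the model id and values from the hunks above and assuming a diffusers release with built-in LCM support:

```python
import torch
from diffusers import DiffusionPipeline

# "SimianLuo/LCM_Dreamshaper_v7" resolves to the LCM pipeline bundled with diffusers.
pipe = DiffusionPipeline.from_pretrained(
    "SimianLuo/LCM_Dreamshaper_v7", safety_checker=None
)
pipe = pipe.to("cpu")  # or "cuda" / "xpu", as selected in the apps

result = pipe(
    prompt="portrait photo of a cat, studio lighting",  # hypothetical prompt
    num_inference_steps=4,          # LCM works with very few steps (1~8 recommended)
    guidance_scale=8.0,
    width=768,
    height=768,
    original_inference_steps=50,    # renamed from lcm_origin_steps
    output_type="pil",
)
result.images[0].save("lcm_txt2img.png")
```
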
img2img/index.html CHANGED
@@ -257,7 +257,7 @@
           <output class="text-xs w-[50px] text-center font-light px-1 py-1 border border-gray-700 rounded-md">
             8.0</output>
           <label class="text-sm font-medium" for="strength">Strength</label>
-          <input type="range" id="strength" name="strength" min="0.20" max="1" step="0.001" value="0.50"
+          <input type="range" id="strength" name="strength" min="0.02" max="1" step="0.001" value="0.50"
             oninput="this.nextElementSibling.value = Number(this.value).toFixed(2)">
           <output class="text-xs w-[50px] text-center font-light px-1 py-1 border border-gray-700 rounded-md">
             0.5</output>
latent_consistency_img2img.py DELETED
@@ -1,934 +0,0 @@
1
- # Copyright 2023 Stanford University Team and The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- # DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
16
- # and https://github.com/hojonathanho/diffusion
17
-
18
- import math
19
- from dataclasses import dataclass
20
- from typing import Any, Dict, List, Optional, Tuple, Union
21
-
22
- import numpy as np
23
- import PIL.Image
24
- import torch
25
- from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
26
-
27
- from diffusers import (
28
- AutoencoderTiny,
29
- AutoencoderKL,
30
- ConfigMixin,
31
- DiffusionPipeline,
32
- SchedulerMixin,
33
- UNet2DConditionModel,
34
- logging,
35
- )
36
- from diffusers.configuration_utils import register_to_config
37
- from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
38
- from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
39
- from diffusers.pipelines.stable_diffusion.safety_checker import (
40
- StableDiffusionSafetyChecker,
41
- )
42
- from diffusers.utils import BaseOutput
43
- from diffusers.utils.torch_utils import randn_tensor
44
-
45
-
46
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
47
-
48
-
49
- class LatentConsistencyModelImg2ImgPipeline(DiffusionPipeline):
50
- _optional_components = ["scheduler"]
51
-
52
- def __init__(
53
- self,
54
- vae: AutoencoderKL,
55
- text_encoder: CLIPTextModel,
56
- tokenizer: CLIPTokenizer,
57
- unet: UNet2DConditionModel,
58
- scheduler: "LCMSchedulerWithTimestamp",
59
- safety_checker: StableDiffusionSafetyChecker,
60
- feature_extractor: CLIPImageProcessor,
61
- requires_safety_checker: bool = True,
62
- ):
63
- super().__init__()
64
-
65
- scheduler = (
66
- scheduler
67
- if scheduler is not None
68
- else LCMSchedulerWithTimestamp(
69
- beta_start=0.00085,
70
- beta_end=0.0120,
71
- beta_schedule="scaled_linear",
72
- prediction_type="epsilon",
73
- )
74
- )
75
-
76
- self.register_modules(
77
- vae=vae,
78
- text_encoder=text_encoder,
79
- tokenizer=tokenizer,
80
- unet=unet,
81
- scheduler=scheduler,
82
- safety_checker=safety_checker,
83
- feature_extractor=feature_extractor,
84
- )
85
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
86
- self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
87
-
88
- def _encode_prompt(
89
- self,
90
- prompt,
91
- device,
92
- num_images_per_prompt,
93
- prompt_embeds: None,
94
- ):
95
- r"""
96
- Encodes the prompt into text encoder hidden states.
97
- Args:
98
- prompt (`str` or `List[str]`, *optional*):
99
- prompt to be encoded
100
- device: (`torch.device`):
101
- torch device
102
- num_images_per_prompt (`int`):
103
- number of images that should be generated per prompt
104
- prompt_embeds (`torch.FloatTensor`, *optional*):
105
- Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
106
- provided, text embeddings will be generated from `prompt` input argument.
107
- """
108
-
109
- if prompt is not None and isinstance(prompt, str):
110
- pass
111
- elif prompt is not None and isinstance(prompt, list):
112
- len(prompt)
113
- else:
114
- prompt_embeds.shape[0]
115
-
116
- if prompt_embeds is None:
117
- text_inputs = self.tokenizer(
118
- prompt,
119
- padding="max_length",
120
- max_length=self.tokenizer.model_max_length,
121
- truncation=True,
122
- return_tensors="pt",
123
- )
124
- text_input_ids = text_inputs.input_ids
125
- untruncated_ids = self.tokenizer(
126
- prompt, padding="longest", return_tensors="pt"
127
- ).input_ids
128
-
129
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
130
- -1
131
- ] and not torch.equal(text_input_ids, untruncated_ids):
132
- removed_text = self.tokenizer.batch_decode(
133
- untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
134
- )
135
- logger.warning(
136
- "The following part of your input was truncated because CLIP can only handle sequences up to"
137
- f" {self.tokenizer.model_max_length} tokens: {removed_text}"
138
- )
139
-
140
- if (
141
- hasattr(self.text_encoder.config, "use_attention_mask")
142
- and self.text_encoder.config.use_attention_mask
143
- ):
144
- attention_mask = text_inputs.attention_mask.to(device)
145
- else:
146
- attention_mask = None
147
-
148
- prompt_embeds = self.text_encoder(
149
- text_input_ids.to(device),
150
- attention_mask=attention_mask,
151
- )
152
- prompt_embeds = prompt_embeds[0]
153
-
154
- if self.text_encoder is not None:
155
- prompt_embeds_dtype = self.text_encoder.dtype
156
- elif self.unet is not None:
157
- prompt_embeds_dtype = self.unet.dtype
158
- else:
159
- prompt_embeds_dtype = prompt_embeds.dtype
160
-
161
- prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
162
-
163
- bs_embed, seq_len, _ = prompt_embeds.shape
164
- # duplicate text embeddings for each generation per prompt, using mps friendly method
165
- prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
166
- prompt_embeds = prompt_embeds.view(
167
- bs_embed * num_images_per_prompt, seq_len, -1
168
- )
169
-
170
- # Don't need to get uncond prompt embedding because of LCM Guided Distillation
171
- return prompt_embeds
172
-
173
- def run_safety_checker(self, image, device, dtype):
174
- if self.safety_checker is None:
175
- has_nsfw_concept = None
176
- else:
177
- if torch.is_tensor(image):
178
- feature_extractor_input = self.image_processor.postprocess(
179
- image, output_type="pil"
180
- )
181
- else:
182
- feature_extractor_input = self.image_processor.numpy_to_pil(image)
183
- safety_checker_input = self.feature_extractor(
184
- feature_extractor_input, return_tensors="pt"
185
- ).to(device)
186
- image, has_nsfw_concept = self.safety_checker(
187
- images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
188
- )
189
- return image, has_nsfw_concept
190
-
191
- def prepare_latents(
192
- self,
193
- image,
194
- timestep,
195
- batch_size,
196
- num_channels_latents,
197
- height,
198
- width,
199
- dtype,
200
- device,
201
- latents=None,
202
- generator=None,
203
- ):
204
- shape = (
205
- batch_size,
206
- num_channels_latents,
207
- height // self.vae_scale_factor,
208
- width // self.vae_scale_factor,
209
- )
210
-
211
- if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
212
- raise ValueError(
213
- f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
214
- )
215
-
216
- image = image.to(device=device, dtype=dtype)
217
-
218
- # batch_size = batch_size * num_images_per_prompt
219
- if image.shape[1] == 4:
220
- init_latents = image
221
-
222
- else:
223
- if isinstance(generator, list) and len(generator) != batch_size:
224
- raise ValueError(
225
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
226
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
227
- )
228
-
229
- elif isinstance(generator, list):
230
- if isinstance(self.vae, AutoencoderTiny):
231
- init_latents = [
232
- self.vae.encode(image[i : i + 1]).latents
233
- for i in range(batch_size)
234
- ]
235
- else:
236
- init_latents = [
237
- self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i])
238
- for i in range(batch_size)
239
- ]
240
- init_latents = torch.cat(init_latents, dim=0)
241
- else:
242
- if isinstance(self.vae, AutoencoderTiny):
243
- init_latents = self.vae.encode(image).latents
244
- else:
245
- init_latents = self.vae.encode(image).latent_dist.sample(generator)
246
-
247
- init_latents = self.vae.config.scaling_factor * init_latents
248
-
249
- if (
250
- batch_size > init_latents.shape[0]
251
- and batch_size % init_latents.shape[0] == 0
252
- ):
253
- # expand init_latents for batch_size
254
- (
255
- f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
256
- " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
257
- " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
258
- " your script to pass as many initial images as text prompts to suppress this warning."
259
- )
260
- # deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
261
- additional_image_per_prompt = batch_size // init_latents.shape[0]
262
- init_latents = torch.cat(
263
- [init_latents] * additional_image_per_prompt, dim=0
264
- )
265
- elif (
266
- batch_size > init_latents.shape[0]
267
- and batch_size % init_latents.shape[0] != 0
268
- ):
269
- raise ValueError(
270
- f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
271
- )
272
- else:
273
- init_latents = torch.cat([init_latents], dim=0)
274
-
275
- shape = init_latents.shape
276
- noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
277
-
278
- # get latents
279
- init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
280
- latents = init_latents
281
-
282
- if latents is None:
283
- latents = torch.randn(shape, dtype=dtype).to(device)
284
- else:
285
- latents = latents.to(device)
286
- # scale the initial noise by the standard deviation required by the scheduler
287
- latents = latents * self.scheduler.init_noise_sigma
288
- return latents
289
-
290
- def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
291
- """
292
- see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
293
- Args:
294
- timesteps: torch.Tensor: generate embedding vectors at these timesteps
295
- embedding_dim: int: dimension of the embeddings to generate
296
- dtype: data type of the generated embeddings
297
- Returns:
298
- embedding vectors with shape `(len(timesteps), embedding_dim)`
299
- """
300
- assert len(w.shape) == 1
301
- w = w * 1000.0
302
-
303
- half_dim = embedding_dim // 2
304
- emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
305
- emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
306
- emb = w.to(dtype)[:, None] * emb[None, :]
307
- emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
308
- if embedding_dim % 2 == 1: # zero pad
309
- emb = torch.nn.functional.pad(emb, (0, 1))
310
- assert emb.shape == (w.shape[0], embedding_dim)
311
- return emb
312
-
313
- def get_timesteps(self, num_inference_steps, strength, device):
314
- # get the original timestep using init_timestep
315
- init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
316
-
317
- t_start = max(num_inference_steps - init_timestep, 0)
318
- timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
319
-
320
- return timesteps, num_inference_steps - t_start
321
-
322
- @torch.no_grad()
323
- def __call__(
324
- self,
325
- prompt: Union[str, List[str]] = None,
326
- image: PipelineImageInput = None,
327
- strength: float = 0.8,
328
- height: Optional[int] = 768,
329
- width: Optional[int] = 768,
330
- guidance_scale: float = 7.5,
331
- num_images_per_prompt: Optional[int] = 1,
332
- latents: Optional[torch.FloatTensor] = None,
333
- generator: Optional[torch.Generator] = None,
334
- num_inference_steps: int = 4,
335
- lcm_origin_steps: int = 50,
336
- prompt_embeds: Optional[torch.FloatTensor] = None,
337
- output_type: Optional[str] = "pil",
338
- return_dict: bool = True,
339
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
340
- ):
341
- # 0. Default height and width to unet
342
- height = height or self.unet.config.sample_size * self.vae_scale_factor
343
- width = width or self.unet.config.sample_size * self.vae_scale_factor
344
-
345
- # 2. Define call parameters
346
- if prompt is not None and isinstance(prompt, str):
347
- batch_size = 1
348
- elif prompt is not None and isinstance(prompt, list):
349
- batch_size = len(prompt)
350
- else:
351
- batch_size = prompt_embeds.shape[0]
352
-
353
- device = self._execution_device
354
- # do_classifier_free_guidance = guidance_scale > 0.0 # In LCM Implementation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG)
355
-
356
- # 3. Encode input prompt
357
- prompt_embeds = self._encode_prompt(
358
- prompt,
359
- device,
360
- num_images_per_prompt,
361
- prompt_embeds=prompt_embeds,
362
- )
363
-
364
- # 3.5 encode image
365
- image = self.image_processor.preprocess(image)
366
-
367
- # 4. Prepare timesteps
368
- self.scheduler.set_timesteps(strength, num_inference_steps, lcm_origin_steps)
369
- # timesteps = self.scheduler.timesteps
370
- # timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, 1.0, device)
371
- timesteps = self.scheduler.timesteps
372
- latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
373
-
374
- print("timesteps: ", timesteps)
375
-
376
- # 5. Prepare latent variable
377
- num_channels_latents = self.unet.config.in_channels
378
- latents = self.prepare_latents(
379
- image,
380
- latent_timestep,
381
- batch_size * num_images_per_prompt,
382
- num_channels_latents,
383
- height,
384
- width,
385
- prompt_embeds.dtype,
386
- device,
387
- latents,
388
- generator
389
- )
390
- bs = batch_size * num_images_per_prompt
391
-
392
- # 6. Get Guidance Scale Embedding
393
- w = torch.tensor(guidance_scale).repeat(bs)
394
- w_embedding = self.get_w_embedding(w, embedding_dim=256).to(
395
- device=device, dtype=latents.dtype
396
- )
397
-
398
- # 7. LCM MultiStep Sampling Loop:
399
- with self.progress_bar(total=num_inference_steps) as progress_bar:
400
- for i, t in enumerate(timesteps):
401
- ts = torch.full((bs,), t, device=device, dtype=torch.long)
402
- latents = latents.to(prompt_embeds.dtype)
403
-
404
- # model prediction (v-prediction, eps, x)
405
- model_pred = self.unet(
406
- latents,
407
- ts,
408
- timestep_cond=w_embedding,
409
- encoder_hidden_states=prompt_embeds,
410
- cross_attention_kwargs=cross_attention_kwargs,
411
- return_dict=False,
412
- )[0]
413
-
414
- # compute the previous noisy sample x_t -> x_t-1
415
- latents, denoised = self.scheduler.step(
416
- model_pred, i, t, latents, return_dict=False
417
- )
418
-
419
- # # call the callback, if provided
420
- # if i == len(timesteps) - 1:
421
- progress_bar.update()
422
-
423
- denoised = denoised.to(prompt_embeds.dtype)
424
- if not output_type == "latent":
425
- image = self.vae.decode(
426
- denoised / self.vae.config.scaling_factor, return_dict=False
427
- )[0]
428
- image, has_nsfw_concept = self.run_safety_checker(
429
- image, device, prompt_embeds.dtype
430
- )
431
- else:
432
- image = denoised
433
- has_nsfw_concept = None
434
-
435
- if has_nsfw_concept is None:
436
- do_denormalize = [True] * image.shape[0]
437
- else:
438
- do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
439
-
440
- image = self.image_processor.postprocess(
441
- image, output_type=output_type, do_denormalize=do_denormalize
442
- )
443
-
444
- if not return_dict:
445
- return (image, has_nsfw_concept)
446
-
447
- return StableDiffusionPipelineOutput(
448
- images=image, nsfw_content_detected=has_nsfw_concept
449
- )
450
-
451
-
452
- @dataclass
453
- # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
454
- class LCMSchedulerOutput(BaseOutput):
455
- """
456
- Output class for the scheduler's `step` function output.
457
- Args:
458
- prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
459
- Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
460
- denoising loop.
461
- pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
462
- The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
463
- `pred_original_sample` can be used to preview progress or for guidance.
464
- """
465
-
466
- prev_sample: torch.FloatTensor
467
- denoised: Optional[torch.FloatTensor] = None
468
-
469
-
470
- # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
471
- def betas_for_alpha_bar(
472
- num_diffusion_timesteps,
473
- max_beta=0.999,
474
- alpha_transform_type="cosine",
475
- ):
476
- """
477
- Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
478
- (1-beta) over time from t = [0,1].
479
- Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
480
- to that part of the diffusion process.
481
- Args:
482
- num_diffusion_timesteps (`int`): the number of betas to produce.
483
- max_beta (`float`): the maximum beta to use; use values lower than 1 to
484
- prevent singularities.
485
- alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
486
- Choose from `cosine` or `exp`
487
- Returns:
488
- betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
489
- """
490
- if alpha_transform_type == "cosine":
491
-
492
- def alpha_bar_fn(t):
493
- return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
494
-
495
- elif alpha_transform_type == "exp":
496
-
497
- def alpha_bar_fn(t):
498
- return math.exp(t * -12.0)
499
-
500
- else:
501
- raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
502
-
503
- betas = []
504
- for i in range(num_diffusion_timesteps):
505
- t1 = i / num_diffusion_timesteps
506
- t2 = (i + 1) / num_diffusion_timesteps
507
- betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
508
- return torch.tensor(betas, dtype=torch.float32)
509
-
510
-
511
- def rescale_zero_terminal_snr(betas):
512
- """
513
- Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
514
- Args:
515
- betas (`torch.FloatTensor`):
516
- the betas that the scheduler is being initialized with.
517
- Returns:
518
- `torch.FloatTensor`: rescaled betas with zero terminal SNR
519
- """
520
- # Convert betas to alphas_bar_sqrt
521
- alphas = 1.0 - betas
522
- alphas_cumprod = torch.cumprod(alphas, dim=0)
523
- alphas_bar_sqrt = alphas_cumprod.sqrt()
524
-
525
- # Store old values.
526
- alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
527
- alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
528
-
529
- # Shift so the last timestep is zero.
530
- alphas_bar_sqrt -= alphas_bar_sqrt_T
531
-
532
- # Scale so the first timestep is back to the old value.
533
- alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
534
-
535
- # Convert alphas_bar_sqrt to betas
536
- alphas_bar = alphas_bar_sqrt**2 # Revert sqrt
537
- alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod
538
- alphas = torch.cat([alphas_bar[0:1], alphas])
539
- betas = 1 - alphas
540
-
541
- return betas
542
-
543
-
544
- class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
545
- """
546
- This class modifies LCMScheduler to add a timestamp argument to set_timesteps
547
-
548
-
549
- `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
550
- non-Markovian guidance.
551
- This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
552
- methods the library implements for all schedulers such as loading and saving.
553
- Args:
554
- num_train_timesteps (`int`, defaults to 1000):
555
- The number of diffusion steps to train the model.
556
- beta_start (`float`, defaults to 0.0001):
557
- The starting `beta` value of inference.
558
- beta_end (`float`, defaults to 0.02):
559
- The final `beta` value.
560
- beta_schedule (`str`, defaults to `"linear"`):
561
- The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
562
- `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
563
- trained_betas (`np.ndarray`, *optional*):
564
- Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
565
- clip_sample (`bool`, defaults to `True`):
566
- Clip the predicted sample for numerical stability.
567
- clip_sample_range (`float`, defaults to 1.0):
568
- The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
569
- set_alpha_to_one (`bool`, defaults to `True`):
570
- Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
571
- there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
572
- otherwise it uses the alpha value at step 0.
573
- steps_offset (`int`, defaults to 0):
574
- An offset added to the inference steps. You can use a combination of `offset=1` and
575
- `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
576
- Diffusion.
577
- prediction_type (`str`, defaults to `epsilon`, *optional*):
578
- Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
579
- `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
580
- Video](https://imagen.research.google/video/paper.pdf) paper).
581
- thresholding (`bool`, defaults to `False`):
582
- Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
583
- as Stable Diffusion.
584
- dynamic_thresholding_ratio (`float`, defaults to 0.995):
585
- The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
586
- sample_max_value (`float`, defaults to 1.0):
587
- The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
588
- timestep_spacing (`str`, defaults to `"leading"`):
589
- The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
590
- Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
591
- rescale_betas_zero_snr (`bool`, defaults to `False`):
592
- Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
593
- dark samples instead of limiting it to samples with medium brightness. Loosely related to
594
- [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
595
- """
596
-
597
- # _compatibles = [e.name for e in KarrasDiffusionSchedulers]
598
- order = 1
599
-
600
- @register_to_config
601
- def __init__(
602
- self,
603
- num_train_timesteps: int = 1000,
604
- beta_start: float = 0.0001,
605
- beta_end: float = 0.02,
606
- beta_schedule: str = "linear",
607
- trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
608
- clip_sample: bool = True,
609
- set_alpha_to_one: bool = True,
610
- steps_offset: int = 0,
611
- prediction_type: str = "epsilon",
612
- thresholding: bool = False,
613
- dynamic_thresholding_ratio: float = 0.995,
614
- clip_sample_range: float = 1.0,
615
- sample_max_value: float = 1.0,
616
- timestep_spacing: str = "leading",
617
- rescale_betas_zero_snr: bool = False,
618
- ):
619
- if trained_betas is not None:
620
- self.betas = torch.tensor(trained_betas, dtype=torch.float32)
621
- elif beta_schedule == "linear":
622
- self.betas = torch.linspace(
623
- beta_start, beta_end, num_train_timesteps, dtype=torch.float32
624
- )
625
- elif beta_schedule == "scaled_linear":
626
- # this schedule is very specific to the latent diffusion model.
627
- self.betas = (
628
- torch.linspace(
629
- beta_start**0.5,
630
- beta_end**0.5,
631
- num_train_timesteps,
632
- dtype=torch.float32,
633
- )
634
- ** 2
635
- )
636
- elif beta_schedule == "squaredcos_cap_v2":
637
- # Glide cosine schedule
638
- self.betas = betas_for_alpha_bar(num_train_timesteps)
639
- else:
640
- raise NotImplementedError(
641
- f"{beta_schedule} does is not implemented for {self.__class__}"
642
- )
643
-
644
- # Rescale for zero SNR
645
- if rescale_betas_zero_snr:
646
- self.betas = rescale_zero_terminal_snr(self.betas)
647
-
648
- self.alphas = 1.0 - self.betas
649
- self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
650
-
651
- # At every step in ddim, we are looking into the previous alphas_cumprod
652
- # For the final step, there is no previous alphas_cumprod because we are already at 0
653
- # `set_alpha_to_one` decides whether we set this parameter simply to one or
654
- # whether we use the final alpha of the "non-previous" one.
655
- self.final_alpha_cumprod = (
656
- torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
657
- )
658
-
659
- # standard deviation of the initial noise distribution
660
- self.init_noise_sigma = 1.0
661
-
662
- # setable values
663
- self.num_inference_steps = None
664
- self.timesteps = torch.from_numpy(
665
- np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)
666
- )
667
-
668
- def scale_model_input(
669
- self, sample: torch.FloatTensor, timestep: Optional[int] = None
670
- ) -> torch.FloatTensor:
671
- """
672
- Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
673
- current timestep.
674
- Args:
675
- sample (`torch.FloatTensor`):
676
- The input sample.
677
- timestep (`int`, *optional*):
678
- The current timestep in the diffusion chain.
679
- Returns:
680
- `torch.FloatTensor`:
681
- A scaled input sample.
682
- """
683
- return sample
684
-
685
- def _get_variance(self, timestep, prev_timestep):
686
- alpha_prod_t = self.alphas_cumprod[timestep]
687
- alpha_prod_t_prev = (
688
- self.alphas_cumprod[prev_timestep]
689
- if prev_timestep >= 0
690
- else self.final_alpha_cumprod
691
- )
692
- beta_prod_t = 1 - alpha_prod_t
693
- beta_prod_t_prev = 1 - alpha_prod_t_prev
694
-
695
- variance = (beta_prod_t_prev / beta_prod_t) * (
696
- 1 - alpha_prod_t / alpha_prod_t_prev
697
- )
698
-
699
- return variance
700
-
701
- # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
702
- def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
703
- """
704
- "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
705
- prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
706
- s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
707
- pixels from saturation at each step. We find that dynamic thresholding results in significantly better
708
- photorealism as well as better image-text alignment, especially when using very large guidance weights."
709
- https://arxiv.org/abs/2205.11487
710
- """
711
- dtype = sample.dtype
712
- batch_size, channels, height, width = sample.shape
713
-
714
- if dtype not in (torch.float32, torch.float64):
715
- sample = (
716
- sample.float()
717
- ) # upcast for quantile calculation, and clamp not implemented for cpu half
718
-
719
- # Flatten sample for doing quantile calculation along each image
720
- sample = sample.reshape(batch_size, channels * height * width)
721
-
722
- abs_sample = sample.abs() # "a certain percentile absolute pixel value"
723
-
724
- s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
725
- s = torch.clamp(
726
- s, min=1, max=self.config.sample_max_value
727
- ) # When clamped to min=1, equivalent to standard clipping to [-1, 1]
728
-
729
- s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0
730
- sample = (
731
- torch.clamp(sample, -s, s) / s
732
- ) # "we threshold xt0 to the range [-s, s] and then divide by s"
733
-
734
- sample = sample.reshape(batch_size, channels, height, width)
735
- sample = sample.to(dtype)
736
-
737
- return sample
738
-
739
- def set_timesteps(
740
- self,
741
- stength,
742
- num_inference_steps: int,
743
- lcm_origin_steps: int,
744
- device: Union[str, torch.device] = None,
745
- ):
746
- """
747
- Sets the discrete timesteps used for the diffusion chain (to be run before inference).
748
- Args:
749
- num_inference_steps (`int`):
750
- The number of diffusion steps used when generating samples with a pre-trained model.
751
- """
752
-
753
- if num_inference_steps > self.config.num_train_timesteps:
754
- raise ValueError(
755
- f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
756
- f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
757
- f" maximal {self.config.num_train_timesteps} timesteps."
758
- )
759
-
760
- self.num_inference_steps = num_inference_steps
761
-
762
- # LCM Timesteps Setting: # Linear Spacing
763
- c = self.config.num_train_timesteps // lcm_origin_steps
764
- lcm_origin_timesteps = (
765
- np.asarray(list(range(1, int(lcm_origin_steps * stength) + 1))) * c - 1
766
- ) # LCM Training Steps Schedule
767
- skipping_step = len(lcm_origin_timesteps) // num_inference_steps
768
- timesteps = lcm_origin_timesteps[::-skipping_step][
769
- :num_inference_steps
770
- ] # LCM Inference Steps Schedule
771
-
772
- self.timesteps = torch.from_numpy(timesteps.copy()).to(device)
773
-
774
- def get_scalings_for_boundary_condition_discrete(self, t):
775
- self.sigma_data = 0.5 # Default: 0.5
776
-
777
- # By dividing 0.1: This is almost a delta function at t=0.
778
- c_skip = self.sigma_data**2 / ((t / 0.1) ** 2 + self.sigma_data**2)
779
- c_out = (t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data**2) ** 0.5
780
- return c_skip, c_out
781
-
782
- def step(
783
- self,
784
- model_output: torch.FloatTensor,
785
- timeindex: int,
786
- timestep: int,
787
- sample: torch.FloatTensor,
788
- eta: float = 0.0,
789
- use_clipped_model_output: bool = False,
790
- generator=None,
791
- variance_noise: Optional[torch.FloatTensor] = None,
792
- return_dict: bool = True,
793
- ) -> Union[LCMSchedulerOutput, Tuple]:
794
- """
795
- Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
796
- process from the learned model outputs (most often the predicted noise).
797
- Args:
798
- model_output (`torch.FloatTensor`):
799
- The direct output from learned diffusion model.
800
- timestep (`float`):
801
- The current discrete timestep in the diffusion chain.
802
- sample (`torch.FloatTensor`):
803
- A current instance of a sample created by the diffusion process.
804
- eta (`float`):
805
- The weight of noise for added noise in diffusion step.
806
- use_clipped_model_output (`bool`, defaults to `False`):
807
- If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
808
- because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
809
- clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
810
- `use_clipped_model_output` has no effect.
811
- generator (`torch.Generator`, *optional*):
812
- A random number generator.
813
- variance_noise (`torch.FloatTensor`):
814
- Alternative to generating noise with `generator` by directly providing the noise for the variance
815
- itself. Useful for methods such as [`CycleDiffusion`].
816
- return_dict (`bool`, *optional*, defaults to `True`):
817
- Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
818
- Returns:
819
- [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
820
- If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
821
- tuple is returned where the first element is the sample tensor.
822
- """
823
- if self.num_inference_steps is None:
824
- raise ValueError(
825
- "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
826
- )
827
-
828
- # 1. get previous step value
829
- prev_timeindex = timeindex + 1
830
- if prev_timeindex < len(self.timesteps):
831
- prev_timestep = self.timesteps[prev_timeindex]
832
- else:
833
- prev_timestep = timestep
834
-
835
- # 2. compute alphas, betas
836
- alpha_prod_t = self.alphas_cumprod[timestep]
837
- alpha_prod_t_prev = (
838
- self.alphas_cumprod[prev_timestep]
839
- if prev_timestep >= 0
840
- else self.final_alpha_cumprod
841
- )
842
-
843
- beta_prod_t = 1 - alpha_prod_t
844
- beta_prod_t_prev = 1 - alpha_prod_t_prev
845
-
846
- # 3. Get scalings for boundary conditions
847
- c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
848
-
849
- # 4. Different Parameterization:
850
- parameterization = self.config.prediction_type
851
-
852
- if parameterization == "epsilon": # noise-prediction
853
- pred_x0 = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
854
-
855
- elif parameterization == "sample": # x-prediction
856
- pred_x0 = model_output
857
-
858
- elif parameterization == "v_prediction": # v-prediction
859
- pred_x0 = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
860
-
861
- # 4. Denoise model output using boundary conditions
862
- denoised = c_out * pred_x0 + c_skip * sample
863
-
864
- # 5. Sample z ~ N(0, I), For MultiStep Inference
865
- # Noise is not used for one-step sampling.
866
- if len(self.timesteps) > 1:
867
- noise = torch.randn(model_output.shape).to(model_output.device)
868
- prev_sample = (
869
- alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
870
- )
871
- else:
872
- prev_sample = denoised
873
-
874
- if not return_dict:
875
- return (prev_sample, denoised)
876
-
877
- return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
878
-
879
- # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
880
- def add_noise(
881
- self,
882
- original_samples: torch.FloatTensor,
883
- noise: torch.FloatTensor,
884
- timesteps: torch.IntTensor,
885
- ) -> torch.FloatTensor:
886
- # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
887
- alphas_cumprod = self.alphas_cumprod.to(
888
- device=original_samples.device, dtype=original_samples.dtype
889
- )
890
- timesteps = timesteps.to(original_samples.device)
891
-
892
- sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
893
- sqrt_alpha_prod = sqrt_alpha_prod.flatten()
894
- while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
895
- sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
896
-
897
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
898
- sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
899
- while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
900
- sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
901
-
902
- noisy_samples = (
903
- sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
904
- )
905
- return noisy_samples
906
-
907
- # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
908
- def get_velocity(
909
- self,
910
- sample: torch.FloatTensor,
911
- noise: torch.FloatTensor,
912
- timesteps: torch.IntTensor,
913
- ) -> torch.FloatTensor:
914
- # Make sure alphas_cumprod and timestep have same device and dtype as sample
915
- alphas_cumprod = self.alphas_cumprod.to(
916
- device=sample.device, dtype=sample.dtype
917
- )
918
- timesteps = timesteps.to(sample.device)
919
-
920
- sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
921
- sqrt_alpha_prod = sqrt_alpha_prod.flatten()
922
- while len(sqrt_alpha_prod.shape) < len(sample.shape):
923
- sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
924
-
925
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
926
- sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
927
- while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
928
- sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
929
-
930
- velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
931
- return velocity
932
-
933
- def __len__(self):
934
- return self.config.num_train_timesteps
 
latent_consistency_txt2img.py DELETED
@@ -1,836 +0,0 @@
1
- # Copyright 2023 Stanford University Team and The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- # DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
16
- # and https://github.com/hojonathanho/diffusion
17
-
18
- import math
19
- from dataclasses import dataclass
20
- from typing import Any, Dict, List, Optional, Tuple, Union
21
-
22
- import numpy as np
23
- import torch
24
- from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
25
-
26
- from diffusers import (
27
- AutoencoderKL,
28
- ConfigMixin,
29
- DiffusionPipeline,
30
- SchedulerMixin,
31
- UNet2DConditionModel,
32
- logging,
33
- )
34
- from diffusers.configuration_utils import register_to_config
35
- from diffusers.image_processor import VaeImageProcessor
36
- from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
37
- from diffusers.pipelines.stable_diffusion.safety_checker import (
38
- StableDiffusionSafetyChecker,
39
- )
40
- from diffusers.utils import BaseOutput
41
-
42
-
43
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-
-
- class LatentConsistencyModelPipeline(DiffusionPipeline):
- _optional_components = ["scheduler"]
-
- def __init__(
- self,
- vae: AutoencoderKL,
- text_encoder: CLIPTextModel,
- tokenizer: CLIPTokenizer,
- unet: UNet2DConditionModel,
- scheduler: "LCMScheduler",
- safety_checker: StableDiffusionSafetyChecker,
- feature_extractor: CLIPImageProcessor,
- requires_safety_checker: bool = True,
- ):
- super().__init__()
-
- scheduler = (
- scheduler
- if scheduler is not None
- else LCMScheduler(
- beta_start=0.00085,
- beta_end=0.0120,
- beta_schedule="scaled_linear",
- prediction_type="epsilon",
- )
- )
-
- self.register_modules(
- vae=vae,
- text_encoder=text_encoder,
- tokenizer=tokenizer,
- unet=unet,
- scheduler=scheduler,
- safety_checker=safety_checker,
- feature_extractor=feature_extractor,
- )
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
- self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
-
- def _encode_prompt(
- self,
- prompt,
- device,
- num_images_per_prompt,
- prompt_embeds: None,
- ):
- r"""
- Encodes the prompt into text encoder hidden states.
- Args:
- prompt (`str` or `List[str]`, *optional*):
- prompt to be encoded
- device: (`torch.device`):
- torch device
- num_images_per_prompt (`int`):
- number of images that should be generated per prompt
- prompt_embeds (`torch.FloatTensor`, *optional*):
- Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
- provided, text embeddings will be generated from `prompt` input argument.
- """
-
- if prompt is not None and isinstance(prompt, str):
- pass
- elif prompt is not None and isinstance(prompt, list):
- len(prompt)
- else:
- prompt_embeds.shape[0]
-
- if prompt_embeds is None:
- text_inputs = self.tokenizer(
- prompt,
- padding="max_length",
- max_length=self.tokenizer.model_max_length,
- truncation=True,
- return_tensors="pt",
- )
- text_input_ids = text_inputs.input_ids
- untruncated_ids = self.tokenizer(
- prompt, padding="longest", return_tensors="pt"
- ).input_ids
-
- if untruncated_ids.shape[-1] >= text_input_ids.shape[
- -1
- ] and not torch.equal(text_input_ids, untruncated_ids):
- removed_text = self.tokenizer.batch_decode(
- untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
- )
- logger.warning(
- "The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}"
- )
-
- if (
- hasattr(self.text_encoder.config, "use_attention_mask")
- and self.text_encoder.config.use_attention_mask
- ):
- attention_mask = text_inputs.attention_mask.to(device)
- else:
- attention_mask = None
-
- prompt_embeds = self.text_encoder(
- text_input_ids.to(device),
- attention_mask=attention_mask,
- )
- prompt_embeds = prompt_embeds[0]
-
- if self.text_encoder is not None:
- prompt_embeds_dtype = self.text_encoder.dtype
- elif self.unet is not None:
- prompt_embeds_dtype = self.unet.dtype
- else:
- prompt_embeds_dtype = prompt_embeds.dtype
-
- prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
-
- bs_embed, seq_len, _ = prompt_embeds.shape
- # duplicate text embeddings for each generation per prompt, using mps friendly method
- prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
- prompt_embeds = prompt_embeds.view(
- bs_embed * num_images_per_prompt, seq_len, -1
- )
-
- # Don't need to get uncond prompt embedding because of LCM Guided Distillation
- return prompt_embeds
-
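
Because LCM distills classifier-free guidance into the model, `_encode_prompt` above returns only the conditional embeddings and never builds an unconditional batch. A minimal sketch (not part of the original file) of computing the embeddings once and reusing them across calls; `encode_once` is an illustrative name and `pipe` is assumed to be an instance of this pipeline on a CUDA device:

import torch

@torch.no_grad()
def encode_once(pipe, prompt: str, device: str = "cuda") -> torch.Tensor:
    # Only conditional embeddings are needed; no negative prompt / CFG batch.
    return pipe._encode_prompt(prompt, device, num_images_per_prompt=1, prompt_embeds=None)

# cached = encode_once(pipe, "a photo of a cat")
# out = pipe(prompt=None, prompt_embeds=cached, num_inference_steps=4)
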
- def run_safety_checker(self, image, device, dtype):
- if self.safety_checker is None:
- has_nsfw_concept = None
- else:
- if torch.is_tensor(image):
- feature_extractor_input = self.image_processor.postprocess(
- image, output_type="pil"
- )
- else:
- feature_extractor_input = self.image_processor.numpy_to_pil(image)
- safety_checker_input = self.feature_extractor(
- feature_extractor_input, return_tensors="pt"
- ).to(device)
- image, has_nsfw_concept = self.safety_checker(
- images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
- )
- return image, has_nsfw_concept
-
- def prepare_latents(
- self,
- batch_size,
- num_channels_latents,
- height,
- width,
- dtype,
- device,
- latents=None,
- generator=None,
- ):
- shape = (
- batch_size,
- num_channels_latents,
- height // self.vae_scale_factor,
- width // self.vae_scale_factor,
- )
- if generator is None:
- generator = torch.Generator()
- generator.manual_seed(torch.randint(0, 2 ** 32, (1,)).item())
-
- if latents is None:
- latents = torch.randn(shape, dtype=dtype, generator=generator).to(device)
- else:
- latents = latents.to(device)
- # scale the initial noise by the standard deviation required by the scheduler
- latents = latents * self.scheduler.init_noise_sigma
- return latents
-
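
`prepare_latents` above draws the initial noise with an explicit `torch.Generator`, so a fixed seed reproduces the same starting latents. An illustrative check (not part of the original file; the shape assumes 4 latent channels and a VAE scale factor of 8, i.e. a 512x512 image maps to a 64x64 latent grid):

import torch

g = torch.Generator().manual_seed(1234)
a = torch.randn((1, 4, 64, 64), generator=g)
g = torch.Generator().manual_seed(1234)
b = torch.randn((1, 4, 64, 64), generator=g)
assert torch.equal(a, b)  # same seed -> identical initial latents
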
- def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
- """
- see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
- Args:
- timesteps: torch.Tensor: generate embedding vectors at these timesteps
- embedding_dim: int: dimension of the embeddings to generate
- dtype: data type of the generated embeddings
- Returns:
- embedding vectors with shape `(len(timesteps), embedding_dim)`
- """
- assert len(w.shape) == 1
- w = w * 1000.0
-
- half_dim = embedding_dim // 2
- emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
- emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
- emb = w.to(dtype)[:, None] * emb[None, :]
- emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
- if embedding_dim % 2 == 1: # zero pad
- emb = torch.nn.functional.pad(emb, (0, 1))
- assert emb.shape == (w.shape[0], embedding_dim)
- return emb
-
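
The guidance scale is injected through a sinusoidal embedding rather than through classifier-free guidance. A standalone re-implementation of the same transform, for illustration only (`guidance_embedding` is a hypothetical name; the pipeline itself calls `get_w_embedding` with `embedding_dim=256`):

import torch

def guidance_embedding(w: torch.Tensor, embedding_dim: int = 256) -> torch.Tensor:
    # Same math as get_w_embedding above: scale w, then sin/cos at log-spaced frequencies.
    w = w * 1000.0
    half_dim = embedding_dim // 2
    freqs = torch.exp(-torch.log(torch.tensor(10000.0)) * torch.arange(half_dim) / (half_dim - 1))
    args = w.float()[:, None] * freqs[None, :]
    return torch.cat([torch.sin(args), torch.cos(args)], dim=1)

print(guidance_embedding(torch.tensor([8.0])).shape)  # torch.Size([1, 256])
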
- @torch.no_grad()
- def __call__(
- self,
- prompt: Union[str, List[str]] = None,
- height: Optional[int] = 768,
- width: Optional[int] = 768,
- guidance_scale: float = 7.5,
- num_images_per_prompt: Optional[int] = 1,
- latents: Optional[torch.FloatTensor] = None,
- generator: Optional[torch.Generator] = None,
- num_inference_steps: int = 4,
- lcm_origin_steps: int = 50,
- prompt_embeds: Optional[torch.FloatTensor] = None,
- output_type: Optional[str] = "pil",
- return_dict: bool = True,
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
- ):
- # 0. Default height and width to unet
- height = height or self.unet.config.sample_size * self.vae_scale_factor
- width = width or self.unet.config.sample_size * self.vae_scale_factor
-
- # 2. Define call parameters
- if prompt is not None and isinstance(prompt, str):
- batch_size = 1
- elif prompt is not None and isinstance(prompt, list):
- batch_size = len(prompt)
- else:
- batch_size = prompt_embeds.shape[0]
-
- device = self._execution_device
- # do_classifier_free_guidance = guidance_scale > 0.0 # In LCM Implementation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG)
-
- # 3. Encode input prompt
- prompt_embeds = self._encode_prompt(
- prompt,
- device,
- num_images_per_prompt,
- prompt_embeds=prompt_embeds,
- )
-
- # 4. Prepare timesteps
- self.scheduler.set_timesteps(num_inference_steps, lcm_origin_steps)
- timesteps = self.scheduler.timesteps
-
- # 5. Prepare latent variable
- num_channels_latents = self.unet.config.in_channels
- latents = self.prepare_latents(
- batch_size * num_images_per_prompt,
- num_channels_latents,
- height,
- width,
- prompt_embeds.dtype,
- device,
- latents,
- generator
- )
- bs = batch_size * num_images_per_prompt
-
- # 6. Get Guidance Scale Embedding
- w = torch.tensor(guidance_scale).repeat(bs)
- w_embedding = self.get_w_embedding(w, embedding_dim=256).to(
- device=device, dtype=latents.dtype
- )
-
- # 7. LCM MultiStep Sampling Loop:
- with self.progress_bar(total=num_inference_steps) as progress_bar:
- for i, t in enumerate(timesteps):
- ts = torch.full((bs,), t, device=device, dtype=torch.long)
- latents = latents.to(prompt_embeds.dtype)
-
- # model prediction (v-prediction, eps, x)
- model_pred = self.unet(
- latents,
- ts,
- timestep_cond=w_embedding,
- encoder_hidden_states=prompt_embeds,
- cross_attention_kwargs=cross_attention_kwargs,
- return_dict=False,
- )[0]
-
- # compute the previous noisy sample x_t -> x_t-1
- latents, denoised = self.scheduler.step(
- model_pred, i, t, latents, return_dict=False
- )
-
- # # call the callback, if provided
- # if i == len(timesteps) - 1:
- progress_bar.update()
-
- denoised = denoised.to(prompt_embeds.dtype)
- if not output_type == "latent":
- image = self.vae.decode(
- denoised / self.vae.config.scaling_factor, return_dict=False
- )[0]
- image, has_nsfw_concept = self.run_safety_checker(
- image, device, prompt_embeds.dtype
- )
- else:
- image = denoised
- has_nsfw_concept = None
-
- if has_nsfw_concept is None:
- do_denormalize = [True] * image.shape[0]
- else:
- do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
-
- image = self.image_processor.postprocess(
- image, output_type=output_type, do_denormalize=do_denormalize
- )
-
- if not return_dict:
- return (image, has_nsfw_concept)
-
- return StableDiffusionPipelineOutput(
- images=image, nsfw_content_detected=has_nsfw_concept
- )
-
-
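
Taken together, `__call__` runs only a handful of LCM denoising steps and decodes once at the end. An illustrative invocation of this (now removed) custom pipeline; `generate` is a hypothetical helper and `pipe` is assumed to be a loaded instance, with step counts matching the defaults in the signature above:

def generate(pipe, prompt: str):
    # Illustrative only; not part of the original file.
    result = pipe(
        prompt=prompt,
        num_inference_steps=4,   # LCM works well with roughly 1-8 steps
        lcm_origin_steps=50,     # length of the training-time schedule
        guidance_scale=8.0,
        output_type="pil",
    )
    return result.images, result.nsfw_content_detected
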
- @dataclass
- # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
- class LCMSchedulerOutput(BaseOutput):
- """
- Output class for the scheduler's `step` function output.
- Args:
- prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
- Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
- denoising loop.
- pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
- The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
- `pred_original_sample` can be used to preview progress or for guidance.
- """
-
- prev_sample: torch.FloatTensor
- denoised: Optional[torch.FloatTensor] = None
-
-
- # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
- def betas_for_alpha_bar(
- num_diffusion_timesteps,
- max_beta=0.999,
- alpha_transform_type="cosine",
- ):
- """
- Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
- (1-beta) over time from t = [0,1].
- Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
- to that part of the diffusion process.
- Args:
- num_diffusion_timesteps (`int`): the number of betas to produce.
- max_beta (`float`): the maximum beta to use; use values lower than 1 to
- prevent singularities.
- alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
- Choose from `cosine` or `exp`
- Returns:
- betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
- """
- if alpha_transform_type == "cosine":
-
- def alpha_bar_fn(t):
- return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
-
- elif alpha_transform_type == "exp":
-
- def alpha_bar_fn(t):
- return math.exp(t * -12.0)
-
- else:
- raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")
-
- betas = []
- for i in range(num_diffusion_timesteps):
- t1 = i / num_diffusion_timesteps
- t2 = (i + 1) / num_diffusion_timesteps
- betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
- return torch.tensor(betas, dtype=torch.float32)
-
-
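
A quick numeric illustration of the cosine transform above (not part of the original file): each beta is 1 - alpha_bar(t2) / alpha_bar(t1) for consecutive grid points, clipped at max_beta.

import math

alpha_bar = lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
betas = [min(1 - alpha_bar((i + 1) / 4) / alpha_bar(i / 4), 0.999) for i in range(4)]
print(betas)  # increasing sequence; the final value hits the max_beta=0.999 clip
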
- def rescale_zero_terminal_snr(betas):
- """
- Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
- Args:
- betas (`torch.FloatTensor`):
- the betas that the scheduler is being initialized with.
- Returns:
- `torch.FloatTensor`: rescaled betas with zero terminal SNR
- """
- # Convert betas to alphas_bar_sqrt
- alphas = 1.0 - betas
- alphas_cumprod = torch.cumprod(alphas, dim=0)
- alphas_bar_sqrt = alphas_cumprod.sqrt()
-
- # Store old values.
- alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
- alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
-
- # Shift so the last timestep is zero.
- alphas_bar_sqrt -= alphas_bar_sqrt_T
-
- # Scale so the first timestep is back to the old value.
- alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
-
- # Convert alphas_bar_sqrt to betas
- alphas_bar = alphas_bar_sqrt**2 # Revert sqrt
- alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod
- alphas = torch.cat([alphas_bar[0:1], alphas])
- betas = 1 - alphas
-
- return betas
-
-
- class LCMScheduler(SchedulerMixin, ConfigMixin):
- """
- `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
- non-Markovian guidance.
- This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
- methods the library implements for all schedulers such as loading and saving.
- Args:
- num_train_timesteps (`int`, defaults to 1000):
- The number of diffusion steps to train the model.
- beta_start (`float`, defaults to 0.0001):
- The starting `beta` value of inference.
- beta_end (`float`, defaults to 0.02):
- The final `beta` value.
- beta_schedule (`str`, defaults to `"linear"`):
- The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
- `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
- trained_betas (`np.ndarray`, *optional*):
- Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
- clip_sample (`bool`, defaults to `True`):
- Clip the predicted sample for numerical stability.
- clip_sample_range (`float`, defaults to 1.0):
- The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
- set_alpha_to_one (`bool`, defaults to `True`):
- Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
- there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
- otherwise it uses the alpha value at step 0.
- steps_offset (`int`, defaults to 0):
- An offset added to the inference steps. You can use a combination of `offset=1` and
- `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
- Diffusion.
- prediction_type (`str`, defaults to `epsilon`, *optional*):
- Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
- `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
- Video](https://imagen.research.google/video/paper.pdf) paper).
- thresholding (`bool`, defaults to `False`):
- Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
- as Stable Diffusion.
- dynamic_thresholding_ratio (`float`, defaults to 0.995):
- The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
- sample_max_value (`float`, defaults to 1.0):
- The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
- timestep_spacing (`str`, defaults to `"leading"`):
- The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
- Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
- rescale_betas_zero_snr (`bool`, defaults to `False`):
- Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
- dark samples instead of limiting it to samples with medium brightness. Loosely related to
- [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
- """
-
- # _compatibles = [e.name for e in KarrasDiffusionSchedulers]
- order = 1
-
- @register_to_config
- def __init__(
- self,
- num_train_timesteps: int = 1000,
- beta_start: float = 0.0001,
- beta_end: float = 0.02,
- beta_schedule: str = "linear",
- trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
- clip_sample: bool = True,
- set_alpha_to_one: bool = True,
- steps_offset: int = 0,
- prediction_type: str = "epsilon",
- thresholding: bool = False,
- dynamic_thresholding_ratio: float = 0.995,
- clip_sample_range: float = 1.0,
- sample_max_value: float = 1.0,
- timestep_spacing: str = "leading",
- rescale_betas_zero_snr: bool = False,
- ):
- if trained_betas is not None:
- self.betas = torch.tensor(trained_betas, dtype=torch.float32)
- elif beta_schedule == "linear":
- self.betas = torch.linspace(
- beta_start, beta_end, num_train_timesteps, dtype=torch.float32
- )
- elif beta_schedule == "scaled_linear":
- # this schedule is very specific to the latent diffusion model.
- self.betas = (
- torch.linspace(
- beta_start**0.5,
- beta_end**0.5,
- num_train_timesteps,
- dtype=torch.float32,
- )
- ** 2
- )
- elif beta_schedule == "squaredcos_cap_v2":
- # Glide cosine schedule
- self.betas = betas_for_alpha_bar(num_train_timesteps)
- else:
- raise NotImplementedError(
- f"{beta_schedule} does is not implemented for {self.__class__}"
- )
-
- # Rescale for zero SNR
- if rescale_betas_zero_snr:
- self.betas = rescale_zero_terminal_snr(self.betas)
-
- self.alphas = 1.0 - self.betas
- self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
-
- # At every step in ddim, we are looking into the previous alphas_cumprod
- # For the final step, there is no previous alphas_cumprod because we are already at 0
- # `set_alpha_to_one` decides whether we set this parameter simply to one or
- # whether we use the final alpha of the "non-previous" one.
- self.final_alpha_cumprod = (
- torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
- )
-
- # standard deviation of the initial noise distribution
- self.init_noise_sigma = 1.0
-
- # setable values
- self.num_inference_steps = None
- self.timesteps = torch.from_numpy(
- np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)
- )
-
- def scale_model_input(
- self, sample: torch.FloatTensor, timestep: Optional[int] = None
- ) -> torch.FloatTensor:
- """
- Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
- current timestep.
- Args:
- sample (`torch.FloatTensor`):
- The input sample.
- timestep (`int`, *optional*):
- The current timestep in the diffusion chain.
- Returns:
- `torch.FloatTensor`:
- A scaled input sample.
- """
- return sample
-
- def _get_variance(self, timestep, prev_timestep):
- alpha_prod_t = self.alphas_cumprod[timestep]
- alpha_prod_t_prev = (
- self.alphas_cumprod[prev_timestep]
- if prev_timestep >= 0
- else self.final_alpha_cumprod
- )
- beta_prod_t = 1 - alpha_prod_t
- beta_prod_t_prev = 1 - alpha_prod_t_prev
-
- variance = (beta_prod_t_prev / beta_prod_t) * (
- 1 - alpha_prod_t / alpha_prod_t_prev
- )
-
- return variance
-
- # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
- def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
- """
- "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
- prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
- s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
- pixels from saturation at each step. We find that dynamic thresholding results in significantly better
- photorealism as well as better image-text alignment, especially when using very large guidance weights."
- https://arxiv.org/abs/2205.11487
- """
- dtype = sample.dtype
- batch_size, channels, height, width = sample.shape
-
- if dtype not in (torch.float32, torch.float64):
- sample = (
- sample.float()
- ) # upcast for quantile calculation, and clamp not implemented for cpu half
-
- # Flatten sample for doing quantile calculation along each image
- sample = sample.reshape(batch_size, channels * height * width)
-
- abs_sample = sample.abs() # "a certain percentile absolute pixel value"
-
- s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
- s = torch.clamp(
- s, min=1, max=self.config.sample_max_value
- ) # When clamped to min=1, equivalent to standard clipping to [-1, 1]
-
- s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0
- sample = (
- torch.clamp(sample, -s, s) / s
- ) # "we threshold xt0 to the range [-s, s] and then divide by s"
-
- sample = sample.reshape(batch_size, channels, height, width)
- sample = sample.to(dtype)
-
- return sample
-
- def set_timesteps(
- self,
- num_inference_steps: int,
- lcm_origin_steps: int,
- device: Union[str, torch.device] = None,
- ):
- """
- Sets the discrete timesteps used for the diffusion chain (to be run before inference).
- Args:
- num_inference_steps (`int`):
- The number of diffusion steps used when generating samples with a pre-trained model.
- """
-
- if num_inference_steps > self.config.num_train_timesteps:
- raise ValueError(
- f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
- f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
- f" maximal {self.config.num_train_timesteps} timesteps."
- )
-
- self.num_inference_steps = num_inference_steps
-
- # LCM Timesteps Setting: # Linear Spacing
- c = self.config.num_train_timesteps // lcm_origin_steps
- lcm_origin_timesteps = (
- np.asarray(list(range(1, lcm_origin_steps + 1))) * c - 1
- ) # LCM Training Steps Schedule
- skipping_step = len(lcm_origin_timesteps) // num_inference_steps
- timesteps = lcm_origin_timesteps[::-skipping_step][
- :num_inference_steps
- ] # LCM Inference Steps Schedule
-
- self.timesteps = torch.from_numpy(timesteps.copy()).to(device)
-
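
A worked example of the schedule above, using the defaults that appear in this file (1000 training timesteps, `lcm_origin_steps=50`, `num_inference_steps=4`); not part of the original file:

import numpy as np

c = 1000 // 50                              # 20
origin = np.asarray(range(1, 51)) * c - 1   # [19, 39, ..., 999]
skip = len(origin) // 4                     # 12
print(origin[::-skip][:4])                  # [999 759 519 279]
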
- def get_scalings_for_boundary_condition_discrete(self, t):
- self.sigma_data = 0.5 # Default: 0.5
-
- # By dividing 0.1: This is almost a delta function at t=0.
- c_skip = self.sigma_data**2 / ((t / 0.1) ** 2 + self.sigma_data**2)
- c_out = (t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data**2) ** 0.5
- return c_skip, c_out
-
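
These scalings enforce the consistency-model boundary condition: at t = 0, c_skip is 1 and c_out is 0, so `denoised = c_out * pred_x0 + c_skip * sample` returns the sample unchanged, while at large t the model prediction dominates. A quick numeric check (illustrative, not part of the original file):

sigma_data = 0.5
for t in (0, 999):
    c_skip = sigma_data ** 2 / ((t / 0.1) ** 2 + sigma_data ** 2)
    c_out = (t / 0.1) / ((t / 0.1) ** 2 + sigma_data ** 2) ** 0.5
    print(t, round(c_skip, 8), round(c_out, 8))
# t=0   -> 1.0, 0.0
# t=999 -> ~0.0, ~1.0
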
- def step(
- self,
- model_output: torch.FloatTensor,
- timeindex: int,
- timestep: int,
- sample: torch.FloatTensor,
- eta: float = 0.0,
- use_clipped_model_output: bool = False,
- generator=None,
- variance_noise: Optional[torch.FloatTensor] = None,
- return_dict: bool = True,
- ) -> Union[LCMSchedulerOutput, Tuple]:
- """
- Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
- process from the learned model outputs (most often the predicted noise).
- Args:
- model_output (`torch.FloatTensor`):
- The direct output from learned diffusion model.
- timestep (`float`):
- The current discrete timestep in the diffusion chain.
- sample (`torch.FloatTensor`):
- A current instance of a sample created by the diffusion process.
- eta (`float`):
- The weight of noise for added noise in diffusion step.
- use_clipped_model_output (`bool`, defaults to `False`):
- If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
- because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
- clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
- `use_clipped_model_output` has no effect.
- generator (`torch.Generator`, *optional*):
- A random number generator.
- variance_noise (`torch.FloatTensor`):
- Alternative to generating noise with `generator` by directly providing the noise for the variance
- itself. Useful for methods such as [`CycleDiffusion`].
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
- Returns:
- [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
- If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
- tuple is returned where the first element is the sample tensor.
- """
- if self.num_inference_steps is None:
- raise ValueError(
- "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
- )
-
- # 1. get previous step value
- prev_timeindex = timeindex + 1
- if prev_timeindex < len(self.timesteps):
- prev_timestep = self.timesteps[prev_timeindex]
- else:
- prev_timestep = timestep
-
- # 2. compute alphas, betas
- alpha_prod_t = self.alphas_cumprod[timestep]
- alpha_prod_t_prev = (
- self.alphas_cumprod[prev_timestep]
- if prev_timestep >= 0
- else self.final_alpha_cumprod
- )
-
- beta_prod_t = 1 - alpha_prod_t
- beta_prod_t_prev = 1 - alpha_prod_t_prev
-
- # 3. Get scalings for boundary conditions
- c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)
-
- # 4. Different Parameterization:
- parameterization = self.config.prediction_type
-
- if parameterization == "epsilon": # noise-prediction
- pred_x0 = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
-
- elif parameterization == "sample": # x-prediction
- pred_x0 = model_output
-
- elif parameterization == "v_prediction": # v-prediction
- pred_x0 = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
-
- # 4. Denoise model output using boundary conditions
- denoised = c_out * pred_x0 + c_skip * sample
-
- # 5. Sample z ~ N(0, I), For MultiStep Inference
- # Noise is not used for one-step sampling.
- if len(self.timesteps) > 1:
- noise = torch.randn(model_output.shape).to(model_output.device)
- prev_sample = (
- alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
- )
- else:
- prev_sample = denoised
-
- if not return_dict:
- return (prev_sample, denoised)
-
- return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
-
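
The `epsilon` branch of `step` inverts the forward-noising relation x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps to recover `pred_x0`. A small numeric sanity check of that inversion (illustrative, not part of the original file):

import torch

a = torch.tensor(0.7)                        # stands in for alpha_prod_t
x0, eps = torch.randn(4), torch.randn(4)
x_t = a.sqrt() * x0 + (1 - a).sqrt() * eps
print(torch.allclose((x_t - (1 - a).sqrt() * eps) / a.sqrt(), x0, atol=1e-6))  # True
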
- # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
- def add_noise(
- self,
- original_samples: torch.FloatTensor,
- noise: torch.FloatTensor,
- timesteps: torch.IntTensor,
- ) -> torch.FloatTensor:
- # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
- alphas_cumprod = self.alphas_cumprod.to(
- device=original_samples.device, dtype=original_samples.dtype
- )
- timesteps = timesteps.to(original_samples.device)
-
- sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
- sqrt_alpha_prod = sqrt_alpha_prod.flatten()
- while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
- sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
-
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
- sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
- while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
- sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
-
- noisy_samples = (
- sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
- )
- return noisy_samples
-
- # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
- def get_velocity(
- self,
- sample: torch.FloatTensor,
- noise: torch.FloatTensor,
- timesteps: torch.IntTensor,
- ) -> torch.FloatTensor:
- # Make sure alphas_cumprod and timestep have same device and dtype as sample
- alphas_cumprod = self.alphas_cumprod.to(
- device=sample.device, dtype=sample.dtype
- )
- timesteps = timesteps.to(sample.device)
-
- sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
- sqrt_alpha_prod = sqrt_alpha_prod.flatten()
- while len(sqrt_alpha_prod.shape) < len(sample.shape):
- sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
-
- sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
- sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
- while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
- sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
-
- velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
- return velocity
-
- def __len__(self):
- return self.config.num_train_timesteps
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
- diffusers==0.21.4
  transformers==4.34.1
  gradio==3.50.2
  --extra-index-url https://download.pytorch.org/whl/cu121

+ diffusers==0.22.1
  transformers==4.34.1
  gradio==3.50.2
  --extra-index-url https://download.pytorch.org/whl/cu121
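
The custom pipeline and scheduler above are deleted in the same commit that bumps diffusers from 0.21.4 to 0.22.1, a release line that ships an LCMScheduler and a latent-consistency pipeline in the library itself. A minimal sketch of the native path, assuming diffusers>=0.22 and that the SimianLuo/LCM_Dreamshaper_v7 checkpoint resolves to the built-in latent-consistency pipeline:

import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "SimianLuo/LCM_Dreamshaper_v7", torch_dtype=torch.float16
).to("cuda")
images = pipe(
    "a photo of a cat", num_inference_steps=4, guidance_scale=8.0
).images
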