kiigii committed
Commit aaa261a · verified · 1 parent: b2d1dd8

Upload folder using huggingface_hub

attention_processor.py ADDED
@@ -0,0 +1,70 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from diffusers.models import UNet2DConditionModel
+from diffusers.models.attention import Attention
+from diffusers.models.attention_processor import AttnProcessor2_0
+
+
+
+def add_imagedream_attn_processor(unet: UNet2DConditionModel) -> nn.Module:
+    attn_procs = {}
+    for key, attn_processor in unet.attn_processors.items():
+        if "attn1" in key:
+            attn_procs[key] = ImageDreamAttnProcessor2_0()
+        else:
+            attn_procs[key] = attn_processor
+    unet.set_attn_processor(attn_procs)
+    return unet
+
+
+class ImageDreamAttnProcessor2_0(AttnProcessor2_0):
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        num_views: int = 1,
+        *args,
+        **kwargs,
+    ):
+        if num_views == 1:
+            return super().__call__(
+                attn=attn,
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                temb=temb,
+                *args,
+                **kwargs,
+            )
+
+        input_ndim = hidden_states.ndim
+        B = hidden_states.size(0)
+        if B % num_views:
+            raise ValueError(
+                f"`batch_size`(got {B}) must be a multiple of `num_views`(got {num_views})."
+            )
+        real_B = B // num_views
+        if input_ndim == 4:
+            H, W = hidden_states.shape[2:]
+            hidden_states = hidden_states.reshape(real_B, -1, H, W).transpose(1, 2)
+        else:
+            hidden_states = hidden_states.reshape(real_B, -1, hidden_states.size(-1))
+        hidden_states = super().__call__(
+            attn=attn,
+            hidden_states=hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            temb=temb,
+            *args,
+            **kwargs,
+        )
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(B, -1, H, W)
+        else:
+            hidden_states = hidden_states.reshape(B, -1, hidden_states.size(-1))
+        return hidden_states
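
The processor above folds the view axis into the token axis before self-attention ("attn1" layers) and restores the original batch shape afterwards, so all views of one object attend to each other. A minimal standalone sketch of that behaviour, assuming a recent diffusers release that forwards extra attention kwargs to the processor; the 320-channel width, 8 heads, and random inputs are illustrative assumptions, not values taken from this repository:

import torch
from diffusers.models.attention import Attention

from attention_processor import ImageDreamAttnProcessor2_0

# Hypothetical shapes: 2 objects x 4 views in the batch, 64 tokens, 320 channels.
attn = Attention(query_dim=320, heads=8, dim_head=40, processor=ImageDreamAttnProcessor2_0())
hidden_states = torch.randn(2 * 4, 64, 320)

# With num_views=4 the processor reshapes to (2, 4 * 64, 320) before attention,
# so tokens from all four views of the same object attend to one another.
out = attn(hidden_states, num_views=4)
print(out.shape)  # torch.Size([8, 64, 320])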
camera_utils.py ADDED
@@ -0,0 +1,88 @@
+# Copied from ImageDream
+# https://github.com/bytedance/ImageDream/blob/main/extern/ImageDream/imagedream/camera_utils.py
+
+import numpy as np
+import torch
+
+
+def create_camera_to_world_matrix(elevation, azimuth):
+    elevation = np.radians(elevation)
+    azimuth = np.radians(azimuth)
+    # Convert elevation and azimuth angles to Cartesian coordinates on a unit sphere
+    x = np.cos(elevation) * np.sin(azimuth)
+    y = np.sin(elevation)
+    z = np.cos(elevation) * np.cos(azimuth)
+
+    # Calculate camera position, target, and up vectors
+    camera_pos = np.array([x, y, z])
+    target = np.array([0, 0, 0])
+    up = np.array([0, 1, 0])
+
+    # Construct view matrix
+    forward = target - camera_pos
+    forward /= np.linalg.norm(forward)
+    right = np.cross(forward, up)
+    right /= np.linalg.norm(right)
+    new_up = np.cross(right, forward)
+    new_up /= np.linalg.norm(new_up)
+    cam2world = np.eye(4)
+    cam2world[:3, :3] = np.array([right, new_up, -forward]).T
+    cam2world[:3, 3] = camera_pos
+    return cam2world
+
+
+def convert_opengl_to_blender(camera_matrix):
+    if isinstance(camera_matrix, np.ndarray):
+        # Construct transformation matrix to convert from OpenGL space to Blender space
+        flip_yz = np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]])
+        camera_matrix_blender = np.dot(flip_yz, camera_matrix)
+    else:
+        # Construct transformation matrix to convert from OpenGL space to Blender space
+        flip_yz = torch.tensor(
+            [[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]
+        )
+        if camera_matrix.ndim == 3:
+            flip_yz = flip_yz.unsqueeze(0)
+        camera_matrix_blender = torch.matmul(flip_yz.to(camera_matrix), camera_matrix)
+    return camera_matrix_blender
+
+
+def normalize_camera(camera_matrix):
+    """normalize the camera location onto a unit-sphere"""
+    if isinstance(camera_matrix, np.ndarray):
+        camera_matrix = camera_matrix.reshape(-1, 4, 4)
+        translation = camera_matrix[:, :3, 3]
+        translation = translation / (
+            np.linalg.norm(translation, axis=1, keepdims=True) + 1e-8
+        )
+        camera_matrix[:, :3, 3] = translation
+    else:
+        camera_matrix = camera_matrix.reshape(-1, 4, 4)
+        translation = camera_matrix[:, :3, 3]
+        translation = translation / (
+            torch.norm(translation, dim=1, keepdim=True) + 1e-8
+        )
+        camera_matrix[:, :3, 3] = translation
+    return camera_matrix.reshape(-1, 16)
+
+
+def get_camera(
+    num_frames,
+    elevation=15,
+    azimuth_start=0,
+    azimuth_span=360,
+    blender_coord=True,
+    extra_view=False,
+):
+    angle_gap = azimuth_span / num_frames
+    cameras = []
+    for azimuth in np.arange(azimuth_start, azimuth_span + azimuth_start, angle_gap):
+        camera_matrix = create_camera_to_world_matrix(elevation, azimuth)
+        if blender_coord:
+            camera_matrix = convert_opengl_to_blender(camera_matrix)
+        cameras.append(camera_matrix.flatten())
+
+    if extra_view:
+        dim = len(cameras[0])
+        cameras.append(np.zeros(dim))
+    return torch.tensor(np.stack(cameras, 0)).float()
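
get_camera above produces one flattened 4x4 camera-to-world matrix per view (16 values per row); the pipeline later feeds these rows to the UNet as class_labels. A small illustrative check, assuming camera_utils.py is importable from the working directory:

from camera_utils import get_camera

# Four views evenly spaced over 360 degrees at 15 degrees of elevation, plus the
# all-zero "extra view" row that the pipeline reserves for the reference image.
cameras = get_camera(num_frames=4, elevation=15, extra_view=True)
print(cameras.shape)  # torch.Size([5, 16])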
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,44 @@
+{
+  "_valid_processor_keys": [
+    "images",
+    "do_resize",
+    "size",
+    "resample",
+    "do_center_crop",
+    "crop_size",
+    "do_rescale",
+    "rescale_factor",
+    "do_normalize",
+    "image_mean",
+    "image_std",
+    "do_convert_rgb",
+    "return_tensors",
+    "data_format",
+    "input_data_format"
+  ],
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CLIPImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}
ip_adapter/image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
+{
+  "_name_or_path": "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+  "architectures": [
+    "CLIPVisionModel"
+  ],
+  "attention_dropout": 0.0,
+  "dropout": 0.0,
+  "hidden_act": "gelu",
+  "hidden_size": 1280,
+  "image_size": 224,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 5120,
+  "layer_norm_eps": 1e-05,
+  "model_type": "clip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 32,
+  "patch_size": 14,
+  "projection_dim": 1024,
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2"
+}
ip_adapter/image_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a56cfd4ffcf40be097c430324ec184cc37187f6dafef128ef9225438a3c03c4
+size 1261595704
ip_adapter/ip-adapter-plus_imagedream.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ccadbfaf399f3a0e12eeaef7c1dc3a0002de801bb4d7b134bf85ca3204bcc4b
+size 148229970
model_index.json ADDED
@@ -0,0 +1,40 @@
+{
+  "_class_name": [
+    "pipeline_imagedream",
+    "ImageDreamPipeline"
+  ],
+  "_diffusers_version": "0.29.0",
+  "feature_extractor": [
+    "transformers",
+    "CLIPImageProcessor"
+  ],
+  "image_encoder": [
+    null,
+    null
+  ],
+  "requires_safety_checker": false,
+  "safety_checker": [
+    null,
+    null
+  ],
+  "scheduler": [
+    "diffusers",
+    "DDIMScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "CLIPTextModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}
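
Because _class_name points at the pipeline_imagedream module shipped in this repository rather than a built-in diffusers class, the checkpoint is meant to be loaded as a Hub-hosted custom pipeline. A hedged loading sketch — "<user>/<repo>" is a placeholder for this repository's id, and remote-code loading behaviour may differ slightly across diffusers versions (the files here pin _diffusers_version 0.29.0):

import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "<user>/<repo>",            # placeholder repository id
    torch_dtype=torch.float16,
    trust_remote_code=True,     # lets pipeline_imagedream.ImageDreamPipeline be imported
)
pipe.to("cuda")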
pipeline_imagedream.py ADDED
@@ -0,0 +1,398 @@
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+from diffusers.image_processor import PipelineImageInput
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion.pipeline_output import (
+    StableDiffusionPipelineOutput,
+)
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
+    StableDiffusionPipeline,
+    rescale_noise_cfg,
+    retrieve_timesteps,
+)
+from diffusers.pipelines.stable_diffusion.safety_checker import (
+    StableDiffusionSafetyChecker,
+)
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import deprecate
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionModel,
+)
+
+from attention_processor import add_imagedream_attn_processor
+from camera_utils import get_camera
+
+
+class ImageDreamPipeline(StableDiffusionPipeline):
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModel = None,
+        requires_safety_checker: bool = False,
+    ) -> None:
+        super().__init__(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=add_imagedream_attn_processor(unet),
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            requires_safety_checker=requires_safety_checker,
+        )
+        self.num_views = 4
+
+    def load_ip_adapter(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[
+            str, List[str], Dict[str, torch.Tensor]
+        ],
+        subfolder: Union[str, List[str]],
+        weight_name: Union[str, List[str]],
+        image_encoder_folder: Optional[str] = "image_encoder",
+        **kwargs,
+    ):
+        super().load_ip_adapter(
+            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
+            subfolder=subfolder,
+            weight_name=weight_name,
+            image_encoder_folder=image_encoder_folder,
+            **kwargs,
+        )
+        add_imagedream_attn_processor(self.unet)
+
+    def encode_image_to_latents(
+        self,
+        image: PipelineImageInput,
+        height: int,
+        width: int,
+        device: torch.device,
+        num_images_per_prompt: int = 1,
+    ):
+        dtype = next(self.vae.parameters()).dtype
+
+        if isinstance(image, torch.Tensor):
+            image = F.interpolate(
+                image,
+                (height, width),
+                mode="bilinear",
+                align_corners=False,
+                antialias=True,
+            )
+        else:
+            image = self.image_processor.preprocess(image, height, width)
+
+        # image should be in range [-1, 1]
+        image = image.to(device=device, dtype=dtype)
+
+        def vae_encode(image):
+            posterior = self.vae.encode(image).latent_dist
+            latents = posterior.sample() * self.vae.config.scaling_factor
+            latents = latents.repeat_interleave(num_images_per_prompt, dim=0)
+            return latents
+
+        latents = vae_encode(image)
+        uncond_latents = vae_encode(torch.zeros_like(image))
+        return latents, uncond_latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        elevation: float = 0.0,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        # StableDiffusionPipeline supports `ip_adapter_image_embeds`, but ImageDream does not use it and raises a ValueError.
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[
+                Callable[[int, int, Dict], None],
+                PipelineCallback,
+                MultiPipelineCallbacks,
+            ]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        if ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "do not use `ip_adapter_image_embeds` in ImageDream, use `ip_adapter_image`"
+            )
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # ImageDream number of views
+        if cross_attention_kwargs is None:
+            num_views = self.num_views
+        else:
+            num_views = cross_attention_kwargs.pop("num_views", self.num_views)
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # to deal with lora scaling and other possible forward hooks
+
+        # 1. Check inputs. Raise error if not correct
+        if prompt is None:
+            prompt = ""
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            None,  # ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None)
+            if self.cross_attention_kwargs is not None
+            else None
+        )
+
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # camera parameter for ImageDream
+        camera = get_camera(
+            num_views, elevation=elevation, extra_view=ip_adapter_image is not None
+        ).to(dtype=prompt_embeds.dtype, device=device)
+        camera = camera.repeat(batch_size * num_images_per_prompt, 1)
+
+        if ip_adapter_image is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                None,  # ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+            # ImageDream
+            image_latents, negative_image_latents = self.encode_image_to_latents(
+                ip_adapter_image,
+                height,
+                width,
+                device,
+                batch_size * num_images_per_prompt,
+            )
+            num_views += 1
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            camera = torch.cat([camera] * 2)
+            if ip_adapter_image is not None:
+                image_latents = torch.cat([negative_image_latents, image_latents])
+
+        # Multi-view inputs for ImageDream.
+        prompt_embeds = prompt_embeds.repeat_interleave(num_views, dim=0)
+        if ip_adapter_image is not None:
+            image_embeds = [i.repeat_interleave(num_views, dim=0) for i in image_embeds]
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt * num_views,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        if ip_adapter_image is not None:
+            added_cond_kwargs = {"image_embeds": image_embeds}
+        else:
+            added_cond_kwargs = None
+
+        # 6.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
+                batch_size * num_images_per_prompt
+            )
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        cross_attention_kwargs = {"num_views": num_views}
+        if self.cross_attention_kwargs is not None:
+            cross_attention_kwargs.update(self.cross_attention_kwargs)
+
+        # fmt: off
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if ip_adapter_image is not None:
+                    latent_model_input[num_views - 1 :: num_views, :, :, :] = image_latents
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    class_labels=camera,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = torch.lerp(noise_pred_uncond, noise_pred_text, self.guidance_scale)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+        # fmt: on
+        if not output_type == "latent":
+            image = self.vae.decode(
+                latents / self.vae.config.scaling_factor,
+                return_dict=False,
+                generator=generator,
+            )[0]
+            image, has_nsfw_concept = self.run_safety_checker(
+                image, device, prompt_embeds.dtype
+            )
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(
+            image, output_type=output_type, do_denormalize=do_denormalize
+        )
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(
+            images=image, nsfw_content_detected=has_nsfw_concept
+        )
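
The call signature above mirrors StableDiffusionPipeline but adds elevation, routes the reference image through ip_adapter_image, and renders num_views + 1 frames when a reference image is given (four orbit views plus the slot pinned to the reference latents). A usage sketch under the same placeholder-repo assumption as the loading example after model_index.json; the prompt, image URL, and guidance values are illustrative:

import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image

pipe = DiffusionPipeline.from_pretrained(
    "<user>/<repo>", torch_dtype=torch.float16, trust_remote_code=True
).to("cuda")

# IP-Adapter weights and image encoder as committed under ip_adapter/ above.
pipe.load_ip_adapter(
    "<user>/<repo>",
    subfolder="ip_adapter",
    weight_name="ip-adapter-plus_imagedream.bin",
)

image = load_image("https://example.com/object.png")  # placeholder reference image
views = pipe(
    prompt="a toy robot",
    ip_adapter_image=image,
    num_inference_steps=50,
    guidance_scale=5.0,
    elevation=0.0,
).images
# len(views) == 5: four generated orbit views plus the reference-view slot.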
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,19 @@
+{
+  "_class_name": "DDIMScheduler",
+  "_diffusers_version": "0.29.0",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "clip_sample_range": 1.0,
+  "dynamic_thresholding_ratio": 0.995,
+  "num_train_timesteps": 1000,
+  "prediction_type": "epsilon",
+  "rescale_betas_zero_snr": false,
+  "sample_max_value": 1.0,
+  "set_alpha_to_one": false,
+  "steps_offset": 1,
+  "thresholding": false,
+  "timestep_spacing": "leading",
+  "trained_betas": null
+}
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
+{
+  "_name_or_path": "stabilityai/stable-diffusion-2-1",
+  "architectures": [
+    "CLIPTextModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dropout": 0.0,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_size": 1024,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 77,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 23,
+  "pad_token_id": 1,
+  "projection_dim": 512,
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "vocab_size": 49408
+}
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc1827c465450322616f06dea41596eac7d493f4e95904dcb51f0fc745c4e13f
+size 680820392
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "!",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "!",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49406": {
+      "content": "<|startoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49407": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|startoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "do_lower_case": true,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 77,
+  "pad_token": "!",
+  "tokenizer_class": "CLIPTokenizer",
+  "unk_token": "<|endoftext|>"
+}
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
unet/config.json ADDED
@@ -0,0 +1,77 @@
+{
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.29.0",
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": [
+    5,
+    10,
+    20,
+    20
+  ],
+  "attention_type": "default",
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": "projection",
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "conv_out_kernel": 3,
+  "cross_attention_dim": 1024,
+  "cross_attention_norm": null,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": [
+    2,
+    2,
+    2,
+    2
+  ],
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": 16,
+  "resnet_out_scale_factor": 1.0,
+  "resnet_skip_time_act": false,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 32,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "upcast_attention": false,
+  "use_linear_projection": true
+}
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64ab5e3b897710f00e50447ba087e6c6ebb54beb8b32851ce66978d0c40f049e
+size 1735228080
vae/config.json ADDED
@@ -0,0 +1,36 @@
+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.29.0",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "force_upcast": true,
+  "in_channels": 3,
+  "latent_channels": 4,
+  "latents_mean": null,
+  "latents_std": null,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 256,
+  "scaling_factor": 0.18215,
+  "shift_factor": null,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ],
+  "use_post_quant_conv": true,
+  "use_quant_conv": true
+}
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e4c08995484ee61270175e9e7a072b66a6e4eeb5f0c266667fe1f45b90daf9a
+size 167335342