kiigii committed
Commit aaa261a · verified · 1 parent: b2d1dd8

Upload folder using huggingface_hub

attention_processor.py ADDED
@@ -0,0 +1,70 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from diffusers.models import UNet2DConditionModel
+from diffusers.models.attention import Attention
+from diffusers.models.attention_processor import AttnProcessor2_0
+
+
+
+def add_imagedream_attn_processor(unet: UNet2DConditionModel) -> nn.Module:
+    attn_procs = {}
+    for key, attn_processor in unet.attn_processors.items():
+        if "attn1" in key:
+            attn_procs[key] = ImageDreamAttnProcessor2_0()
+        else:
+            attn_procs[key] = attn_processor
+    unet.set_attn_processor(attn_procs)
+    return unet
+
+
+class ImageDreamAttnProcessor2_0(AttnProcessor2_0):
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+        num_views: int = 1,
+        *args,
+        **kwargs,
+    ):
+        if num_views == 1:
+            return super().__call__(
+                attn=attn,
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                temb=temb,
+                *args,
+                **kwargs,
+            )
+
+        input_ndim = hidden_states.ndim
+        B = hidden_states.size(0)
+        if B % num_views:
+            raise ValueError(
+                f"`batch_size`(got {B}) must be a multiple of `num_views`(got {num_views})."
+            )
+        real_B = B // num_views
+        if input_ndim == 4:
+            H, W = hidden_states.shape[2:]
+            hidden_states = hidden_states.reshape(real_B, -1, H, W).transpose(1, 2)
+        else:
+            hidden_states = hidden_states.reshape(real_B, -1, hidden_states.size(-1))
+        hidden_states = super().__call__(
+            attn=attn,
+            hidden_states=hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            temb=temb,
+            *args,
+            **kwargs,
+        )
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(B, -1, H, W)
+        else:
+            hidden_states = hidden_states.reshape(B, -1, hidden_states.size(-1))
+        return hidden_states
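
The processor above folds the view axis into the token axis before self-attention ("attn1" layers) and restores the original batch shape afterwards, so all views of one object attend to each other. A minimal standalone sketch of that behaviour, assuming a recent diffusers release that forwards extra attention kwargs to the processor; the 320-channel width, 8 heads, and random inputs are illustrative assumptions, not values taken from this repository:

import torch
from diffusers.models.attention import Attention

from attention_processor import ImageDreamAttnProcessor2_0

# Hypothetical shapes: 2 objects x 4 views in the batch, 64 tokens, 320 channels.
attn = Attention(query_dim=320, heads=8, dim_head=40, processor=ImageDreamAttnProcessor2_0())
hidden_states = torch.randn(2 * 4, 64, 320)

# With num_views=4 the processor reshapes to (2, 4 * 64, 320) before attention,
# so tokens from all four views of the same object attend to one another.
out = attn(hidden_states, num_views=4)
print(out.shape)  # torch.Size([8, 64, 320])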
camera_utils.py ADDED
@@ -0,0 +1,88 @@
+# Copied from ImageDream
+# https://github.com/bytedance/ImageDream/blob/main/extern/ImageDream/imagedream/camera_utils.py
+
+import numpy as np
+import torch
+
+
+def create_camera_to_world_matrix(elevation, azimuth):
+    elevation = np.radians(elevation)
+    azimuth = np.radians(azimuth)
+    # Convert elevation and azimuth angles to Cartesian coordinates on a unit sphere
+    x = np.cos(elevation) * np.sin(azimuth)
+    y = np.sin(elevation)
+    z = np.cos(elevation) * np.cos(azimuth)
+
+    # Calculate camera position, target, and up vectors
+    camera_pos = np.array([x, y, z])
+    target = np.array([0, 0, 0])
+    up = np.array([0, 1, 0])
+
+    # Construct view matrix
+    forward = target - camera_pos
+    forward /= np.linalg.norm(forward)
+    right = np.cross(forward, up)
+    right /= np.linalg.norm(right)
+    new_up = np.cross(right, forward)
+    new_up /= np.linalg.norm(new_up)
+    cam2world = np.eye(4)
+    cam2world[:3, :3] = np.array([right, new_up, -forward]).T
+    cam2world[:3, 3] = camera_pos
+    return cam2world
+
+
+def convert_opengl_to_blender(camera_matrix):
+    if isinstance(camera_matrix, np.ndarray):
+        # Construct transformation matrix to convert from OpenGL space to Blender space
+        flip_yz = np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]])
+        camera_matrix_blender = np.dot(flip_yz, camera_matrix)
+    else:
+        # Construct transformation matrix to convert from OpenGL space to Blender space
+        flip_yz = torch.tensor(
+            [[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]
+        )
+        if camera_matrix.ndim == 3:
+            flip_yz = flip_yz.unsqueeze(0)
+        camera_matrix_blender = torch.matmul(flip_yz.to(camera_matrix), camera_matrix)
+    return camera_matrix_blender
+
+
+def normalize_camera(camera_matrix):
+    """normalize the camera location onto a unit-sphere"""
+    if isinstance(camera_matrix, np.ndarray):
+        camera_matrix = camera_matrix.reshape(-1, 4, 4)
+        translation = camera_matrix[:, :3, 3]
+        translation = translation / (
+            np.linalg.norm(translation, axis=1, keepdims=True) + 1e-8
+        )
+        camera_matrix[:, :3, 3] = translation
+    else:
+        camera_matrix = camera_matrix.reshape(-1, 4, 4)
+        translation = camera_matrix[:, :3, 3]
+        translation = translation / (
+            torch.norm(translation, dim=1, keepdim=True) + 1e-8
+        )
+        camera_matrix[:, :3, 3] = translation
+    return camera_matrix.reshape(-1, 16)
+
+
+def get_camera(
+    num_frames,
+    elevation=15,
+    azimuth_start=0,
+    azimuth_span=360,
+    blender_coord=True,
+    extra_view=False,
+):
+    angle_gap = azimuth_span / num_frames
+    cameras = []
+    for azimuth in np.arange(azimuth_start, azimuth_span + azimuth_start, angle_gap):
+        camera_matrix = create_camera_to_world_matrix(elevation, azimuth)
+        if blender_coord:
+            camera_matrix = convert_opengl_to_blender(camera_matrix)
+        cameras.append(camera_matrix.flatten())
+
+    if extra_view:
+        dim = len(cameras[0])
+        cameras.append(np.zeros(dim))
+    return torch.tensor(np.stack(cameras, 0)).float()
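
get_camera above produces one flattened 4x4 camera-to-world matrix per view (16 values per row); the pipeline later feeds these rows to the UNet as class_labels. A small illustrative check, assuming camera_utils.py is importable from the working directory:

from camera_utils import get_camera

# Four views evenly spaced over 360 degrees at 15 degrees of elevation, plus the
# all-zero "extra view" row that the pipeline reserves for the reference image.
cameras = get_camera(num_frames=4, elevation=15, extra_view=True)
print(cameras.shape)  # torch.Size([5, 16])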
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,44 @@
+{
+  "_valid_processor_keys": [
+    "images",
+    "do_resize",
+    "size",
+    "resample",
+    "do_center_crop",
+    "crop_size",
+    "do_rescale",
+    "rescale_factor",
+    "do_normalize",
+    "image_mean",
+    "image_std",
+    "do_convert_rgb",
+    "return_tensors",
+    "data_format",
+    "input_data_format"
+  ],
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CLIPImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}
ip_adapter/image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
+{
+  "_name_or_path": "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+  "architectures": [
+    "CLIPVisionModel"
+  ],
+  "attention_dropout": 0.0,
+  "dropout": 0.0,
+  "hidden_act": "gelu",
+  "hidden_size": 1280,
+  "image_size": 224,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 5120,
+  "layer_norm_eps": 1e-05,
+  "model_type": "clip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 32,
+  "patch_size": 14,
+  "projection_dim": 1024,
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2"
+}
ip_adapter/image_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a56cfd4ffcf40be097c430324ec184cc37187f6dafef128ef9225438a3c03c4
+size 1261595704
ip_adapter/ip-adapter-plus_imagedream.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ccadbfaf399f3a0e12eeaef7c1dc3a0002de801bb4d7b134bf85ca3204bcc4b
+size 148229970
model_index.json ADDED
@@ -0,0 +1,40 @@
+{
+  "_class_name": [
+    "pipeline_imagedream",
+    "ImageDreamPipeline"
+  ],
+  "_diffusers_version": "0.29.0",
+  "feature_extractor": [
+    "transformers",
+    "CLIPImageProcessor"
+  ],
+  "image_encoder": [
+    null,
+    null
+  ],
+  "requires_safety_checker": false,
+  "safety_checker": [
+    null,
+    null
+  ],
+  "scheduler": [
+    "diffusers",
+    "DDIMScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "CLIPTextModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}
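
Because _class_name points at the pipeline_imagedream module shipped in this repository rather than a built-in diffusers class, the checkpoint is meant to be loaded as a Hub-hosted custom pipeline. A hedged loading sketch — "<user>/<repo>" is a placeholder for this repository's id, and remote-code loading behaviour may differ slightly across diffusers versions (the files here pin _diffusers_version 0.29.0):

import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "<user>/<repo>",            # placeholder repository id
    torch_dtype=torch.float16,
    trust_remote_code=True,     # lets pipeline_imagedream.ImageDreamPipeline be imported
)
pipe.to("cuda")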
pipeline_imagedream.py ADDED
@@ -0,0 +1,398 @@
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+from diffusers.image_processor import PipelineImageInput
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion.pipeline_output import (
+    StableDiffusionPipelineOutput,
+)
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
+    StableDiffusionPipeline,
+    rescale_noise_cfg,
+    retrieve_timesteps,
+)
+from diffusers.pipelines.stable_diffusion.safety_checker import (
+    StableDiffusionSafetyChecker,
+)
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import deprecate
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionModel,
+)
+
+from attention_processor import add_imagedream_attn_processor
+from camera_utils import get_camera
+
+
+class ImageDreamPipeline(StableDiffusionPipeline):
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModel = None,
+        requires_safety_checker: bool = False,
+    ) -> None:
+        super().__init__(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=add_imagedream_attn_processor(unet),
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            requires_safety_checker=requires_safety_checker,
+        )
+        self.num_views = 4
+
+    def load_ip_adapter(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[
+            str, List[str], Dict[str, torch.Tensor]
+        ],
+        subfolder: Union[str, List[str]],
+        weight_name: Union[str, List[str]],
+        image_encoder_folder: Optional[str] = "image_encoder",
+        **kwargs,
+    ):
+        super().load_ip_adapter(
+            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
+            subfolder=subfolder,
+            weight_name=weight_name,
+            image_encoder_folder=image_encoder_folder,
+            **kwargs,
+        )
+        add_imagedream_attn_processor(self.unet)
+
+    def encode_image_to_latents(
+        self,
+        image: PipelineImageInput,
+        height: int,
+        width: int,
+        device: torch.device,
+        num_images_per_prompt: int = 1,
+    ):
+        dtype = next(self.vae.parameters()).dtype
+
+        if isinstance(image, torch.Tensor):
+            image = F.interpolate(
+                image,
+                (height, width),
+                mode="bilinear",
+                align_corners=False,
+                antialias=True,
+            )
+        else:
+            image = self.image_processor.preprocess(image, height, width)
+
+        # image should be in range [-1, 1]
+        image = image.to(device=device, dtype=dtype)
+
+        def vae_encode(image):
+            posterior = self.vae.encode(image).latent_dist
+            latents = posterior.sample() * self.vae.config.scaling_factor
+            latents = latents.repeat_interleave(num_images_per_prompt, dim=0)
+            return latents
+
+        latents = vae_encode(image)
+        uncond_latents = vae_encode(torch.zeros_like(image))
+        return latents, uncond_latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        elevation: float = 0.0,
+        timesteps: List[int] = None,
+        sigmas: List[float] = None,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        ip_adapter_image: Optional[PipelineImageInput] = None,
+        # StableDiffusionPipeline supports `ip_adapter_image_embeds`, but ImageDream does not use it and raises a ValueError.
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance_rescale: float = 0.0,
+        clip_skip: Optional[int] = None,
+        callback_on_step_end: Optional[
+            Union[
+                Callable[[int, int, Dict], None],
+                PipelineCallback,
+                MultiPipelineCallbacks,
+            ]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        **kwargs,
+    ):
+        if ip_adapter_image_embeds is not None:
+            raise ValueError(
+                "do not use `ip_adapter_image_embeds` in ImageDream, use `ip_adapter_image`"
+            )
+
+        callback = kwargs.pop("callback", None)
+        callback_steps = kwargs.pop("callback_steps", None)
+
+        if callback is not None:
+            deprecate(
+                "callback",
+                "1.0.0",
+                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+        if callback_steps is not None:
+            deprecate(
+                "callback_steps",
+                "1.0.0",
+                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # ImageDream number of views
+        if cross_attention_kwargs is None:
+            num_views = self.num_views
+        else:
+            num_views = cross_attention_kwargs.pop("num_views", self.num_views)
+
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # to deal with lora scaling and other possible forward hooks
+
+        # 1. Check inputs. Raise error if not correct
+        if prompt is None:
+            prompt = ""
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            ip_adapter_image,
+            None,  # ip_adapter_image_embeds,
+            callback_on_step_end_tensor_inputs,
+        )
+
+        self._guidance_scale = guidance_scale
+        self._guidance_rescale = guidance_rescale
+        self._clip_skip = clip_skip
+        self._cross_attention_kwargs = cross_attention_kwargs
+        self._interrupt = False
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self._execution_device
+
+        # 3. Encode input prompt
+        lora_scale = (
+            self.cross_attention_kwargs.get("scale", None)
+            if self.cross_attention_kwargs is not None
+            else None
+        )
+
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            self.do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            lora_scale=lora_scale,
+            clip_skip=self.clip_skip,
+        )
+
+        # camera parameter for ImageDream
+        camera = get_camera(
+            num_views, elevation=elevation, extra_view=ip_adapter_image is not None
+        ).to(dtype=prompt_embeds.dtype, device=device)
+        camera = camera.repeat(batch_size * num_images_per_prompt, 1)
+
+        if ip_adapter_image is not None:
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image,
+                None,  # ip_adapter_image_embeds,
+                device,
+                batch_size * num_images_per_prompt,
+                self.do_classifier_free_guidance,
+            )
+            # ImageDream
+            image_latents, negative_image_latents = self.encode_image_to_latents(
+                ip_adapter_image,
+                height,
+                width,
+                device,
+                batch_size * num_images_per_prompt,
+            )
+            num_views += 1
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+            camera = torch.cat([camera] * 2)
+            if ip_adapter_image is not None:
+                image_latents = torch.cat([negative_image_latents, image_latents])
+
+        # Multi-view inputs for ImageDream.
+        prompt_embeds = prompt_embeds.repeat_interleave(num_views, dim=0)
+        if ip_adapter_image is not None:
+            image_embeds = [i.repeat_interleave(num_views, dim=0) for i in image_embeds]
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler, num_inference_steps, device, timesteps, sigmas
+        )
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt * num_views,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 6.1 Add image embeds for IP-Adapter
+        if ip_adapter_image is not None:
+            added_cond_kwargs = {"image_embeds": image_embeds}
+        else:
+            added_cond_kwargs = None
+
+        # 6.2 Optionally get Guidance Scale Embedding
+        timestep_cond = None
+        if self.unet.config.time_cond_proj_dim is not None:
+            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
+                batch_size * num_images_per_prompt
+            )
+            timestep_cond = self.get_guidance_scale_embedding(
+                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+            ).to(device=device, dtype=latents.dtype)
+
+        cross_attention_kwargs = {"num_views": num_views}
+        if self.cross_attention_kwargs is not None:
+            cross_attention_kwargs.update(self.cross_attention_kwargs)
+
+        # fmt: off
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        self._num_timesteps = len(timesteps)
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                if ip_adapter_image is not None:
+                    latent_model_input[num_views - 1 :: num_views, :, :, :] = image_latents
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    class_labels=camera,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = torch.lerp(noise_pred_uncond, noise_pred_text, self.guidance_scale)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        step_idx = i // getattr(self.scheduler, "order", 1)
+                        callback(step_idx, t, latents)
+        # fmt: on
+        if not output_type == "latent":
+            image = self.vae.decode(
+                latents / self.vae.config.scaling_factor,
+                return_dict=False,
+                generator=generator,
+            )[0]
+            image, has_nsfw_concept = self.run_safety_checker(
+                image, device, prompt_embeds.dtype
+            )
+        else:
+            image = latents
+            has_nsfw_concept = None
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(
+            image, output_type=output_type, do_denormalize=do_denormalize
+        )
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(
+            images=image, nsfw_content_detected=has_nsfw_concept
+        )
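
The call signature above mirrors StableDiffusionPipeline but adds elevation, routes the reference image through ip_adapter_image, and renders num_views + 1 frames when a reference image is given (four orbit views plus the slot pinned to the reference latents). A usage sketch under the same placeholder-repo assumption as the loading example after model_index.json; the prompt, image URL, and guidance values are illustrative:

import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image

pipe = DiffusionPipeline.from_pretrained(
    "<user>/<repo>", torch_dtype=torch.float16, trust_remote_code=True
).to("cuda")

# IP-Adapter weights and image encoder as committed under ip_adapter/ above.
pipe.load_ip_adapter(
    "<user>/<repo>",
    subfolder="ip_adapter",
    weight_name="ip-adapter-plus_imagedream.bin",
)

image = load_image("https://example.com/object.png")  # placeholder reference image
views = pipe(
    prompt="a toy robot",
    ip_adapter_image=image,
    num_inference_steps=50,
    guidance_scale=5.0,
    elevation=0.0,
).images
# len(views) == 5: four generated orbit views plus the reference-view slot.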
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,19 @@
+{
+  "_class_name": "DDIMScheduler",
+  "_diffusers_version": "0.29.0",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "clip_sample_range": 1.0,
+  "dynamic_thresholding_ratio": 0.995,
+  "num_train_timesteps": 1000,
+  "prediction_type": "epsilon",
+  "rescale_betas_zero_snr": false,
+  "sample_max_value": 1.0,
+  "set_alpha_to_one": false,
+  "steps_offset": 1,
+  "thresholding": false,
+  "timestep_spacing": "leading",
+  "trained_betas": null
+}
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
+{
+  "_name_or_path": "stabilityai/stable-diffusion-2-1",
+  "architectures": [
+    "CLIPTextModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dropout": 0.0,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_size": 1024,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 77,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 23,
+  "pad_token_id": 1,
+  "projection_dim": 512,
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "vocab_size": 49408
+}
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc1827c465450322616f06dea41596eac7d493f4e95904dcb51f0fc745c4e13f
+size 680820392
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "!",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "!",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49406": {
+      "content": "<|startoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49407": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|startoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "do_lower_case": true,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 77,
+  "pad_token": "!",
+  "tokenizer_class": "CLIPTokenizer",
+  "unk_token": "<|endoftext|>"
+}
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
unet/config.json ADDED
@@ -0,0 +1,77 @@
+{
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.29.0",
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": [
+    5,
+    10,
+    20,
+    20
+  ],
+  "attention_type": "default",
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": "projection",
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "conv_out_kernel": 3,
+  "cross_attention_dim": 1024,
+  "cross_attention_norm": null,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": [
+    2,
+    2,
+    2,
+    2
+  ],
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": 16,
+  "resnet_out_scale_factor": 1.0,
+  "resnet_skip_time_act": false,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 32,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "upcast_attention": false,
+  "use_linear_projection": true
+}
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64ab5e3b897710f00e50447ba087e6c6ebb54beb8b32851ce66978d0c40f049e
+size 1735228080
vae/config.json ADDED
@@ -0,0 +1,36 @@
+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.29.0",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "force_upcast": true,
+  "in_channels": 3,
+  "latent_channels": 4,
+  "latents_mean": null,
+  "latents_std": null,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 256,
+  "scaling_factor": 0.18215,
+  "shift_factor": null,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ],
+  "use_post_quant_conv": true,
+  "use_quant_conv": true
+}
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e4c08995484ee61270175e9e7a072b66a6e4eeb5f0c266667fe1f45b90daf9a
+size 167335342