Jinl committed
Commit 9bf9ce7 · 1 Parent(s): 9c3237f

initial add

Files changed (34)
  1. .gitignore +16 -0
  2. app.py +529 -0
  3. app.sh +7 -0
  4. pdiff/pdiff_pipeline.py +275 -0
  5. requirements.txt +6 -0
  6. style.css +3 -0
  7. utils/__init__.py +0 -0
  8. utils/__pycache__/__init__.cpython-310.pyc +0 -0
  9. utils/__pycache__/__init__.cpython-38.pyc +0 -0
  10. utils/__pycache__/__init__.cpython-39.pyc +0 -0
  11. utils/__pycache__/convert_from_ckpt.cpython-310.pyc +0 -0
  12. utils/__pycache__/convert_from_ckpt.cpython-38.pyc +0 -0
  13. utils/__pycache__/convert_from_ckpt.cpython-39.pyc +0 -0
  14. utils/__pycache__/convert_lora_safetensor_to_diffusers.cpython-310.pyc +0 -0
  15. utils/__pycache__/convert_lora_safetensor_to_diffusers.cpython-38.pyc +0 -0
  16. utils/__pycache__/convert_lora_safetensor_to_diffusers.cpython-39.pyc +0 -0
  17. utils/__pycache__/diffuser_utils.cpython-310.pyc +0 -0
  18. utils/__pycache__/diffuser_utils.cpython-38.pyc +0 -0
  19. utils/__pycache__/diffuser_utils.cpython-39.pyc +0 -0
  20. utils/__pycache__/free_lunch_utils.cpython-310.pyc +0 -0
  21. utils/__pycache__/free_lunch_utils.cpython-38.pyc +0 -0
  22. utils/__pycache__/free_lunch_utils.cpython-39.pyc +0 -0
  23. utils/__pycache__/masactrl_utils.cpython-310.pyc +0 -0
  24. utils/__pycache__/masactrl_utils.cpython-38.pyc +0 -0
  25. utils/__pycache__/masactrl_utils.cpython-39.pyc +0 -0
  26. utils/__pycache__/style_attn_control.cpython-310.pyc +0 -0
  27. utils/__pycache__/style_attn_control.cpython-38.pyc +0 -0
  28. utils/__pycache__/style_attn_control.cpython-39.pyc +0 -0
  29. utils/convert_from_ckpt.py +959 -0
  30. utils/convert_lora_safetensor_to_diffusers.py +154 -0
  31. utils/diffuser_utils.py +275 -0
  32. utils/free_lunch_utils.py +334 -0
  33. utils/masactrl_utils.py +212 -0
  34. utils/style_attn_control.py +275 -0
.gitignore ADDED
@@ -0,0 +1,16 @@
+ ./data
+ ./results
+ ./results_ablation
+ ./workdir
+ ./row_results
+ ./new_res
+ ./cop
+ examper
+ results
+ data
+ results_ablation
+ row_results
+ new_res
+ cop
+ ./samples
+ samples
app.py ADDED
@@ -0,0 +1,529 @@
1
+ import os
2
+ import json
3
+ import re
5
+ import torch
6
+ import random
7
+ import numpy as np
8
+ import gradio as gr
9
+ from glob import glob
10
+ from omegaconf import OmegaConf
11
+ from datetime import datetime
12
+ from safetensors import safe_open
13
+
14
+ from diffusers import AutoencoderKL,UNet2DConditionModel,StableDiffusionPipeline
15
+ from diffusers import DDIMScheduler, EulerDiscreteScheduler, PNDMScheduler
16
+ from diffusers.utils.import_utils import is_xformers_available
17
+ from transformers import CLIPTextModel, CLIPTokenizer
18
+ from utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint
19
+ from utils.convert_lora_safetensor_to_diffusers import convert_lora
20
+
21
+ import torch.nn.functional as F
22
+ from PIL import Image
23
+
24
+ from utils.diffuser_utils import MasaCtrlPipeline
25
+ from utils.masactrl_utils import (AttentionBase,
26
+ regiter_attention_editor_diffusers)
27
+ from utils.free_lunch_utils import register_upblock2d,register_crossattn_upblock2d,register_free_upblock2d, register_free_crossattn_upblock2d
28
+
29
+ from utils.style_attn_control import MaskPromptedStyleAttentionControl
30
+ from torchvision.utils import save_image
31
+ from diffusers.models.attention_processor import AttnProcessor2_0
32
+
33
+
34
+
35
+
36
+ css = """
37
+ .toolbutton {
38
+ margin: 0em 0em 0em 0em;
39
+ max-width: 2.5em;
40
+ min-width: 2.5em !important;
41
+ height: 2.5em;
42
+ }
43
+ """
44
+
45
+ class GlobalText:
46
+ def __init__(self):
47
+
48
+ # config dirs
49
+ self.basedir = os.getcwd()
50
+ self.stable_diffusion_dir = os.path.join(self.basedir, "models", "StableDiffusion")
51
+ self.personalized_model_dir = '/home/jin.liu/liujin/webui/stable-diffusion-webui/models/Stable-diffusion'
52
+ self.lora_model_dir = '/home/jin.liu/liujin/webui/stable-diffusion-webui/models/Lora'
53
+ self.savedir = os.path.join(self.basedir, "samples", datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S"))
54
+ self.savedir_sample = os.path.join(self.savedir, "sample")
55
+
56
+ self.savedir_mask = os.path.join(self.savedir, "mask")
57
+
58
+ self.stable_diffusion_list = ["/home/jin.liu/liujin/cache/hub/models--runwayml--stable-diffusion-v1-5/snapshots/1d0c4ebf6ff58a5caecab40fa1406526bca4b5b9",
59
+ "runwayml/stable-diffusion-v1-5",
60
+ "stabilityai/stable-diffusion-2-1"]
61
+ self.personalized_model_list = []
62
+ self.lora_model_list = []
63
+
64
+ # config models
65
+ self.tokenizer = None
66
+ self.text_encoder = None
67
+ self.vae = None
68
+ self.unet = None
69
+ self.pipeline = None
70
+ self.lora_loaded = None
71
+ self.personal_model_loaded = None
72
+ self.lora_model_state_dict = {}
73
+ self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
74
+ self.refresh_stable_diffusion()
75
+ self.refresh_personalized_model()
76
+
77
+ self.reset_start_code()
78
+ def load_base_pipeline(self, model_path):
79
+ print(f'loading {model_path} model')
80
+ scheduler = DDIMScheduler.from_pretrained(model_path,subfolder="scheduler")
81
+ self.pipeline = MasaCtrlPipeline.from_pretrained(model_path,
82
+ scheduler=scheduler).to(self.device)
83
+
84
+ def refresh_stable_diffusion(self):
85
+
86
+ self.load_base_pipeline(self.stable_diffusion_list[0])
87
+ self.lora_loaded = None
88
+ self.personal_model_loaded = None
89
+ return self.stable_diffusion_list[0]
90
+
91
+ def refresh_personalized_model(self):
92
+ personalized_model_list = glob(os.path.join(self.personalized_model_dir, "**/*.safetensors"), recursive=True)
93
+ self.personalized_model_list = {os.path.basename(file): file for file in personalized_model_list}
94
+
95
+ lora_model_list = glob(os.path.join(self.lora_model_dir, "**/*.safetensors"), recursive=True)
96
+ self.lora_model_list = {os.path.basename(file): file for file in lora_model_list}
97
+
98
+ def update_stable_diffusion(self, stable_diffusion_dropdown):
99
+
100
+ self.load_base_pipeline(stable_diffusion_dropdown)
101
+ self.lora_loaded = None
102
+ self.personal_model_loaded = None
103
+ return gr.Dropdown.update()
104
+
105
+ def update_base_model(self, base_model_dropdown):
106
+ if self.pipeline is None:
107
+ gr.Info(f"Please select a pretrained model path.")
108
+ return None
109
+ else:
110
+ base_model = self.personalized_model_list[base_model_dropdown]
111
+ mid_model = StableDiffusionPipeline.from_single_file(base_model)
112
+ self.pipeline.vae = mid_model.vae
113
+ self.pipeline.unet = mid_model.unet
114
+ self.pipeline.text_encoder = mid_model.text_encoder
115
+ self.pipeline.to(self.device)
116
+ self.personal_model_loaded = base_model_dropdown.split('.')[0]
117
+ print(f'load {base_model_dropdown} model success!')
118
+
119
+ return gr.Dropdown()
120
+
121
+
122
+ def update_lora_model(self, lora_model_dropdown,lora_alpha_slider):
123
+
124
+ if self.pipeline is None:
125
+ gr.Info(f"Please select a pretrained model path.")
126
+ return None
127
+ else:
128
+ if lora_model_dropdown == "none":
129
+ self.pipeline.unfuse_lora()
130
+ self.pipeline.unload_lora_weights()
131
+ self.lora_loaded = None
132
+ # self.personal_model_loaded = None
133
+ print("Restore lora.")
134
+ else:
135
+
136
+ lora_model_path = self.lora_model_list[lora_model_dropdown]#os.path.join(self.lora_model_dir, lora_model_dropdown)
137
+ # self.lora_model_state_dict = {}
138
+ # if lora_model_dropdown == "none": pass
139
+ # else:
140
+ # with safe_open(lora_model_dropdown, framework="pt", device="cpu") as f:
141
+ # for key in f.keys():
142
+ # self.lora_model_state_dict[key] = f.get_tensor(key)
143
+ # convert_lora(self.pipeline, self.lora_model_state_dict, alpha=lora_alpha_slider)
144
+ self.pipeline.unfuse_lora()
145
+ self.pipeline.unload_lora_weights()
146
+ self.pipeline.load_lora_weights(lora_model_path)
147
+ self.pipeline.fuse_lora(lora_alpha_slider)
148
+ self.lora_loaded = lora_model_dropdown.split('.')[0]
149
+ print(f'load {lora_model_dropdown} model success!')
150
+ return gr.Dropdown()
151
+
152
+ def generate(self, source, style, source_mask, style_mask,
153
+ start_step, start_layer, Style_attn_step,
154
+ Method, Style_Guidance, ddim_steps, scale, seed, de_bug,
155
+ target_prompt, negative_prompt_textbox,
156
+ inter_latents,
157
+ freeu, b1, b2, s1, s2,
158
+ width_slider,height_slider,
159
+ ):
160
+ os.makedirs(self.savedir, exist_ok=True)
161
+ os.makedirs(self.savedir_sample, exist_ok=True)
162
+ os.makedirs(self.savedir_mask, exist_ok=True)
163
+ model = self.pipeline
164
+
165
+ if seed != -1 and seed != "": torch.manual_seed(int(seed))
166
+ else: torch.seed()
167
+ seed = torch.initial_seed()
168
+ sample_count = len(os.listdir(self.savedir_sample))
169
+ os.makedirs(os.path.join(self.savedir_mask, f"results_{sample_count}"), exist_ok=True)
170
+
171
+ # ref_prompt = [source_prompt, target_prompt]
172
+ # prompts = ref_prompt+['']
173
+ ref_prompt = [target_prompt, target_prompt]
174
+ prompts = ref_prompt+[target_prompt]
175
+ source_image,style_image,source_mask,style_mask = load_mask_images(source,style,source_mask,style_mask,self.device,width_slider,height_slider,out_dir=os.path.join(self.savedir_mask, f"results_{sample_count}"))
176
+
177
+
178
+ # global START_CODE, LATENTS_LIST
179
+
180
+ with torch.no_grad():
181
+ #import pdb;pdb.set_trace()
182
+
183
+ #prev_source
184
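+ # Cache the DDIM inversion outputs (start noise and per-step latents): repeated runs
+ # with the same source/style images reuse them, and reset_start_code() clears the cache
+ # when a new image is uploaded or the DDIM step count changes.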
+ if self.start_code is None and self.latents_list is None:
185
+ content_style = torch.cat([style_image, source_image], dim=0)
186
+ editor = AttentionBase()
187
+ regiter_attention_editor_diffusers(model, editor)
188
+ st_code, latents_list = model.invert(content_style,
189
+ ref_prompt,
190
+ guidance_scale=scale,
191
+ num_inference_steps=ddim_steps,
192
+ return_intermediates=True)
193
+ start_code = torch.cat([st_code, st_code[1:]], dim=0)
194
+ self.start_code = start_code
195
+ self.latents_list = latents_list
196
+ else:
197
+ start_code = self.start_code
198
+ latents_list = self.latents_list
199
+ print('------------------------------------------ Use previous latents ------------------------------------------ ')
200
+
201
+ #["Without mask", "Only masked region", "Seperate Background Foreground"]
202
+
203
+ if Method == "Without mask":
204
+ style_mask = None
205
+ source_mask = None
206
+ only_masked_region = False
207
+ elif Method == "Only masked region":
208
+ assert style_mask is not None and source_mask is not None
209
+ only_masked_region = True
210
+ else:
211
+ assert style_mask is not None and source_mask is not None
212
+ only_masked_region = False
213
+
214
+ controller = MaskPromptedStyleAttentionControl(start_step, start_layer,
215
+ style_attn_step=Style_attn_step,
216
+ style_guidance=Style_Guidance,
217
+ style_mask=style_mask,
218
+ source_mask=source_mask,
219
+ only_masked_region=only_masked_region,
220
+ guidance=scale,
221
+ de_bug=de_bug,
222
+ )
223
+ if freeu:
224
+ print(f'++++++++++++++++++ Run with FreeU {b1}_{b2}_{s1}_{s2} ++++++++++++++++')
225
+ if Method != "Without mask":
226
+ register_free_upblock2d(model, b1=b1, b2=b2, s1=s1, s2=s2,source_mask=source_mask)
227
+ register_free_crossattn_upblock2d(model, b1=b1, b2=b2, s1=s1, s2=s2,source_mask=source_mask)
228
+ else:
229
+ register_free_upblock2d(model, b1=b1, b2=b2, s1=s1, s2=s2,source_mask=None)
230
+ register_free_crossattn_upblock2d(model, b1=b1, b2=b2, s1=s1, s2=s2,source_mask=None)
231
+
232
+ else:
233
+ print(f'++++++++++++++++++ Run without FreeU ++++++++++++++++')
234
+ register_upblock2d(model)
235
+ register_crossattn_upblock2d(model)
236
+ regiter_attention_editor_diffusers(model, controller)
237
+
238
+ regiter_attention_editor_diffusers(model, controller)
239
+
240
+ # run inference to synthesize the stylized image
241
+ generate_image= model(prompts,
242
+ width=width_slider,
243
+ height=height_slider,
244
+ latents=start_code,
245
+ guidance_scale=scale,
246
+ num_inference_steps=ddim_steps,
247
+ ref_intermediate_latents=latents_list if inter_latents else None,
248
+ neg_prompt=negative_prompt_textbox,
249
+ return_intermediates=False,)
250
+
251
+ # os.makedirs(os.path.join(output_dir, f"results_{sample_count}"))
252
+ save_file_name = f"results_{sample_count}_step{start_step}_layer{start_layer}SG{Style_Guidance}_style_attn_step{Style_attn_step}.jpg"
253
+ if self.lora_loaded != None:
254
+ save_file_name = f"lora_{self.lora_loaded}_" + save_file_name
255
+ if self.personal_model_loaded != None:
256
+ save_file_name = f"personal_{self.personal_model_loaded}_" + save_file_name
257
+ #f"results_{sample_count}_step{start_step}_layer{start_layer}SG{Style_Guidance}_style_attn_step{Style_attn_step}_lora_{self.lora_loaded}.jpg"
258
+ save_file_path = os.path.join(self.savedir_sample, save_file_name)
259
+ #save_file_name = os.path.join(output_dir, f"results_style_{style_name}", f"{content_name}.jpg")
260
+
261
+ save_image(torch.cat([source_image/2 + 0.5, style_image/2 + 0.5, generate_image[2:]], dim=0), save_file_path, nrow=3, padding=0)
262
+
263
+
264
+
265
+ # global OUTPUT_RESULT
266
+ # OUTPUT_RESULT = save_file_name
267
+
268
+ generate_image = generate_image.cpu().permute(0, 2, 3, 1).numpy()
269
+ #save_gif(latents_list, os.path.join(output_dir, f"results_{sample_count}",'output_latents_list.gif'))
270
+ # import pdb;pdb.set_trace()
271
+ #gif_dir = os.path.join(output_dir, f"results_{sample_count}",'output_latents_list.gif')
272
+
273
+ return [
274
+ generate_image[0],
275
+ generate_image[1],
276
+ generate_image[2],
277
+ ]
278
+
279
+ def reset_start_code(self,):
280
+ self.start_code = None
281
+ self.latents_list = None
282
+
283
+ global_text = GlobalText()
284
+
285
+
286
+ def load_mask_images(source,style,source_mask,style_mask,device,width,height,out_dir=None):
287
+ # invert the image into noise map
288
+ if isinstance(source['image'], np.ndarray):
289
+ source_image = torch.from_numpy(source['image']).to(device) / 127.5 - 1.
290
+ else:
291
+ source_image = torch.from_numpy(np.array(source['image'])).to(device) / 127.5 - 1.
292
+ source_image = source_image.unsqueeze(0).permute(0, 3, 1, 2)
293
+
294
+ source_image = F.interpolate(source_image, (height,width ))
295
+
296
+ if out_dir is not None and source_mask is None:
297
+
298
+ source['mask'].save(os.path.join(out_dir,'source_mask.jpg'))
299
+ else:
300
+ Image.fromarray(source_mask).save(os.path.join(out_dir,'source_mask.jpg'))
301
+ if out_dir is not None and style_mask is None:
302
+
303
+ style['mask'].save(os.path.join(out_dir,'style_mask.jpg'))
304
+ else:
305
+ Image.fromarray(style_mask).save(os.path.join(out_dir,'style_mask.jpg'))
306
+ # save source['mask']
307
+ # import pdb;pdb.set_trace()
308
+ source_mask = torch.from_numpy(np.array(source['mask']) if source_mask is None else source_mask).to(device) / 255.
309
+ source_mask = source_mask.unsqueeze(0).permute(0, 3, 1, 2)[:,:1]
310
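+ # downsample the mask by 8x so it matches the VAE latent resolution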
+ source_mask = F.interpolate(source_mask, (height//8,width//8))
311
+
312
+ if isinstance(source['image'], np.ndarray):
313
+ style_image = torch.from_numpy(style['image']).to(device) / 127.5 - 1.
314
+ else:
315
+ style_image = torch.from_numpy(np.array(style['image'])).to(device) / 127.5 - 1.
316
+ style_image = style_image.unsqueeze(0).permute(0, 3, 1, 2)
317
+ style_image = F.interpolate(style_image, (height,width))
318
+
319
+ style_mask = torch.from_numpy(np.array(style['mask']) if style_mask is None else style_mask ).to(device) / 255.
320
+ style_mask = style_mask.unsqueeze(0).permute(0, 3, 1, 2)[:,:1]
321
+ style_mask = F.interpolate(style_mask, (height//8,width//8))
322
+
323
+
324
+ return source_image,style_image,source_mask,style_mask
325
+
326
+
327
+ def ui():
328
+ with gr.Blocks(css=css) as demo:
329
+ gr.Markdown(
330
+ """
331
+ # [Portrait Diffusion: Training-free Face Stylization with Chain-of-Painting](https://arxiv.org/abs/00000)
332
+ Jin Liu, Huaibo Huang, Chao Jin, Ran He* (*Corresponding Author)<br>
333
+ [Arxiv Report](https://arxiv.org/abs/0000) | [Project Page](https://www.github.io/) | [Github](https://github.com/)
334
+ """
335
+ )
336
+ with gr.Column(variant="panel"):
337
+ gr.Markdown(
338
+ """
339
+ ### 1. Select a pretrained model.
340
+ """
341
+ )
342
+ with gr.Row():
343
+ stable_diffusion_dropdown = gr.Dropdown(
344
+ label="Pretrained Model Path",
345
+ choices=global_text.stable_diffusion_list,
346
+ interactive=True,
347
+ allow_custom_value=True
348
+ )
349
+ stable_diffusion_dropdown.change(fn=global_text.update_stable_diffusion, inputs=[stable_diffusion_dropdown], outputs=[stable_diffusion_dropdown])
350
+
351
+ stable_diffusion_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton")
352
+ def update_stable_diffusion():
353
+ global_text.refresh_stable_diffusion()
354
+
355
+ stable_diffusion_refresh_button.click(fn=update_stable_diffusion, inputs=[], outputs=[])
356
+
357
+ base_model_dropdown = gr.Dropdown(
358
+ label="Select a ckpt model (optional)",
359
+ choices=sorted(list(global_text.personalized_model_list.keys())),
360
+ interactive=True,
361
+ allow_custom_value=True,
362
+ )
363
+ base_model_dropdown.change(fn=global_text.update_base_model, inputs=[base_model_dropdown], outputs=[base_model_dropdown])
364
+
365
+ lora_model_dropdown = gr.Dropdown(
366
+ label="Select a LoRA model (optional)",
367
+ choices=["none"] + sorted(list(global_text.lora_model_list.keys())),
368
+ value="none",
369
+ interactive=True,
370
+ allow_custom_value=True,
371
+ )
372
+ lora_alpha_slider = gr.Slider(label="LoRA alpha", value=0.8, minimum=0, maximum=2, interactive=True)
373
+ lora_model_dropdown.change(fn=global_text.update_lora_model, inputs=[lora_model_dropdown,lora_alpha_slider], outputs=[lora_model_dropdown])
374
+
375
+
376
+
377
+ personalized_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton")
378
+
379
+ def update_personalized_model():
380
+ global_text.refresh_personalized_model()
381
+ return [
382
+ gr.Dropdown(choices=sorted(list(global_text.personalized_model_list.keys()))),
383
+ gr.Dropdown(choices=["none"] + sorted(list(global_text.lora_model_list.keys())))
384
+ ]
385
+ personalized_refresh_button.click(fn=update_personalized_model, inputs=[], outputs=[base_model_dropdown, lora_model_dropdown])
386
+
387
+
388
+ with gr.Column(variant="panel"):
389
+ gr.Markdown(
390
+ """
391
+ ### 2. Configs for PortraitDiff.
392
+ """
393
+ )
394
+ with gr.Tab("Configs"):
395
+
396
+ with gr.Row():
397
+ source_image = gr.Image(label="Source Image", elem_id="img2maskimg", source="upload", interactive=True, type="pil", tool="sketch", image_mode="RGB", height=512)
398
+ style_image = gr.Image(label="Style Image", elem_id="img2maskimg", source="upload", interactive=True, type="pil", tool="sketch", image_mode="RGB", height=512)
399
+ with gr.Row():
400
+ prompt_textbox = gr.Textbox(label="Prompt", value='head', lines=1)
401
+ negative_prompt_textbox = gr.Textbox(label="Negative prompt", lines=1)
402
+ # output_dir = gr.Textbox(label="output_dir", value='./results/')
403
+
404
+ with gr.Row().style(equal_height=False):
405
+ with gr.Column():
406
+ width_slider = gr.Slider(label="Width", value=512, minimum=256, maximum=1024, step=64)
407
+ height_slider = gr.Slider(label="Height", value=512, minimum=256, maximum=1024, step=64)
408
+ Method = gr.Dropdown(
409
+ ["Without mask", "Only masked region", "Seperate Background Foreground"],
410
+ value="Without mask",
411
+ label="Mask", info="Select how to use masks")
412
+ with gr.Tab('Base Configs'):
413
+ with gr.Row():
414
+ # sampler_dropdown = gr.Dropdown(label="Sampling method", choices=list(scheduler_dict.keys()), value=list(scheduler_dict.keys())[0])
415
+ ddim_steps = gr.Slider(label="DDIM Steps", value=50, minimum=10, maximum=100, step=1)
416
+
417
+ Style_attn_step = gr.Slider(label="Step of Style Attention Control",
418
+ minimum=0,
419
+ maximum=50,
420
+ value=35,
421
+ step=1)
422
+ start_step = gr.Slider(label="Step of Attention Control",
423
+ minimum=0,
424
+ maximum=150,
425
+ value=0,
426
+ step=1)
427
+ start_layer = gr.Slider(label="Layer of Style Attention Control",
428
+ minimum=0,
429
+ maximum=16,
430
+ value=10,
431
+ step=1)
432
+ Style_Guidance = gr.Slider(label="Style Guidance Scale",
433
+ minimum=0,
434
+ maximum=4,
435
+ value=1.2,
436
+ step=0.05)
437
+ cfg_scale_slider = gr.Slider(label="CFG Scale", value=0, minimum=0, maximum=20)
438
+
439
+
440
+ with gr.Tab('FreeU'):
441
+ with gr.Row():
442
+ freeu = gr.Checkbox(label="Free Upblock", value=False)
443
+ de_bug = gr.Checkbox(value=False,label='DeBug')
444
+ inter_latents = gr.Checkbox(value=True,label='Use intermediate latents')
445
+ with gr.Row():
446
+ b1 = gr.Slider(label='b1:',
447
+ minimum=-1,
448
+ maximum=2,
449
+ step=0.01,
450
+ value=1.3)
451
+ b2 = gr.Slider(label='b2:',
452
+ minimum=-1,
453
+ maximum=2,
454
+ step=0.01,
455
+ value=1.5)
456
+ with gr.Row():
457
+ s1 = gr.Slider(label='s1: ',
458
+ minimum=0,
459
+ maximum=2,
460
+ step=0.1,
461
+ value=1.0)
462
+ s2 = gr.Slider(label='s2:',
463
+ minimum=0,
464
+ maximum=2,
465
+ step=0.1,
466
+ value=1.0)
467
+ with gr.Row():
468
+ seed_textbox = gr.Textbox(label="Seed", value=-1)
469
+ seed_button = gr.Button(value="\U0001F3B2", elem_classes="toolbutton")
470
+ seed_button.click(fn=lambda: random.randint(1, 10**8), inputs=[], outputs=[seed_textbox])
471
+
472
+ with gr.Column():
473
+ generate_button = gr.Button(value="Generate", variant='primary')
474
+
475
+ generate_image = gr.Image(label="Image with PortraitDiff", interactive=False, type='numpy', height=512,)
476
+
477
+ with gr.Row():
478
+ recons_content = gr.Image(label="reconstructed content", type="pil", image_mode="RGB", height=256)
479
+ recons_style = gr.Image(label="reconstructed style", type="pil", image_mode="RGB", height=256)
480
+
481
+ with gr.Tab("SAM"):
482
+ with gr.Column():
483
+ add_or_remove = gr.Radio(["Add Mask", "Remove Area"], value="Add Mask", label="Point_label (foreground/background)")
484
+ with gr.Row():
485
+ sam_source_btn = gr.Button(value="SAM Source")
486
+ send_source_btn = gr.Button(value="Send Source")
487
+
488
+ sam_style_btn = gr.Button(value="SAM Style")
489
+ send_style_btn = gr.Button(value="Send Style")
490
+ with gr.Row():
491
+ source_image_sam = gr.Image(label="Source Image SAM", elem_id="SourceimgSAM", source="upload", interactive=True, type="pil", image_mode="RGB", height=512)
492
+ style_image_sam = gr.Image(label="Style Image SAM", elem_id="StyleimgSAM", source="upload", interactive=True, type="pil", image_mode="RGB", height=512)
493
+
494
+ with gr.Row():
495
+ source_image_with_points = gr.Image(label="source Image with points", elem_id="style_image_with_points", type="pil", image_mode="RGB", height=256)
496
+ source_mask = gr.Image(label="Source Mask", elem_id="img2maskimg", source="upload", interactive=True, type="numpy", image_mode="RGB", height=256)
497
+
498
+ style_image_with_points = gr.Image(label="Style Image with points", elem_id="style_image_with_points", type="pil", image_mode="RGB", height=256)
499
+ style_mask = gr.Image(label="Style Mask", elem_id="img2maskimg", source="upload", interactive=True, type="numpy", image_mode="RGB", height=256)
500
+
501
+ gr.Examples(
502
+ [[os.path.join(os.path.dirname(__file__), "gradio_app/images/content/1.jpg"),
503
+ os.path.join(os.path.dirname(__file__), "gradio_app/images/style/1.jpg")],
504
+
505
+ ],
506
+ [source_image, style_image]
507
+ )
508
+ inputs = [
509
+ source_image, style_image, source_mask, style_mask,
510
+ start_step, start_layer, Style_attn_step,
511
+ Method, Style_Guidance,ddim_steps, cfg_scale_slider, seed_textbox, de_bug,
512
+ prompt_textbox, negative_prompt_textbox, inter_latents,
513
+ freeu, b1, b2, s1, s2,
514
+ width_slider,height_slider,
515
+ ]
516
+
517
+ generate_button.click(
518
+ fn=global_text.generate,
519
+ inputs=inputs,
520
+ outputs=[recons_style,recons_content,generate_image]
521
+ )
522
+ source_image.upload(global_text.reset_start_code, inputs=[], outputs=[])
523
+ style_image.upload(global_text.reset_start_code, inputs=[], outputs=[])
524
+ ddim_steps.change(fn=global_text.reset_start_code, inputs=[], outputs=[])
525
+ return demo
526
+
527
+ if __name__ == "__main__":
528
+ demo = ui()
529
+ demo.launch(server_name="172.18.32.44")
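For reference, the following is a minimal sketch of how the pieces added above fit together outside of Gradio, mirroring GlobalText.generate. The model id, the "head" prompt, and the random placeholder images are assumptions for illustration only (not part of this commit), FreeU registration is omitted, and the utils modules from this commit are assumed to be importable.

    import torch
    from diffusers import DDIMScheduler

    from utils.diffuser_utils import MasaCtrlPipeline
    from utils.masactrl_utils import AttentionBase, regiter_attention_editor_diffusers
    from utils.style_attn_control import MaskPromptedStyleAttentionControl

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_path = "runwayml/stable-diffusion-v1-5"  # assumed model id

    scheduler = DDIMScheduler.from_pretrained(model_path, subfolder="scheduler")
    pipe = MasaCtrlPipeline.from_pretrained(model_path, scheduler=scheduler).to(device)

    prompt = "head"                     # placeholder prompt, as in the Gradio default
    prompts = [prompt, prompt, prompt]  # [style, content, output], matching generate()

    # (2, 3, H, W) tensor in [-1, 1] holding [style_image, source_image];
    # random data stands in for real images here.
    content_style = torch.randn(2, 3, 512, 512, device=device)

    with torch.no_grad():
        # 1) DDIM-invert both images with a plain attention editor registered.
        regiter_attention_editor_diffusers(pipe, AttentionBase())
        start_code, latents_list = pipe.invert(
            content_style, [prompt, prompt],
            guidance_scale=0.0, num_inference_steps=50, return_intermediates=True)
        start_code = torch.cat([start_code, start_code[1:]], dim=0)  # 3 latents for 3 prompts

        # 2) Re-sample with the style attention controller registered (UI default values).
        controller = MaskPromptedStyleAttentionControl(
            0, 10, style_attn_step=35, style_guidance=1.2,
            style_mask=None, source_mask=None, only_masked_region=False,
            guidance=0.0, de_bug=False)
        regiter_attention_editor_diffusers(pipe, controller)
        images = pipe(
            prompts, width=512, height=512, latents=start_code,
            guidance_scale=0.0, num_inference_steps=50,
            ref_intermediate_latents=latents_list)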
app.sh ADDED
@@ -0,0 +1,7 @@
+ #!/bin/bash
+
+ export CUDA_VISIBLE_DEVICES=$1
+
+ echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+ # export CUDA_VISIBLE_DEVICES=5
+ python app.py
pdiff/pdiff_pipeline.py ADDED
@@ -0,0 +1,275 @@
1
+ """
2
+ Util functions based on the Diffusers framework.
3
+ """
4
+
5
+
6
+ import os
7
+ import torch
8
+ import cv2
9
+ import numpy as np
10
+
11
+ import torch.nn.functional as F
12
+ from tqdm import tqdm
13
+ from PIL import Image
14
+ from torchvision.utils import save_image
15
+ from torchvision.io import read_image
16
+
17
+ from diffusers import StableDiffusionPipeline
18
+
19
+ from pytorch_lightning import seed_everything
20
+
21
+
22
+ class MasaCtrlPipeline(StableDiffusionPipeline):
23
+
24
+ def next_step(
25
+ self,
26
+ model_output: torch.FloatTensor,
27
+ timestep: int,
28
+ x: torch.FloatTensor,
29
+ eta=0.,
30
+ verbose=False
31
+ ):
32
+ """
33
+ Inverse sampling for DDIM Inversion
34
+ """
35
+ if verbose:
36
+ print("timestep: ", timestep)
37
+ next_step = timestep
38
+ timestep = min(timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps, 999)
39
+ alpha_prod_t = self.scheduler.alphas_cumprod[timestep] if timestep >= 0 else self.scheduler.final_alpha_cumprod
40
+ alpha_prod_t_next = self.scheduler.alphas_cumprod[next_step]
41
+ beta_prod_t = 1 - alpha_prod_t
42
+ pred_x0 = (x - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
43
+ pred_dir = (1 - alpha_prod_t_next)**0.5 * model_output
44
+ x_next = alpha_prod_t_next**0.5 * pred_x0 + pred_dir
45
+ return x_next, pred_x0
46
+
47
+ def step(
48
+ self,
49
+ model_output: torch.FloatTensor,
50
+ timestep: int,
51
+ x: torch.FloatTensor,
52
+ eta: float=0.0,
53
+ verbose=False,
54
+ ):
55
+ """
56
+ Predict the sample at the next step in the denoising process (one DDIM step).
57
+ """
58
+ prev_timestep = timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
59
+ alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
60
+ alpha_prod_t_prev = self.scheduler.alphas_cumprod[prev_timestep] if prev_timestep > 0 else self.scheduler.final_alpha_cumprod
61
+ beta_prod_t = 1 - alpha_prod_t
62
+ pred_x0 = (x - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
63
+ pred_dir = (1 - alpha_prod_t_prev)**0.5 * model_output
64
+ x_prev = alpha_prod_t_prev**0.5 * pred_x0 + pred_dir
65
+ return x_prev, pred_x0
66
+
67
+ @torch.no_grad()
68
+ def image2latent(self, image):
69
+ DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
70
+ if isinstance(image, Image.Image):
71
+ image = np.array(image)
72
+ image = torch.from_numpy(image).float() / 127.5 - 1
73
+ image = image.permute(2, 0, 1).unsqueeze(0).to(DEVICE)
74
+ # input image density range [-1, 1]
75
+ latents = self.vae.encode(image)['latent_dist'].mean
76
+ latents = latents * 0.18215
77
+ return latents
78
+
79
+ @torch.no_grad()
80
+ def latent2image(self, latents, return_type='np'):
81
+ latents = 1 / 0.18215 * latents.detach()
82
+ image = self.vae.decode(latents)['sample']
83
+ if return_type == 'np':
84
+ image = (image / 2 + 0.5).clamp(0, 1)
85
+ image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
86
+ image = (image * 255).astype(np.uint8)
87
+ elif return_type == "pt":
88
+ image = (image / 2 + 0.5).clamp(0, 1)
89
+
90
+ return image
91
+
92
+ def latent2image_grad(self, latents):
93
+ latents = 1 / 0.18215 * latents
94
+ image = self.vae.decode(latents)['sample']
95
+
96
+ return image # range [-1, 1]
97
+
98
+ @torch.no_grad()
99
+ def __call__(
100
+ self,
101
+ prompt,
102
+ batch_size=1,
103
+ height=512,
104
+ width=512,
105
+ num_inference_steps=50,
106
+ guidance_scale=7.5,
107
+ eta=0.0,
108
+ latents=None,
109
+ unconditioning=None,
110
+ neg_prompt=None,
111
+ ref_intermediate_latents=None,
112
+ return_intermediates=False,
113
+ **kwds):
114
+ DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
115
+ if isinstance(prompt, list):
116
+ batch_size = len(prompt)
117
+ elif isinstance(prompt, str):
118
+ if batch_size > 1:
119
+ prompt = [prompt] * batch_size
120
+
121
+ # text embeddings
122
+ text_input = self.tokenizer(
123
+ prompt,
124
+ padding="max_length",
125
+ max_length=77,
126
+ return_tensors="pt"
127
+ )
128
+
129
+ text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
130
+ print("input text embeddings :", text_embeddings.shape)
131
+ if kwds.get("dir"):
132
+ dir = text_embeddings[-2] - text_embeddings[-1]
133
+ u, s, v = torch.pca_lowrank(dir.transpose(-1, -2), q=1, center=True)
134
+ text_embeddings[-1] = text_embeddings[-1] + kwds.get("dir") * v
135
+ print(u.shape)
136
+ print(v.shape)
137
+
138
+ # define initial latents
139
+ latents_shape = (batch_size, self.unet.config.in_channels, height//8, width//8)
140
+ if latents is None:
141
+ latents = torch.randn(latents_shape, device=DEVICE)
142
+ else:
143
+ assert latents.shape == latents_shape, f"The shape of input latent tensor {latents.shape} should equal to predefined one."
144
+
145
+ # unconditional embedding for classifier free guidance
146
+ if guidance_scale > 1.:
147
+ max_length = text_input.input_ids.shape[-1]
148
+ if neg_prompt:
149
+ uc_text = neg_prompt
150
+ else:
151
+ uc_text = ""
152
+ # uc_text = "ugly, tiling, poorly drawn hands, poorly drawn feet, body out of frame, cut off, low contrast, underexposed, distorted face"
153
+ unconditional_input = self.tokenizer(
154
+ [uc_text] * batch_size,
155
+ padding="max_length",
156
+ max_length=77,
157
+ return_tensors="pt"
158
+ )
159
+ # unconditional_input.input_ids = unconditional_input.input_ids[:, 1:]
160
+ unconditional_embeddings = self.text_encoder(unconditional_input.input_ids.to(DEVICE))[0]
161
+ text_embeddings = torch.cat([unconditional_embeddings, text_embeddings], dim=0)
162
+
163
+ print("latents shape: ", latents.shape)
164
+ # iterative sampling
165
+ self.scheduler.set_timesteps(num_inference_steps)
166
+ # print("Valid timesteps: ", reversed(self.scheduler.timesteps))
167
+ latents_list = [latents]
168
+ pred_x0_list = [latents]
169
+ for i, t in enumerate(tqdm(self.scheduler.timesteps, desc="DDIM Sampler")):
170
+ if ref_intermediate_latents is not None:
171
+ # note that the batch_size >= 2
172
+ latents_ref = ref_intermediate_latents[-1 - i]
173
+ _, latents_cur = latents.chunk(2)
174
+ latents = torch.cat([latents_ref, latents_cur])
175
+
176
+ if guidance_scale > 1.:
177
+ model_inputs = torch.cat([latents] * 2)
178
+ else:
179
+ model_inputs = latents
180
+ if unconditioning is not None and isinstance(unconditioning, list):
181
+ _, text_embeddings = text_embeddings.chunk(2)
182
+ text_embeddings = torch.cat([unconditioning[i].expand(*text_embeddings.shape), text_embeddings])
183
+ # predict the noise
184
+ noise_pred = self.unet(model_inputs, t, encoder_hidden_states=text_embeddings).sample
185
+ if guidance_scale > 1.:
186
+ noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
187
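+ # classifier-free guidance: push the prediction away from the unconditional branch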
+ noise_pred = noise_pred_uncon + guidance_scale * (noise_pred_con - noise_pred_uncon)
188
+ # compute the previous noise sample x_t -> x_t-1
189
+ latents, pred_x0 = self.step(noise_pred, t, latents)
190
+ latents_list.append(latents)
191
+ pred_x0_list.append(pred_x0)
192
+
193
+ image = self.latent2image(latents, return_type="pt")
194
+ if return_intermediates:
195
+ pred_x0_list = [self.latent2image(img, return_type="pt") for img in pred_x0_list]
196
+ latents_list = [self.latent2image(img, return_type="pt") for img in latents_list]
197
+ return image, pred_x0_list, latents_list
198
+ return image
199
+
200
+ @torch.no_grad()
201
+ def invert(
202
+ self,
203
+ image: torch.Tensor,
204
+ prompt,
205
+ num_inference_steps=50,
206
+ guidance_scale=7.5,
207
+ eta=0.0,
208
+ return_intermediates=False,
209
+ **kwds):
210
+ """
211
+ Invert a real image into a noise map with deterministic DDIM inversion.
212
+ """
213
+ DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
214
+ batch_size = image.shape[0]
215
+ if isinstance(prompt, list):
216
+ if batch_size == 1:
217
+ image = image.expand(len(prompt), -1, -1, -1)
218
+ elif isinstance(prompt, str):
219
+ if batch_size > 1:
220
+ prompt = [prompt] * batch_size
221
+
222
+ # text embeddings
223
+ text_input = self.tokenizer(
224
+ prompt,
225
+ padding="max_length",
226
+ max_length=77,
227
+ return_tensors="pt"
228
+ )
229
+ text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
230
+ print("input text embeddings :", text_embeddings.shape)
231
+ # define initial latents
232
+ latents = self.image2latent(image)
233
+ start_latents = latents
234
+ # print(latents)
235
+ # exit()
236
+ # unconditional embedding for classifier free guidance
237
+ if guidance_scale > 1.:
238
+ max_length = text_input.input_ids.shape[-1]
239
+ unconditional_input = self.tokenizer(
240
+ [""] * batch_size,
241
+ padding="max_length",
242
+ max_length=77,
243
+ return_tensors="pt"
244
+ )
245
+ unconditional_embeddings = self.text_encoder(unconditional_input.input_ids.to(DEVICE))[0]
246
+ text_embeddings = torch.cat([unconditional_embeddings, text_embeddings], dim=0)
247
+
248
+ print("latents shape: ", latents.shape)
249
+ # iterative sampling
250
+ self.scheduler.set_timesteps(num_inference_steps)
251
+ print("Valid timesteps: ", reversed(self.scheduler.timesteps))
252
+ # print("attributes: ", self.scheduler.__dict__)
253
+ latents_list = [latents]
254
+ pred_x0_list = [latents]
255
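+ # walk the scheduler timesteps in reverse (t -> t+1) to drive the clean latent back towards noise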
+ for i, t in enumerate(tqdm(reversed(self.scheduler.timesteps), desc="DDIM Inversion")):
256
+ if guidance_scale > 1.:
257
+ model_inputs = torch.cat([latents] * 2)
258
+ else:
259
+ model_inputs = latents
260
+
261
+ # predict the noise
262
+ noise_pred = self.unet(model_inputs, t, encoder_hidden_states=text_embeddings).sample
263
+ if guidance_scale > 1.:
264
+ noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
265
+ noise_pred = noise_pred_uncon + guidance_scale * (noise_pred_con - noise_pred_uncon)
266
+ # compute the previous noise sample x_t-1 -> x_t
267
+ latents, pred_x0 = self.next_step(noise_pred, t, latents)
268
+ latents_list.append(latents)
269
+ pred_x0_list.append(pred_x0)
270
+
271
+ if return_intermediates:
272
+ # return the intermediate latents during inversion
273
+ # pred_x0_list = [self.latent2image(img, return_type="pt") for img in pred_x0_list]
274
+ return latents, latents_list
275
+ return latents, start_latents
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ diffusers==0.15.0
+ transformers
+ opencv-python
+ einops
+ omegaconf
+ pytorch_lightning
style.css ADDED
@@ -0,0 +1,3 @@
+ h1 {
+ text-align: center;
+ }
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-310.pyc ADDED (binary file, 141 Bytes)
utils/__pycache__/__init__.cpython-38.pyc ADDED (binary file, 139 Bytes)
utils/__pycache__/__init__.cpython-39.pyc ADDED (binary file, 139 Bytes)
utils/__pycache__/convert_from_ckpt.cpython-310.pyc ADDED (binary file, 27.2 kB)
utils/__pycache__/convert_from_ckpt.cpython-38.pyc ADDED (binary file, 28.2 kB)
utils/__pycache__/convert_from_ckpt.cpython-39.pyc ADDED (binary file, 27.9 kB)
utils/__pycache__/convert_lora_safetensor_to_diffusers.cpython-310.pyc ADDED (binary file, 3.36 kB)
utils/__pycache__/convert_lora_safetensor_to_diffusers.cpython-38.pyc ADDED (binary file, 3.36 kB)
utils/__pycache__/convert_lora_safetensor_to_diffusers.cpython-39.pyc ADDED (binary file, 3.33 kB)
utils/__pycache__/diffuser_utils.cpython-310.pyc ADDED (binary file, 6.77 kB)
utils/__pycache__/diffuser_utils.cpython-38.pyc ADDED (binary file, 6.8 kB)
utils/__pycache__/diffuser_utils.cpython-39.pyc ADDED (binary file, 6.78 kB)
utils/__pycache__/free_lunch_utils.cpython-310.pyc ADDED (binary file, 8.28 kB)
utils/__pycache__/free_lunch_utils.cpython-38.pyc ADDED (binary file, 8.6 kB)
utils/__pycache__/free_lunch_utils.cpython-39.pyc ADDED (binary file, 8.59 kB)
utils/__pycache__/masactrl_utils.cpython-310.pyc ADDED (binary file, 6.2 kB)
utils/__pycache__/masactrl_utils.cpython-38.pyc ADDED (binary file, 6.71 kB)
utils/__pycache__/masactrl_utils.cpython-39.pyc ADDED (binary file, 6.69 kB)
utils/__pycache__/style_attn_control.cpython-310.pyc ADDED (binary file, 8.73 kB)
utils/__pycache__/style_attn_control.cpython-38.pyc ADDED (binary file, 8.67 kB)
utils/__pycache__/style_attn_control.cpython-39.pyc ADDED (binary file, 8.75 kB)
utils/convert_from_ckpt.py ADDED
@@ -0,0 +1,959 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Conversion script for the Stable Diffusion checkpoints."""
16
+
17
+ import re
18
+ from io import BytesIO
19
+ from typing import Optional
20
+
21
+ import requests
22
+ import torch
23
+ from transformers import (
24
+ AutoFeatureExtractor,
25
+ BertTokenizerFast,
26
+ CLIPImageProcessor,
27
+ CLIPTextModel,
28
+ CLIPTextModelWithProjection,
29
+ CLIPTokenizer,
30
+ CLIPVisionConfig,
31
+ CLIPVisionModelWithProjection,
32
+ )
33
+
34
+ from diffusers.models import (
35
+ AutoencoderKL,
36
+ PriorTransformer,
37
+ UNet2DConditionModel,
38
+ )
39
+ from diffusers.schedulers import (
40
+ DDIMScheduler,
41
+ DDPMScheduler,
42
+ DPMSolverMultistepScheduler,
43
+ EulerAncestralDiscreteScheduler,
44
+ EulerDiscreteScheduler,
45
+ HeunDiscreteScheduler,
46
+ LMSDiscreteScheduler,
47
+ PNDMScheduler,
48
+ UnCLIPScheduler,
49
+ )
50
+ from diffusers.utils.import_utils import BACKENDS_MAPPING
51
+
52
+
53
+ def shave_segments(path, n_shave_prefix_segments=1):
54
+ """
55
+ Removes segments. Positive values shave the first segments, negative shave the last segments.
56
+ """
57
+ if n_shave_prefix_segments >= 0:
58
+ return ".".join(path.split(".")[n_shave_prefix_segments:])
59
+ else:
60
+ return ".".join(path.split(".")[:n_shave_prefix_segments])
61
+
62
+
63
+ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
64
+ """
65
+ Updates paths inside resnets to the new naming scheme (local renaming)
66
+ """
67
+ mapping = []
68
+ for old_item in old_list:
69
+ new_item = old_item.replace("in_layers.0", "norm1")
70
+ new_item = new_item.replace("in_layers.2", "conv1")
71
+
72
+ new_item = new_item.replace("out_layers.0", "norm2")
73
+ new_item = new_item.replace("out_layers.3", "conv2")
74
+
75
+ new_item = new_item.replace("emb_layers.1", "time_emb_proj")
76
+ new_item = new_item.replace("skip_connection", "conv_shortcut")
77
+
78
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
79
+
80
+ mapping.append({"old": old_item, "new": new_item})
81
+
82
+ return mapping
83
+
84
+
85
+ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
86
+ """
87
+ Updates paths inside resnets to the new naming scheme (local renaming)
88
+ """
89
+ mapping = []
90
+ for old_item in old_list:
91
+ new_item = old_item
92
+
93
+ new_item = new_item.replace("nin_shortcut", "conv_shortcut")
94
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
95
+
96
+ mapping.append({"old": old_item, "new": new_item})
97
+
98
+ return mapping
99
+
100
+
101
+ def renew_attention_paths(old_list, n_shave_prefix_segments=0):
102
+ """
103
+ Updates paths inside attentions to the new naming scheme (local renaming)
104
+ """
105
+ mapping = []
106
+ for old_item in old_list:
107
+ new_item = old_item
108
+
109
+ # new_item = new_item.replace('norm.weight', 'group_norm.weight')
110
+ # new_item = new_item.replace('norm.bias', 'group_norm.bias')
111
+
112
+ # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
113
+ # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
114
+
115
+ # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
116
+
117
+ mapping.append({"old": old_item, "new": new_item})
118
+
119
+ return mapping
120
+
121
+
122
+ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
123
+ """
124
+ Updates paths inside attentions to the new naming scheme (local renaming)
125
+ """
126
+ mapping = []
127
+ for old_item in old_list:
128
+ new_item = old_item
129
+
130
+ new_item = new_item.replace("norm.weight", "group_norm.weight")
131
+ new_item = new_item.replace("norm.bias", "group_norm.bias")
132
+
133
+ new_item = new_item.replace("q.weight", "query.weight")
134
+ new_item = new_item.replace("q.bias", "query.bias")
135
+
136
+ new_item = new_item.replace("k.weight", "key.weight")
137
+ new_item = new_item.replace("k.bias", "key.bias")
138
+
139
+ new_item = new_item.replace("v.weight", "value.weight")
140
+ new_item = new_item.replace("v.bias", "value.bias")
141
+
142
+ new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
143
+ new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
144
+
145
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
146
+
147
+ mapping.append({"old": old_item, "new": new_item})
148
+
149
+ return mapping
150
+
151
+
152
+ def assign_to_checkpoint(
153
+ paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
154
+ ):
155
+ """
156
+ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits
157
+ attention layers, and takes into account additional replacements that may arise.
158
+
159
+ Assigns the weights to the new checkpoint.
160
+ """
161
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
162
+
163
+ # Splits the attention layers into three variables.
164
+ if attention_paths_to_split is not None:
165
+ for path, path_map in attention_paths_to_split.items():
166
+ old_tensor = old_checkpoint[path]
167
+ channels = old_tensor.shape[0] // 3
168
+
169
+ target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
170
+
171
+ num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
172
+
173
+ old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
174
+ query, key, value = old_tensor.split(channels // num_heads, dim=1)
175
+
176
+ checkpoint[path_map["query"]] = query.reshape(target_shape)
177
+ checkpoint[path_map["key"]] = key.reshape(target_shape)
178
+ checkpoint[path_map["value"]] = value.reshape(target_shape)
179
+
180
+ for path in paths:
181
+ new_path = path["new"]
182
+
183
+ # These have already been assigned
184
+ if attention_paths_to_split is not None and new_path in attention_paths_to_split:
185
+ continue
186
+
187
+ # Global renaming happens here
188
+ new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
189
+ new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
190
+ new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
191
+
192
+ if additional_replacements is not None:
193
+ for replacement in additional_replacements:
194
+ new_path = new_path.replace(replacement["old"], replacement["new"])
195
+
196
+ # proj_attn.weight has to be converted from conv 1D to linear
197
+ if "proj_attn.weight" in new_path:
198
+ checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
199
+ else:
200
+ checkpoint[new_path] = old_checkpoint[path["old"]]
201
+
202
+
203
+ def conv_attn_to_linear(checkpoint):
204
+ keys = list(checkpoint.keys())
205
+ attn_keys = ["query.weight", "key.weight", "value.weight"]
206
+ for key in keys:
207
+ if ".".join(key.split(".")[-2:]) in attn_keys:
208
+ if checkpoint[key].ndim > 2:
209
+ checkpoint[key] = checkpoint[key][:, :, 0, 0]
210
+ elif "proj_attn.weight" in key:
211
+ if checkpoint[key].ndim > 2:
212
+ checkpoint[key] = checkpoint[key][:, :, 0]
213
+
214
+
215
+ def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
216
+ """
217
+ Creates a config for the diffusers based on the config of the LDM model.
218
+ """
219
+ if controlnet:
220
+ unet_params = original_config.model.params.control_stage_config.params
221
+ else:
222
+ unet_params = original_config.model.params.unet_config.params
223
+
224
+ vae_params = original_config.model.params.first_stage_config.params.ddconfig
225
+
226
+ block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
227
+
228
+ down_block_types = []
229
+ resolution = 1
230
+ for i in range(len(block_out_channels)):
231
+ block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
232
+ down_block_types.append(block_type)
233
+ if i != len(block_out_channels) - 1:
234
+ resolution *= 2
235
+
236
+ up_block_types = []
237
+ for i in range(len(block_out_channels)):
238
+ block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
239
+ up_block_types.append(block_type)
240
+ resolution //= 2
241
+
242
+ vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
243
+
244
+ head_dim = unet_params.num_heads if "num_heads" in unet_params else None
245
+ use_linear_projection = (
246
+ unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
247
+ )
248
+ if use_linear_projection:
249
+ # stable diffusion 2-base-512 and 2-768
250
+ if head_dim is None:
251
+ head_dim = [5, 10, 20, 20]
252
+
253
+ class_embed_type = None
254
+ projection_class_embeddings_input_dim = None
255
+
256
+ if "num_classes" in unet_params:
257
+ if unet_params.num_classes == "sequential":
258
+ class_embed_type = "projection"
259
+ assert "adm_in_channels" in unet_params
260
+ projection_class_embeddings_input_dim = unet_params.adm_in_channels
261
+ else:
262
+ raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}")
263
+
264
+ config = {
265
+ "sample_size": image_size // vae_scale_factor,
266
+ "in_channels": unet_params.in_channels,
267
+ "down_block_types": tuple(down_block_types),
268
+ "block_out_channels": tuple(block_out_channels),
269
+ "layers_per_block": unet_params.num_res_blocks,
270
+ "cross_attention_dim": unet_params.context_dim,
271
+ "attention_head_dim": head_dim,
272
+ "use_linear_projection": use_linear_projection,
273
+ "class_embed_type": class_embed_type,
274
+ "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
275
+ }
276
+
277
+ if not controlnet:
278
+ config["out_channels"] = unet_params.out_channels
279
+ config["up_block_types"] = tuple(up_block_types)
280
+
281
+ return config
282
+
283
+
284
+ def create_vae_diffusers_config(original_config, image_size: int):
285
+ """
286
+ Creates a config for the diffusers based on the config of the LDM model.
287
+ """
288
+ vae_params = original_config.model.params.first_stage_config.params.ddconfig
289
+ _ = original_config.model.params.first_stage_config.params.embed_dim
290
+
291
+ block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]
292
+ down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
293
+ up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
294
+
295
+ config = {
296
+ "sample_size": image_size,
297
+ "in_channels": vae_params.in_channels,
298
+ "out_channels": vae_params.out_ch,
299
+ "down_block_types": tuple(down_block_types),
300
+ "up_block_types": tuple(up_block_types),
301
+ "block_out_channels": tuple(block_out_channels),
302
+ "latent_channels": vae_params.z_channels,
303
+ "layers_per_block": vae_params.num_res_blocks,
304
+ }
305
+ return config
306
+
307
+
308
+ def create_diffusers_schedular(original_config):
309
+ schedular = DDIMScheduler(
310
+ num_train_timesteps=original_config.model.params.timesteps,
311
+ beta_start=original_config.model.params.linear_start,
312
+ beta_end=original_config.model.params.linear_end,
313
+ beta_schedule="scaled_linear",
314
+ )
315
+ return schedular
316
+
317
+
318
+ def create_ldm_bert_config(original_config):
319
+ bert_params = original_config.model.params.cond_stage_config.params
320
+ config = LDMBertConfig(
321
+ d_model=bert_params.n_embed,
322
+ encoder_layers=bert_params.n_layer,
323
+ encoder_ffn_dim=bert_params.n_embed * 4,
324
+ )
325
+ return config
326
+
327
+
328
+ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, controlnet=False):
329
+ """
330
+ Takes a state dict and a config, and returns a converted checkpoint.
331
+ """
332
+
333
+ # extract state_dict for UNet
334
+ unet_state_dict = {}
335
+ keys = list(checkpoint.keys())
336
+
337
+ if controlnet:
338
+ unet_key = "control_model."
339
+ else:
340
+ unet_key = "model.diffusion_model."
341
+
342
+ # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
343
+ if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
344
+ print(f"Checkpoint {path} has both EMA and non-EMA weights.")
345
+ print(
346
+ "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
347
+ " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
348
+ )
349
+ for key in keys:
350
+ if key.startswith("model.diffusion_model"):
351
+ flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
352
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
353
+ else:
354
+ if sum(k.startswith("model_ema") for k in keys) > 100:
355
+ print(
356
+ "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
357
+ " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
358
+ )
359
+
360
+ for key in keys:
361
+ if key.startswith(unet_key):
362
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
363
+
364
+ new_checkpoint = {}
365
+
366
+ new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
367
+ new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
368
+ new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
369
+ new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
370
+
371
+ if config["class_embed_type"] is None:
372
+ # No parameters to port
373
+ ...
374
+ elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
375
+ new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
376
+ new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
377
+ new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
378
+ new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
379
+ else:
380
+ raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
381
+
382
+ new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
383
+ new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
384
+
385
+ if not controlnet:
386
+ new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
387
+ new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
388
+ new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
389
+ new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
390
+
391
+ # Retrieves the keys for the input blocks only
392
+ num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
393
+ input_blocks = {
394
+ layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
395
+ for layer_id in range(num_input_blocks)
396
+ }
397
+
398
+ # Retrieves the keys for the middle blocks only
399
+ num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
400
+ middle_blocks = {
401
+ layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
402
+ for layer_id in range(num_middle_blocks)
403
+ }
404
+
405
+ # Retrieves the keys for the output blocks only
406
+ num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
407
+ output_blocks = {
408
+ layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
409
+ for layer_id in range(num_output_blocks)
410
+ }
411
+
412
+ for i in range(1, num_input_blocks):
413
+ block_id = (i - 1) // (config["layers_per_block"] + 1)
414
+ layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
415
+
416
+ resnets = [
417
+ key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
418
+ ]
419
+ attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
420
+
421
+ if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
422
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
423
+ f"input_blocks.{i}.0.op.weight"
424
+ )
425
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
426
+ f"input_blocks.{i}.0.op.bias"
427
+ )
428
+
429
+ paths = renew_resnet_paths(resnets)
430
+ meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
431
+ assign_to_checkpoint(
432
+ paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
433
+ )
434
+
435
+ if len(attentions):
436
+ paths = renew_attention_paths(attentions)
437
+ meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
438
+ assign_to_checkpoint(
439
+ paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
440
+ )
441
+
442
+ resnet_0 = middle_blocks[0]
443
+ attentions = middle_blocks[1]
444
+ resnet_1 = middle_blocks[2]
445
+
446
+ resnet_0_paths = renew_resnet_paths(resnet_0)
447
+ assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
448
+
449
+ resnet_1_paths = renew_resnet_paths(resnet_1)
450
+ assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
451
+
452
+ attentions_paths = renew_attention_paths(attentions)
453
+ meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
454
+ assign_to_checkpoint(
455
+ attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
456
+ )
457
+
458
+ for i in range(num_output_blocks):
459
+ block_id = i // (config["layers_per_block"] + 1)
460
+ layer_in_block_id = i % (config["layers_per_block"] + 1)
461
+ output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
462
+ output_block_list = {}
463
+
464
+ for layer in output_block_layers:
465
+ layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
466
+ if layer_id in output_block_list:
467
+ output_block_list[layer_id].append(layer_name)
468
+ else:
469
+ output_block_list[layer_id] = [layer_name]
470
+
471
+ if len(output_block_list) > 1:
472
+ resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
473
+ attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
474
+
475
+ resnet_0_paths = renew_resnet_paths(resnets)
476
+ paths = renew_resnet_paths(resnets)
477
+
478
+ meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
479
+ assign_to_checkpoint(
480
+ paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
481
+ )
482
+
483
+ output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
484
+ if ["conv.bias", "conv.weight"] in output_block_list.values():
485
+ index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
486
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
487
+ f"output_blocks.{i}.{index}.conv.weight"
488
+ ]
489
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
490
+ f"output_blocks.{i}.{index}.conv.bias"
491
+ ]
492
+
493
+ # Clear attentions as they have been attributed above.
494
+ if len(attentions) == 2:
495
+ attentions = []
496
+
497
+ if len(attentions):
498
+ paths = renew_attention_paths(attentions)
499
+ meta_path = {
500
+ "old": f"output_blocks.{i}.1",
501
+ "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
502
+ }
503
+ assign_to_checkpoint(
504
+ paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
505
+ )
506
+ else:
507
+ resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
508
+ for path in resnet_0_paths:
509
+ old_path = ".".join(["output_blocks", str(i), path["old"]])
510
+ new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
511
+
512
+ new_checkpoint[new_path] = unet_state_dict[old_path]
513
+
514
+ if controlnet:
515
+ # conditioning embedding
516
+
517
+ orig_index = 0
518
+
519
+ new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
520
+ f"input_hint_block.{orig_index}.weight"
521
+ )
522
+ new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
523
+ f"input_hint_block.{orig_index}.bias"
524
+ )
525
+
526
+ orig_index += 2
527
+
528
+ diffusers_index = 0
529
+
530
+ while diffusers_index < 6:
531
+ new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
532
+ f"input_hint_block.{orig_index}.weight"
533
+ )
534
+ new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
535
+ f"input_hint_block.{orig_index}.bias"
536
+ )
537
+ diffusers_index += 1
538
+ orig_index += 2
539
+
540
+ new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
541
+ f"input_hint_block.{orig_index}.weight"
542
+ )
543
+ new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
544
+ f"input_hint_block.{orig_index}.bias"
545
+ )
546
+
547
+ # down blocks
548
+ for i in range(num_input_blocks):
549
+ new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
550
+ new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
551
+
552
+ # mid block
553
+ new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
554
+ new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
555
+
556
+ return new_checkpoint
557
+
558
+
559
+ def convert_ldm_vae_checkpoint(checkpoint, config):
560
+ # extract state dict for VAE
561
+ vae_state_dict = {}
562
+ vae_key = "first_stage_model."
563
+ keys = list(checkpoint.keys())
564
+ for key in keys:
565
+ if key.startswith(vae_key):
566
+ vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
567
+
568
+ new_checkpoint = {}
569
+
570
+ new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
571
+ new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
572
+ new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
573
+ new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
574
+ new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
575
+ new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
576
+
577
+ new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
578
+ new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
579
+ new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
580
+ new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
581
+ new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
582
+ new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
583
+
584
+ new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
585
+ new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
586
+ new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
587
+ new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
588
+
589
+ # Retrieves the keys for the encoder down blocks only
590
+ num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
591
+ down_blocks = {
592
+ layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
593
+ }
594
+
595
+ # Retrieves the keys for the decoder up blocks only
596
+ num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
597
+ up_blocks = {
598
+ layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
599
+ }
600
+
601
+ for i in range(num_down_blocks):
602
+ resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
603
+
604
+ if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
605
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
606
+ f"encoder.down.{i}.downsample.conv.weight"
607
+ )
608
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
609
+ f"encoder.down.{i}.downsample.conv.bias"
610
+ )
611
+
612
+ paths = renew_vae_resnet_paths(resnets)
613
+ meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
614
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
615
+
616
+ mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
617
+ num_mid_res_blocks = 2
618
+ for i in range(1, num_mid_res_blocks + 1):
619
+ resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
620
+
621
+ paths = renew_vae_resnet_paths(resnets)
622
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
623
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
624
+
625
+ mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
626
+ paths = renew_vae_attention_paths(mid_attentions)
627
+ meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
628
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
629
+ conv_attn_to_linear(new_checkpoint)
630
+
631
+ for i in range(num_up_blocks):
632
+ block_id = num_up_blocks - 1 - i
633
+ resnets = [
634
+ key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
635
+ ]
636
+
637
+ if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
638
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
639
+ f"decoder.up.{block_id}.upsample.conv.weight"
640
+ ]
641
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
642
+ f"decoder.up.{block_id}.upsample.conv.bias"
643
+ ]
644
+
645
+ paths = renew_vae_resnet_paths(resnets)
646
+ meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
647
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
648
+
649
+ mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
650
+ num_mid_res_blocks = 2
651
+ for i in range(1, num_mid_res_blocks + 1):
652
+ resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
653
+
654
+ paths = renew_vae_resnet_paths(resnets)
655
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
656
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
657
+
658
+ mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
659
+ paths = renew_vae_attention_paths(mid_attentions)
660
+ meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
661
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
662
+ conv_attn_to_linear(new_checkpoint)
663
+ return new_checkpoint
664
+
665
+
666
+ def convert_ldm_bert_checkpoint(checkpoint, config):
667
+ def _copy_attn_layer(hf_attn_layer, pt_attn_layer):
668
+ hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight
669
+ hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight
670
+ hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight
671
+
672
+ hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight
673
+ hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias
674
+
675
+ def _copy_linear(hf_linear, pt_linear):
676
+ hf_linear.weight = pt_linear.weight
677
+ hf_linear.bias = pt_linear.bias
678
+
679
+ def _copy_layer(hf_layer, pt_layer):
680
+ # copy layer norms
681
+ _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0])
682
+ _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0])
683
+
684
+ # copy attn
685
+ _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1])
686
+
687
+ # copy MLP
688
+ pt_mlp = pt_layer[1][1]
689
+ _copy_linear(hf_layer.fc1, pt_mlp.net[0][0])
690
+ _copy_linear(hf_layer.fc2, pt_mlp.net[2])
691
+
692
+ def _copy_layers(hf_layers, pt_layers):
693
+ for i, hf_layer in enumerate(hf_layers):
694
+ if i != 0:
695
+ i += i
696
+ pt_layer = pt_layers[i : i + 2]
697
+ _copy_layer(hf_layer, pt_layer)
698
+
699
+ hf_model = LDMBertModel(config).eval()
700
+
701
+ # copy embeds
702
+ hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight
703
+ hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight
704
+
705
+ # copy layer norm
706
+ _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm)
707
+
708
+ # copy hidden layers
709
+ _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers)
710
+
711
+ _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits)
712
+
713
+ return hf_model
714
+
715
+
716
+ def convert_ldm_clip_checkpoint(checkpoint):
717
+ text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
718
+ keys = list(checkpoint.keys())
719
+
720
+ text_model_dict = {}
721
+
722
+ for key in keys:
723
+ if key.startswith("cond_stage_model.transformer"):
724
+ text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
725
+
726
+ text_model.load_state_dict(text_model_dict)
727
+
728
+ return text_model
729
+
730
+
731
+ textenc_conversion_lst = [
732
+ ("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"),
733
+ ("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
734
+ ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"),
735
+ ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"),
736
+ ]
737
+ textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst}
738
+
739
+ textenc_transformer_conversion_lst = [
740
+ # (stable-diffusion, HF Diffusers)
741
+ ("resblocks.", "text_model.encoder.layers."),
742
+ ("ln_1", "layer_norm1"),
743
+ ("ln_2", "layer_norm2"),
744
+ (".c_fc.", ".fc1."),
745
+ (".c_proj.", ".fc2."),
746
+ (".attn", ".self_attn"),
747
+ ("ln_final.", "transformer.text_model.final_layer_norm."),
748
+ ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
749
+ ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
750
+ ]
751
+ protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst}
752
+ textenc_pattern = re.compile("|".join(protected.keys()))
753
+
754
+
755
+ def convert_paint_by_example_checkpoint(checkpoint):
756
+ config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14")
757
+ model = PaintByExampleImageEncoder(config)
758
+
759
+ keys = list(checkpoint.keys())
760
+
761
+ text_model_dict = {}
762
+
763
+ for key in keys:
764
+ if key.startswith("cond_stage_model.transformer"):
765
+ text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
766
+
767
+ # load clip vision
768
+ model.model.load_state_dict(text_model_dict)
769
+
770
+ # load mapper
771
+ keys_mapper = {
772
+ k[len("cond_stage_model.mapper.res") :]: v
773
+ for k, v in checkpoint.items()
774
+ if k.startswith("cond_stage_model.mapper")
775
+ }
776
+
777
+ MAPPING = {
778
+ "attn.c_qkv": ["attn1.to_q", "attn1.to_k", "attn1.to_v"],
779
+ "attn.c_proj": ["attn1.to_out.0"],
780
+ "ln_1": ["norm1"],
781
+ "ln_2": ["norm3"],
782
+ "mlp.c_fc": ["ff.net.0.proj"],
783
+ "mlp.c_proj": ["ff.net.2"],
784
+ }
785
+
786
+ mapped_weights = {}
787
+ for key, value in keys_mapper.items():
788
+ prefix = key[: len("blocks.i")]
789
+ suffix = key.split(prefix)[-1].split(".")[-1]
790
+ name = key.split(prefix)[-1].split(suffix)[0][1:-1]
791
+ mapped_names = MAPPING[name]
792
+
793
+ num_splits = len(mapped_names)
794
+ for i, mapped_name in enumerate(mapped_names):
795
+ new_name = ".".join([prefix, mapped_name, suffix])
796
+ shape = value.shape[0] // num_splits
797
+ mapped_weights[new_name] = value[i * shape : (i + 1) * shape]
798
+
799
+ model.mapper.load_state_dict(mapped_weights)
800
+
801
+ # load final layer norm
802
+ model.final_layer_norm.load_state_dict(
803
+ {
804
+ "bias": checkpoint["cond_stage_model.final_ln.bias"],
805
+ "weight": checkpoint["cond_stage_model.final_ln.weight"],
806
+ }
807
+ )
808
+
809
+ # load final proj
810
+ model.proj_out.load_state_dict(
811
+ {
812
+ "bias": checkpoint["proj_out.bias"],
813
+ "weight": checkpoint["proj_out.weight"],
814
+ }
815
+ )
816
+
817
+ # load uncond vector
818
+ model.uncond_vector.data = torch.nn.Parameter(checkpoint["learnable_vector"])
819
+ return model
820
+
821
+
822
+ def convert_open_clip_checkpoint(checkpoint):
823
+ text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
824
+
825
+ keys = list(checkpoint.keys())
826
+
827
+ text_model_dict = {}
828
+
829
+ if "cond_stage_model.model.text_projection" in checkpoint:
830
+ d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0])
831
+ else:
832
+ d_model = 1024
833
+
834
+ text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
835
+
836
+ for key in keys:
837
+ if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer
838
+ continue
839
+ if key in textenc_conversion_map:
840
+ text_model_dict[textenc_conversion_map[key]] = checkpoint[key]
841
+ if key.startswith("cond_stage_model.model.transformer."):
842
+ new_key = key[len("cond_stage_model.model.transformer.") :]
843
+ if new_key.endswith(".in_proj_weight"):
844
+ new_key = new_key[: -len(".in_proj_weight")]
845
+ new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
846
+ text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :]
847
+ text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :]
848
+ text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :]
849
+ elif new_key.endswith(".in_proj_bias"):
850
+ new_key = new_key[: -len(".in_proj_bias")]
851
+ new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
852
+ text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model]
853
+ text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2]
854
+ text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :]
855
+ else:
856
+ new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
857
+
858
+ text_model_dict[new_key] = checkpoint[key]
859
+
860
+ text_model.load_state_dict(text_model_dict)
861
+
862
+ return text_model
863
+
864
+
865
+ def stable_unclip_image_encoder(original_config):
866
+ """
867
+ Returns the image processor and clip image encoder for the img2img unclip pipeline.
868
+
869
+ We currently know of two types of stable unclip models which separately use the clip and the openclip image
870
+ encoders.
871
+ """
872
+
873
+ image_embedder_config = original_config.model.params.embedder_config
874
+
875
+ sd_clip_image_embedder_class = image_embedder_config.target
876
+ sd_clip_image_embedder_class = sd_clip_image_embedder_class.split(".")[-1]
877
+
878
+ if sd_clip_image_embedder_class == "ClipImageEmbedder":
879
+ clip_model_name = image_embedder_config.params.model
880
+
881
+ if clip_model_name == "ViT-L/14":
882
+ feature_extractor = CLIPImageProcessor()
883
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
884
+ else:
885
+ raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}")
886
+
887
+ elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder":
888
+ feature_extractor = CLIPImageProcessor()
889
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
890
+ else:
891
+ raise NotImplementedError(
892
+ f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}"
893
+ )
894
+
895
+ return feature_extractor, image_encoder
896
+
897
+
898
+ def stable_unclip_image_noising_components(
899
+ original_config, clip_stats_path: Optional[str] = None, device: Optional[str] = None
900
+ ):
901
+ """
902
+ Returns the noising components for the img2img and txt2img unclip pipelines.
903
+
904
+ Converts the stability noise augmentor into
905
+ 1. a `StableUnCLIPImageNormalizer` for holding the CLIP stats
906
+ 2. a `DDPMScheduler` for holding the noise schedule
907
+
908
+ If the noise augmentor config specifies a clip stats path, the `clip_stats_path` must be provided.
909
+ """
910
+ noise_aug_config = original_config.model.params.noise_aug_config
911
+ noise_aug_class = noise_aug_config.target
912
+ noise_aug_class = noise_aug_class.split(".")[-1]
913
+
914
+ if noise_aug_class == "CLIPEmbeddingNoiseAugmentation":
915
+ noise_aug_config = noise_aug_config.params
916
+ embedding_dim = noise_aug_config.timestep_dim
917
+ max_noise_level = noise_aug_config.noise_schedule_config.timesteps
918
+ beta_schedule = noise_aug_config.noise_schedule_config.beta_schedule
919
+
920
+ image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedding_dim)
921
+ image_noising_scheduler = DDPMScheduler(num_train_timesteps=max_noise_level, beta_schedule=beta_schedule)
922
+
923
+ if "clip_stats_path" in noise_aug_config:
924
+ if clip_stats_path is None:
925
+ raise ValueError("This stable unclip config requires a `clip_stats_path`")
926
+
927
+ clip_mean, clip_std = torch.load(clip_stats_path, map_location=device)
928
+ clip_mean = clip_mean[None, :]
929
+ clip_std = clip_std[None, :]
930
+
931
+ clip_stats_state_dict = {
932
+ "mean": clip_mean,
933
+ "std": clip_std,
934
+ }
935
+
936
+ image_normalizer.load_state_dict(clip_stats_state_dict)
937
+ else:
938
+ raise NotImplementedError(f"Unknown noise augmentor class: {noise_aug_class}")
939
+
940
+ return image_normalizer, image_noising_scheduler
941
+
942
+
943
+ def convert_controlnet_checkpoint(
944
+ checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema
945
+ ):
946
+ ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True)
947
+ ctrlnet_config["upcast_attention"] = upcast_attention
948
+
949
+ ctrlnet_config.pop("sample_size")
950
+
951
+ controlnet_model = ControlNetModel(**ctrlnet_config)
952
+
953
+ converted_ctrl_checkpoint = convert_ldm_unet_checkpoint(
954
+ checkpoint, ctrlnet_config, path=checkpoint_path, extract_ema=extract_ema, controlnet=True
955
+ )
956
+
957
+ controlnet_model.load_state_dict(converted_ctrl_checkpoint)
958
+
959
+ return controlnet_model
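A minimal usage sketch for the converters above, assuming the `create_unet_diffusers_config` / `create_vae_diffusers_config` builders defined earlier in this file; the checkpoint and YAML paths are hypothetical placeholders:

    import torch
    from omegaconf import OmegaConf
    from diffusers import AutoencoderKL, UNet2DConditionModel

    # hypothetical paths: any SD 1.x LDM checkpoint plus its original config
    checkpoint = torch.load("models/v1-5.ckpt", map_location="cpu")["state_dict"]
    original_config = OmegaConf.load("configs/v1-inference.yaml")

    # UNet: build the diffusers config, remap the LDM keys, then load
    unet_config = create_unet_diffusers_config(original_config, image_size=512)
    unet = UNet2DConditionModel(**unet_config)
    unet.load_state_dict(convert_ldm_unet_checkpoint(checkpoint, unet_config, extract_ema=False))

    # VAE and CLIP text encoder follow the same pattern
    vae_config = create_vae_diffusers_config(original_config, image_size=512)
    vae = AutoencoderKL(**vae_config)
    vae.load_state_dict(convert_ldm_vae_checkpoint(checkpoint, vae_config))
    text_encoder = convert_ldm_clip_checkpoint(checkpoint)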
utils/convert_lora_safetensor_to_diffusers.py ADDED
@@ -0,0 +1,154 @@
1
+ # coding=utf-8
2
+ # Copyright 2023, Haofan Wang, Qixun Wang, All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """ Conversion script for the LoRA's safetensors checkpoints. """
17
+
18
+ import argparse
19
+
20
+ import torch
21
+ from safetensors.torch import load_file
22
+
23
+ from diffusers import StableDiffusionPipeline
24
+ import pdb
25
+
26
+
27
+
28
+ def convert_motion_lora_ckpt_to_diffusers(pipeline, state_dict, alpha=1.0):
29
+ # directly update weight in diffusers model
30
+ for key in state_dict:
31
+ # only process lora down key
32
+ if "up." in key: continue
33
+
34
+ up_key = key.replace(".down.", ".up.")
35
+ model_key = key.replace("processor.", "").replace("_lora", "").replace("down.", "").replace("up.", "")
36
+ model_key = model_key.replace("to_out.", "to_out.0.")
37
+ layer_infos = model_key.split(".")[:-1]
38
+
39
+ curr_layer = pipeline.unet
40
+ while len(layer_infos) > 0:
41
+ temp_name = layer_infos.pop(0)
42
+ curr_layer = curr_layer.__getattr__(temp_name)
43
+
44
+ weight_down = state_dict[key]
45
+ weight_up = state_dict[up_key]
46
+ curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)
47
+
48
+ return pipeline
49
+
50
+
51
+
52
+ def convert_lora(pipeline, state_dict, LORA_PREFIX_UNET="lora_unet", LORA_PREFIX_TEXT_ENCODER="lora_te", alpha=0.6):
53
+ # load base model
54
+ # pipeline = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32)
55
+
56
+ # load LoRA weight from .safetensors
57
+ # state_dict = load_file(checkpoint_path)
58
+
59
+ visited = []
60
+
61
+ # directly update weight in diffusers model
62
+ for key in state_dict:
63
+ # it is suggested to print the key; it will usually look like the example below
64
+ # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"
65
+
66
+ # the alpha has already been set beforehand, so just skip these keys
67
+ if ".alpha" in key or key in visited:
68
+ continue
69
+
70
+ if "text" in key:
71
+ layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
72
+ curr_layer = pipeline.text_encoder
73
+ else:
74
+ layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
75
+ curr_layer = pipeline.unet
76
+
77
+ # find the target layer
78
+ temp_name = layer_infos.pop(0)
79
+ while len(layer_infos) > -1:
80
+ try:
81
+ curr_layer = curr_layer.__getattr__(temp_name)
82
+ if len(layer_infos) > 0:
83
+ temp_name = layer_infos.pop(0)
84
+ elif len(layer_infos) == 0:
85
+ break
86
+ except Exception:
87
+ if len(temp_name) > 0:
88
+ temp_name += "_" + layer_infos.pop(0)
89
+ else:
90
+ temp_name = layer_infos.pop(0)
91
+
92
+ pair_keys = []
93
+ if "lora_down" in key:
94
+ pair_keys.append(key.replace("lora_down", "lora_up"))
95
+ pair_keys.append(key)
96
+ else:
97
+ pair_keys.append(key)
98
+ pair_keys.append(key.replace("lora_up", "lora_down"))
99
+
100
+ # update weight
101
+ if len(state_dict[pair_keys[0]].shape) == 4:
102
+ weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
103
+ weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
104
+ curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device)
105
+ else:
106
+ weight_up = state_dict[pair_keys[0]].to(torch.float32)
107
+ weight_down = state_dict[pair_keys[1]].to(torch.float32)
108
+ curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)
109
+
110
+ # update visited list
111
+ for item in pair_keys:
112
+ visited.append(item)
113
+
114
+ return pipeline
115
+
116
+
117
+ if __name__ == "__main__":
118
+ parser = argparse.ArgumentParser()
119
+
120
+ parser.add_argument(
121
+ "--base_model_path", default=None, type=str, required=True, help="Path to the base model in diffusers format."
122
+ )
123
+ parser.add_argument(
124
+ "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
125
+ )
126
+ parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
127
+ parser.add_argument(
128
+ "--lora_prefix_unet", default="lora_unet", type=str, help="The prefix of UNet weight in safetensors"
129
+ )
130
+ parser.add_argument(
131
+ "--lora_prefix_text_encoder",
132
+ default="lora_te",
133
+ type=str,
134
+ help="The prefix of text encoder weight in safetensors",
135
+ )
136
+ parser.add_argument("--alpha", default=0.75, type=float, help="The merging ratio in W = W0 + alpha * deltaW")
137
+ parser.add_argument(
138
+ "--to_safetensors", action="store_true", help="Whether to store pipeline in safetensors format or not."
139
+ )
140
+ parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
141
+
142
+ args = parser.parse_args()
143
+
144
+ base_model_path = args.base_model_path
145
+ checkpoint_path = args.checkpoint_path
146
+ dump_path = args.dump_path
147
+ lora_prefix_unet = args.lora_prefix_unet
148
+ lora_prefix_text_encoder = args.lora_prefix_text_encoder
149
+ alpha = args.alpha
150
+
151
+ pipe = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32)
+ lora_state_dict = load_file(checkpoint_path)
+ pipe = convert_lora(pipe, lora_state_dict, lora_prefix_unet, lora_prefix_text_encoder, alpha)
152
+
153
+ pipe = pipe.to(args.device)
154
+ pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
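A minimal sketch of calling `convert_lora` directly instead of going through the CLI above; the model id and the safetensors path are placeholders:

    import torch
    from safetensors.torch import load_file
    from diffusers import StableDiffusionPipeline

    pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)
    lora_state_dict = load_file("loras/example_style.safetensors")  # placeholder path

    # merges W = W0 + alpha * (up @ down) into the UNet / text-encoder weights in place
    pipe = convert_lora(pipe, lora_state_dict, alpha=0.75)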
utils/diffuser_utils.py ADDED
@@ -0,0 +1,275 @@
1
+ """
2
+ Utility functions built on the diffusers framework.
3
+ """
4
+
5
+
6
+ import os
7
+ import torch
8
+ import cv2
9
+ import numpy as np
10
+
11
+ import torch.nn.functional as F
12
+ from tqdm import tqdm
13
+ from PIL import Image
14
+ from torchvision.utils import save_image
15
+ from torchvision.io import read_image
16
+
17
+ from diffusers import StableDiffusionPipeline
18
+
19
+ from pytorch_lightning import seed_everything
20
+
21
+
22
+ class MasaCtrlPipeline(StableDiffusionPipeline):
23
+
24
+ def next_step(
25
+ self,
26
+ model_output: torch.FloatTensor,
27
+ timestep: int,
28
+ x: torch.FloatTensor,
29
+ eta=0.,
30
+ verbose=False
31
+ ):
32
+ """
33
+ Inverse sampling for DDIM Inversion
34
+ """
35
+ if verbose:
36
+ print("timestep: ", timestep)
37
+ next_step = timestep
38
+ timestep = min(timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps, 999)
39
+ alpha_prod_t = self.scheduler.alphas_cumprod[timestep] if timestep >= 0 else self.scheduler.final_alpha_cumprod
40
+ alpha_prod_t_next = self.scheduler.alphas_cumprod[next_step]
41
+ beta_prod_t = 1 - alpha_prod_t
42
+ pred_x0 = (x - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
43
+ pred_dir = (1 - alpha_prod_t_next)**0.5 * model_output
44
+ x_next = alpha_prod_t_next**0.5 * pred_x0 + pred_dir
45
+ return x_next, pred_x0
46
+
47
+ def step(
48
+ self,
49
+ model_output: torch.FloatTensor,
50
+ timestep: int,
51
+ x: torch.FloatTensor,
52
+ eta: float=0.0,
53
+ verbose=False,
54
+ ):
55
+ """
56
+ Predict the sample at the next step in the denoising process.
57
+ """
58
+ prev_timestep = timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
59
+ alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
60
+ alpha_prod_t_prev = self.scheduler.alphas_cumprod[prev_timestep] if prev_timestep > 0 else self.scheduler.final_alpha_cumprod
61
+ beta_prod_t = 1 - alpha_prod_t
62
+ pred_x0 = (x - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
63
+ pred_dir = (1 - alpha_prod_t_prev)**0.5 * model_output
64
+ x_prev = alpha_prod_t_prev**0.5 * pred_x0 + pred_dir
65
+ return x_prev, pred_x0
66
+
67
+ @torch.no_grad()
68
+ def image2latent(self, image):
69
+ DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
70
+ if isinstance(image, Image.Image):
71
+ image = np.array(image)
72
+ image = torch.from_numpy(image).float() / 127.5 - 1
73
+ image = image.permute(2, 0, 1).unsqueeze(0).to(DEVICE)
74
+ # input image value range [-1, 1]
75
+ latents = self.vae.encode(image)['latent_dist'].mean
76
+ latents = latents * 0.18215
77
+ return latents
78
+
79
+ @torch.no_grad()
80
+ def latent2image(self, latents, return_type='np'):
81
+ latents = 1 / 0.18215 * latents.detach()
82
+ image = self.vae.decode(latents)['sample']
83
+ if return_type == 'np':
84
+ image = (image / 2 + 0.5).clamp(0, 1)
85
+ image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
86
+ image = (image * 255).astype(np.uint8)
87
+ elif return_type == "pt":
88
+ image = (image / 2 + 0.5).clamp(0, 1)
89
+
90
+ return image
91
+
92
+ def latent2image_grad(self, latents):
93
+ latents = 1 / 0.18215 * latents
94
+ image = self.vae.decode(latents)['sample']
95
+
96
+ return image # range [-1, 1]
97
+
98
+ @torch.no_grad()
99
+ def __call__(
100
+ self,
101
+ prompt,
102
+ batch_size=1,
103
+ height=512,
104
+ width=512,
105
+ num_inference_steps=50,
106
+ guidance_scale=7.5,
107
+ eta=0.0,
108
+ latents=None,
109
+ unconditioning=None,
110
+ neg_prompt=None,
111
+ ref_intermediate_latents=None,
112
+ return_intermediates=False,
113
+ **kwds):
114
+ DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
115
+ if isinstance(prompt, list):
116
+ batch_size = len(prompt)
117
+ elif isinstance(prompt, str):
118
+ if batch_size > 1:
119
+ prompt = [prompt] * batch_size
120
+
121
+ # text embeddings
122
+ text_input = self.tokenizer(
123
+ prompt,
124
+ padding="max_length",
125
+ max_length=77,
126
+ return_tensors="pt"
127
+ )
128
+
129
+ text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
130
+ print("input text embeddings :", text_embeddings.shape)
131
+ if kwds.get("dir"):
132
+ dir = text_embeddings[-2] - text_embeddings[-1]
133
+ u, s, v = torch.pca_lowrank(dir.transpose(-1, -2), q=1, center=True)
134
+ text_embeddings[-1] = text_embeddings[-1] + kwds.get("dir") * v
135
+ print(u.shape)
136
+ print(v.shape)
137
+
138
+ # define initial latents
139
+ latents_shape = (batch_size, self.unet.config.in_channels, height//8, width//8)
140
+ if latents is None:
141
+ latents = torch.randn(latents_shape, device=DEVICE)
142
+ else:
143
+ assert latents.shape == latents_shape, f"The shape of the input latent tensor {latents.shape} should match the expected shape {latents_shape}."
144
+
145
+ # unconditional embedding for classifier free guidance
146
+ if guidance_scale > 1.:
147
+ max_length = text_input.input_ids.shape[-1]
148
+ if neg_prompt:
149
+ uc_text = neg_prompt
150
+ else:
151
+ uc_text = ""
152
+ # uc_text = "ugly, tiling, poorly drawn hands, poorly drawn feet, body out of frame, cut off, low contrast, underexposed, distorted face"
153
+ unconditional_input = self.tokenizer(
154
+ [uc_text] * batch_size,
155
+ padding="max_length",
156
+ max_length=77,
157
+ return_tensors="pt"
158
+ )
159
+ # unconditional_input.input_ids = unconditional_input.input_ids[:, 1:]
160
+ unconditional_embeddings = self.text_encoder(unconditional_input.input_ids.to(DEVICE))[0]
161
+ text_embeddings = torch.cat([unconditional_embeddings, text_embeddings], dim=0)
162
+
163
+ print("latents shape: ", latents.shape)
164
+ # iterative sampling
165
+ self.scheduler.set_timesteps(num_inference_steps)
166
+ # print("Valid timesteps: ", reversed(self.scheduler.timesteps))
167
+ latents_list = [latents]
168
+ pred_x0_list = [latents]
169
+ for i, t in enumerate(tqdm(self.scheduler.timesteps, desc="DDIM Sampler")):
170
+ if ref_intermediate_latents is not None:
171
+ # note that the batch_size >= 2
172
+ latents_ref = ref_intermediate_latents[-1 - i]
173
+ _, latents_cur = latents.chunk(2)
174
+ latents = torch.cat([latents_ref, latents_cur])
175
+
176
+ if guidance_scale > 1.:
177
+ model_inputs = torch.cat([latents] * 2)
178
+ else:
179
+ model_inputs = latents
180
+ if unconditioning is not None and isinstance(unconditioning, list):
181
+ _, text_embeddings = text_embeddings.chunk(2)
182
+ text_embeddings = torch.cat([unconditioning[i].expand(*text_embeddings.shape), text_embeddings])
183
+ # predict the noise
184
+ noise_pred = self.unet(model_inputs, t, encoder_hidden_states=text_embeddings).sample
185
+ if guidance_scale > 1.:
186
+ noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
187
+ noise_pred = noise_pred_uncon + guidance_scale * (noise_pred_con - noise_pred_uncon)
188
+ # compute the previous noise sample x_t -> x_t-1
189
+ latents, pred_x0 = self.step(noise_pred, t, latents)
190
+ latents_list.append(latents)
191
+ pred_x0_list.append(pred_x0)
192
+
193
+ image = self.latent2image(latents, return_type="pt")
194
+ if return_intermediates:
195
+ pred_x0_list = [self.latent2image(img, return_type="pt") for img in pred_x0_list]
196
+ latents_list = [self.latent2image(img, return_type="pt") for img in latents_list]
197
+ return image, pred_x0_list, latents_list
198
+ return image
199
+
200
+ @torch.no_grad()
201
+ def invert(
202
+ self,
203
+ image: torch.Tensor,
204
+ prompt,
205
+ num_inference_steps=50,
206
+ guidance_scale=7.5,
207
+ eta=0.0,
208
+ return_intermediates=False,
209
+ **kwds):
210
+ """
211
+ Invert a real image into a noise map with deterministic DDIM inversion.
212
+ """
213
+ DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
214
+ batch_size = image.shape[0]
215
+ if isinstance(prompt, list):
216
+ if batch_size == 1:
217
+ image = image.expand(len(prompt), -1, -1, -1)
218
+ elif isinstance(prompt, str):
219
+ if batch_size > 1:
220
+ prompt = [prompt] * batch_size
221
+
222
+ # text embeddings
223
+ text_input = self.tokenizer(
224
+ prompt,
225
+ padding="max_length",
226
+ max_length=77,
227
+ return_tensors="pt"
228
+ )
229
+ text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
230
+ print("input text embeddings :", text_embeddings.shape)
231
+ # define initial latents
232
+ latents = self.image2latent(image)
233
+ start_latents = latents
234
+ # print(latents)
235
+ # exit()
236
+ # unconditional embedding for classifier free guidance
237
+ if guidance_scale > 1.:
238
+ max_length = text_input.input_ids.shape[-1]
239
+ unconditional_input = self.tokenizer(
240
+ [""] * batch_size,
241
+ padding="max_length",
242
+ max_length=77,
243
+ return_tensors="pt"
244
+ )
245
+ unconditional_embeddings = self.text_encoder(unconditional_input.input_ids.to(DEVICE))[0]
246
+ text_embeddings = torch.cat([unconditional_embeddings, text_embeddings], dim=0)
247
+
248
+ print("latents shape: ", latents.shape)
249
+ # iterative sampling
250
+ self.scheduler.set_timesteps(num_inference_steps)
251
+ print("Valid timesteps: ", reversed(self.scheduler.timesteps))
252
+ # print("attributes: ", self.scheduler.__dict__)
253
+ latents_list = [latents]
254
+ pred_x0_list = [latents]
255
+ for i, t in enumerate(tqdm(reversed(self.scheduler.timesteps), desc="DDIM Inversion")):
256
+ if guidance_scale > 1.:
257
+ model_inputs = torch.cat([latents] * 2)
258
+ else:
259
+ model_inputs = latents
260
+
261
+ # predict the noise
262
+ noise_pred = self.unet(model_inputs, t, encoder_hidden_states=text_embeddings).sample
263
+ if guidance_scale > 1.:
264
+ noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
265
+ noise_pred = noise_pred_uncon + guidance_scale * (noise_pred_con - noise_pred_uncon)
266
+ # compute the previous noise sample x_t-1 -> x_t
267
+ latents, pred_x0 = self.next_step(noise_pred, t, latents)
268
+ latents_list.append(latents)
269
+ pred_x0_list.append(pred_x0)
270
+
271
+ if return_intermediates:
272
+ # return the intermediate latents during inversion
273
+ # pred_x0_list = [self.latent2image(img, return_type="pt") for img in pred_x0_list]
274
+ return latents, latents_list
275
+ return latents, start_latents
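A minimal sketch of the intended round trip with `MasaCtrlPipeline`: DDIM-invert a real image into a start code, then sample from it again; the model id, prompt, and image path are placeholders, and a DDIM scheduler with the usual SD 1.x settings is assumed:

    import torch.nn.functional as F
    from torchvision.io import read_image
    from diffusers import DDIMScheduler

    scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
                              clip_sample=False, set_alpha_to_one=False)
    pipe = MasaCtrlPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", scheduler=scheduler).to("cuda")

    # load a real image and normalize it to a [1, 3, 512, 512] tensor in [-1, 1]
    image = read_image("examples/source.png").float() / 127.5 - 1   # placeholder path
    image = F.interpolate(image.unsqueeze(0), (512, 512)).to("cuda")

    prompt = "a photo of a cat"                                      # placeholder prompt
    start_code, inv_latents = pipe.invert(image, prompt, num_inference_steps=50, return_intermediates=True)
    result = pipe(prompt, latents=start_code, num_inference_steps=50, guidance_scale=7.5)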
utils/free_lunch_utils.py ADDED
@@ -0,0 +1,334 @@
1
+ import torch
2
+ import torch.fft as fft
3
+ from diffusers.models.unet_2d_condition import logger
4
+ from diffusers.utils import is_torch_version
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
+ import torch.nn.functional as F
7
+
8
+ def isinstance_str(x: object, cls_name: str):
9
+ """
10
+ Checks whether x has any class *named* cls_name in its ancestry.
11
+ Doesn't require access to the class's implementation.
12
+
13
+ Useful for patching!
14
+ """
15
+
16
+ for _cls in x.__class__.__mro__:
17
+ if _cls.__name__ == cls_name:
18
+ return True
19
+
20
+ return False
21
+
22
+
23
+ def Fourier_filter(x, threshold, scale):
24
+ dtype = x.dtype
25
+ x = x.type(torch.float32)
26
+ # FFT
27
+ x_freq = fft.fftn(x, dim=(-2, -1))
28
+ x_freq = fft.fftshift(x_freq, dim=(-2, -1))
29
+
30
+ B, C, H, W = x_freq.shape
31
+ mask = torch.ones((B, C, H, W), device=x_freq.device)  # keep the mask on the same device as the input
32
+
33
+ crow, ccol = H // 2, W //2
34
+ mask[..., crow - threshold:crow + threshold, ccol - threshold:ccol + threshold] = scale
35
+ x_freq = x_freq * mask
36
+
37
+ # IFFT
38
+ x_freq = fft.ifftshift(x_freq, dim=(-2, -1))
39
+ x_filtered = fft.ifftn(x_freq, dim=(-2, -1)).real
40
+
41
+ x_filtered = x_filtered.type(dtype)
42
+ return x_filtered
43
+
44
+
45
+ def register_upblock2d(model):
46
+ def up_forward(self):
47
+ def forward(hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, scale=None):
48
+ for resnet in self.resnets:
49
+ # pop res hidden states
50
+ res_hidden_states = res_hidden_states_tuple[-1]
51
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
52
+ #print(f"in upblock2d, hidden states shape: {hidden_states.shape}")
53
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
54
+
55
+ if self.training and self.gradient_checkpointing:
56
+
57
+ def create_custom_forward(module):
58
+ def custom_forward(*inputs):
59
+ return module(*inputs)
60
+
61
+ return custom_forward
62
+
63
+ if is_torch_version(">=", "1.11.0"):
64
+ hidden_states = torch.utils.checkpoint.checkpoint(
65
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
66
+ )
67
+ else:
68
+ hidden_states = torch.utils.checkpoint.checkpoint(
69
+ create_custom_forward(resnet), hidden_states, temb
70
+ )
71
+ else:
72
+ hidden_states = resnet(hidden_states, temb)
73
+
74
+ if self.upsamplers is not None:
75
+ for upsampler in self.upsamplers:
76
+ hidden_states = upsampler(hidden_states, upsample_size)
77
+
78
+ return hidden_states
79
+
80
+ return forward
81
+
82
+ for i, upsample_block in enumerate(model.unet.up_blocks):
83
+ if isinstance_str(upsample_block, "UpBlock2D"):
84
+ upsample_block.forward = up_forward(upsample_block)
85
+
86
+
87
+ def register_free_upblock2d(model, b1=1.2, b2=1.4, s1=0.9, s2=0.2,source_mask=None):
88
+ def up_forward(self):
89
+ def forward(hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, scale=None):
90
+ for resnet in self.resnets:
91
+ # pop res hidden states
92
+ res_hidden_states = res_hidden_states_tuple[-1]
93
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
94
+ #print(f"in free upblock2d, hidden states shape: {hidden_states.shape}")
95
+
96
+ if self.source_mask is not None:
97
+ spatial_mask_source = F.interpolate(self.source_mask, (hidden_states.shape[2], hidden_states.shape[3]))
98
+ spatial_mask_source_b1 = spatial_mask_source * self.b1 + (1 - spatial_mask_source)
99
+ spatial_mask_source_b2 = spatial_mask_source * self.b2 + (1 - spatial_mask_source)
100
+ # --------------- FreeU code -----------------------
101
+ # Only operate on the first two stages
102
+ if hidden_states.shape[1] == 1280:
103
+ if self.source_mask is not None:
104
+ # where mask == 0, leave the hidden states unchanged
105
+ hidden_states[:,:640] = hidden_states[:,:640] * spatial_mask_source_b1
106
+
107
+ else:
108
+ hidden_states[:,:640] = hidden_states[:,:640] * self.b1
109
+ res_hidden_states = Fourier_filter(res_hidden_states, threshold=1, scale=self.s1)
110
+ if hidden_states.shape[1] == 640:
111
+
112
+ if self.source_mask is not None:
113
+ hidden_states[:,:320] = hidden_states[:,:320] * spatial_mask_source_b2
114
+ else:
115
+ hidden_states[:,:320] = hidden_states[:,:320] * self.b2
116
+ res_hidden_states = Fourier_filter(res_hidden_states, threshold=1, scale=self.s2)
117
+ # ---------------------------------------------------------
118
+
119
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
120
+
121
+ if self.training and self.gradient_checkpointing:
122
+
123
+ def create_custom_forward(module):
124
+ def custom_forward(*inputs):
125
+ return module(*inputs)
126
+
127
+ return custom_forward
128
+
129
+ if is_torch_version(">=", "1.11.0"):
130
+ hidden_states = torch.utils.checkpoint.checkpoint(
131
+ create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
132
+ )
133
+ else:
134
+ hidden_states = torch.utils.checkpoint.checkpoint(
135
+ create_custom_forward(resnet), hidden_states, temb
136
+ )
137
+ else:
138
+ hidden_states = resnet(hidden_states, temb)
139
+
140
+ if self.upsamplers is not None:
141
+ for upsampler in self.upsamplers:
142
+ hidden_states = upsampler(hidden_states, upsample_size)
143
+
144
+ return hidden_states
145
+
146
+ return forward
147
+
148
+ for i, upsample_block in enumerate(model.unet.up_blocks):
149
+ if isinstance_str(upsample_block, "UpBlock2D"):
150
+ upsample_block.forward = up_forward(upsample_block)
151
+ setattr(upsample_block, 'b1', b1)
152
+ setattr(upsample_block, 'b2', b2)
153
+ setattr(upsample_block, 's1', s1)
154
+ setattr(upsample_block, 's2', s2)
155
+ setattr(upsample_block, 'source_mask', source_mask)
156
+
157
+ def register_crossattn_upblock2d(model):
158
+ def up_forward(self):
159
+ def forward(
160
+ hidden_states: torch.FloatTensor,
161
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
162
+ temb: Optional[torch.FloatTensor] = None,
163
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
164
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
165
+ upsample_size: Optional[int] = None,
166
+ attention_mask: Optional[torch.FloatTensor] = None,
167
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
168
+ ):
169
+ for resnet, attn in zip(self.resnets, self.attentions):
170
+ # pop res hidden states
171
+ #print(f"in crossatten upblock2d, hidden states shape: {hidden_states.shape}")
172
+ res_hidden_states = res_hidden_states_tuple[-1]
173
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
174
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
175
+
176
+ if self.training and self.gradient_checkpointing:
177
+
178
+ def create_custom_forward(module, return_dict=None):
179
+ def custom_forward(*inputs):
180
+ if return_dict is not None:
181
+ return module(*inputs, return_dict=return_dict)
182
+ else:
183
+ return module(*inputs)
184
+
185
+ return custom_forward
186
+
187
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
188
+ hidden_states = torch.utils.checkpoint.checkpoint(
189
+ create_custom_forward(resnet),
190
+ hidden_states,
191
+ temb,
192
+ **ckpt_kwargs,
193
+ )
194
+ hidden_states = torch.utils.checkpoint.checkpoint(
195
+ create_custom_forward(attn, return_dict=False),
196
+ hidden_states,
197
+ encoder_hidden_states,
198
+ None, # timestep
199
+ None, # class_labels
200
+ cross_attention_kwargs,
201
+ attention_mask,
202
+ encoder_attention_mask,
203
+ **ckpt_kwargs,
204
+ )[0]
205
+ else:
206
+ hidden_states = resnet(hidden_states, temb)
207
+ hidden_states = attn(
208
+ hidden_states,
209
+ encoder_hidden_states=encoder_hidden_states,
210
+ cross_attention_kwargs=cross_attention_kwargs,
211
+ attention_mask=attention_mask,
212
+ encoder_attention_mask=encoder_attention_mask,
213
+ return_dict=False,
214
+ )[0]
215
+
216
+ if self.upsamplers is not None:
217
+ for upsampler in self.upsamplers:
218
+ hidden_states = upsampler(hidden_states, upsample_size)
219
+
220
+ return hidden_states
221
+
222
+ return forward
223
+
224
+ for i, upsample_block in enumerate(model.unet.up_blocks):
225
+ if isinstance_str(upsample_block, "CrossAttnUpBlock2D"):
226
+ upsample_block.forward = up_forward(upsample_block)
227
+
228
+
229
+ def register_free_crossattn_upblock2d(model, b1=1.2, b2=1.4, s1=0.9, s2=0.2,source_mask=None):
230
+ def up_forward(self):
231
+ def forward(
232
+ hidden_states: torch.FloatTensor,
233
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
234
+ temb: Optional[torch.FloatTensor] = None,
235
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
236
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
237
+ upsample_size: Optional[int] = None,
238
+ attention_mask: Optional[torch.FloatTensor] = None,
239
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
240
+ ):
241
+
242
+ if self.source_mask is not None:
243
+
244
+ spatial_mask_source = F.interpolate(self.source_mask, (hidden_states.shape[2], hidden_states.shape[3]))
245
+ spatial_mask_source_b1 = spatial_mask_source * self.b1 + (1 - spatial_mask_source)
246
+ spatial_mask_source_b2 = spatial_mask_source * self.b2 + (1 - spatial_mask_source)
247
+ # print(f"source mask is not none, {spatial_mask_source_b1.shape} with min {spatial_mask_source_b1.min()}", )
248
+
249
+ for resnet, attn in zip(self.resnets, self.attentions):
250
+ # pop res hidden states
251
+ #print(f"in free crossatten upblock2d, hidden states shape: {hidden_states.shape}")
252
+ res_hidden_states = res_hidden_states_tuple[-1]
253
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
254
+
255
+ # --------------- FreeU code -----------------------
256
+ # Only operate on the first two stages
257
+ if hidden_states.shape[1] == 1280:
258
+ if self.source_mask is not None:
259
+ # where mask == 0, leave the hidden states unchanged
260
+ hidden_states[:,:640] = hidden_states[:,:640] * spatial_mask_source_b1
261
+
262
+ else:
263
+ hidden_states[:,:640] = hidden_states[:,:640] * self.b1
264
+ res_hidden_states = Fourier_filter(res_hidden_states, threshold=1, scale=self.s1)
265
+ if hidden_states.shape[1] == 640:
266
+ if self.source_mask is not None:
267
+ hidden_states[:,:320] = hidden_states[:,:320] * spatial_mask_source_b2
268
+ else:
269
+ hidden_states[:,:320] = hidden_states[:,:320] * self.b2
270
+ res_hidden_states = Fourier_filter(res_hidden_states, threshold=1, scale=self.s2)
271
+ # ---------------------------------------------------------
272
+
273
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
274
+
275
+ if self.training and self.gradient_checkpointing:
276
+
277
+ def create_custom_forward(module, return_dict=None):
278
+ def custom_forward(*inputs):
279
+ if return_dict is not None:
280
+ return module(*inputs, return_dict=return_dict)
281
+ else:
282
+ return module(*inputs)
283
+
284
+ return custom_forward
285
+
286
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
287
+ hidden_states = torch.utils.checkpoint.checkpoint(
288
+ create_custom_forward(resnet),
289
+ hidden_states,
290
+ temb,
291
+ **ckpt_kwargs,
292
+ )
293
+ hidden_states = torch.utils.checkpoint.checkpoint(
294
+ create_custom_forward(attn, return_dict=False),
295
+ hidden_states,
296
+ encoder_hidden_states,
297
+ None, # timestep
298
+ None, # class_labels
299
+ cross_attention_kwargs,
300
+ attention_mask,
301
+ encoder_attention_mask,
302
+ **ckpt_kwargs,
303
+ )[0]
304
+ else:
305
+ hidden_states = resnet(hidden_states, temb)
306
+ # hidden_states = attn(
307
+ # hidden_states,
308
+ # encoder_hidden_states=encoder_hidden_states,
309
+ # cross_attention_kwargs=cross_attention_kwargs,
310
+ # encoder_attention_mask=encoder_attention_mask,
311
+ # return_dict=False,
312
+ # )[0]
313
+ hidden_states = attn(
314
+ hidden_states,
315
+ encoder_hidden_states=encoder_hidden_states,
316
+ cross_attention_kwargs=cross_attention_kwargs,
317
+ )[0]
318
+
319
+ if self.upsamplers is not None:
320
+ for upsampler in self.upsamplers:
321
+ hidden_states = upsampler(hidden_states, upsample_size)
322
+
323
+ return hidden_states
324
+
325
+ return forward
326
+
327
+ for i, upsample_block in enumerate(model.unet.up_blocks):
328
+ if isinstance_str(upsample_block, "CrossAttnUpBlock2D"):
329
+ upsample_block.forward = up_forward(upsample_block)
330
+ setattr(upsample_block, 'b1', b1)
331
+ setattr(upsample_block, 'b2', b2)
332
+ setattr(upsample_block, 's1', s1)
333
+ setattr(upsample_block, 's2', s2)
334
+ setattr(upsample_block, 'source_mask', source_mask)
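A minimal sketch of enabling the FreeU-style scaling through the two register helpers above; the model id is a placeholder and the b1/b2/s1/s2 values are the defaults used in this file:

    from diffusers import StableDiffusionPipeline

    pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("cuda")

    # patch both the plain and the cross-attention up blocks of pipe.unet
    register_free_upblock2d(pipe, b1=1.2, b2=1.4, s1=0.9, s2=0.2)
    register_free_crossattn_upblock2d(pipe, b1=1.2, b2=1.4, s1=0.9, s2=0.2)

    image = pipe("a photo of an astronaut riding a horse").images[0]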
utils/masactrl_utils.py ADDED
@@ -0,0 +1,212 @@
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from typing import Optional, Union, Tuple, List, Callable, Dict
9
+
10
+ from torchvision.utils import save_image
11
+ from einops import rearrange, repeat
12
+
13
+
14
+ class AttentionBase:
15
+ def __init__(self):
16
+ self.cur_step = 0
17
+ self.num_att_layers = -1
18
+ self.cur_att_layer = 0
19
+
20
+ def after_step(self):
21
+ pass
22
+
23
+ def __call__(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
24
+ out = self.forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
25
+ self.cur_att_layer += 1
26
+ if self.cur_att_layer == self.num_att_layers:
27
+ self.cur_att_layer = 0
28
+ self.cur_step += 1
29
+ # after step
30
+ self.after_step()
31
+ return out
32
+
33
+ def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
34
+ out = torch.einsum('b i j, b j d -> b i d', attn, v)
35
+ out = rearrange(out, '(b h) n d -> b n (h d)', h=num_heads)
36
+ return out
37
+
38
+ def reset(self):
39
+ self.cur_step = 0
40
+ self.cur_att_layer = 0
41
+
42
+
43
+ class AttentionStore(AttentionBase):
44
+ def __init__(self, res=[32], min_step=0, max_step=1000):
45
+ super().__init__()
46
+ self.res = res
47
+ self.min_step = min_step
48
+ self.max_step = max_step
49
+ self.valid_steps = 0
50
+
51
+ self.self_attns = [] # store all the attns
52
+ self.cross_attns = []
53
+
54
+ self.self_attns_step = [] # store the attns in each step
55
+ self.cross_attns_step = []
56
+
57
+ def after_step(self):
58
+ if self.cur_step > self.min_step and self.cur_step < self.max_step:
59
+ self.valid_steps += 1
60
+ if len(self.self_attns) == 0:
61
+ self.self_attns = self.self_attns_step
62
+ self.cross_attns = self.cross_attns_step
63
+ else:
64
+ for i in range(len(self.self_attns)):
65
+ self.self_attns[i] += self.self_attns_step[i]
66
+ self.cross_attns[i] += self.cross_attns_step[i]
67
+ self.self_attns_step.clear()
68
+ self.cross_attns_step.clear()
69
+
70
+ def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
71
+ if attn.shape[1] <= 64 ** 2: # avoid OOM
72
+ if is_cross:
73
+ self.cross_attns_step.append(attn)
74
+ else:
75
+ self.self_attns_step.append(attn)
76
+ return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
77
+
78
+
79
+ def regiter_attention_editor_diffusers(model, editor: AttentionBase):
80
+ """
81
+ Register a attention editor to Diffuser Pipeline, refer from [Prompt-to-Prompt]
82
+ """
83
+ def ca_forward(self, place_in_unet):
84
+ def forward(x, encoder_hidden_states=None, attention_mask=None, context=None, mask=None):
85
+ """
86
+ The attention is similar to the original implementation of LDM CrossAttention class
87
+ except adding some modifications on the attention
88
+ """
89
+ if encoder_hidden_states is not None:
90
+ context = encoder_hidden_states
91
+ if attention_mask is not None:
92
+ mask = attention_mask
93
+
94
+ to_out = self.to_out
95
+ if isinstance(to_out, nn.modules.container.ModuleList):
96
+ to_out = self.to_out[0]
97
+ else:
98
+ to_out = self.to_out
99
+
100
+ h = self.heads
101
+ q = self.to_q(x)
102
+ is_cross = context is not None
103
+ context = context if is_cross else x
104
+ k = self.to_k(context)
105
+ v = self.to_v(context)
106
+ q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
107
+
108
+ sim = torch.einsum('b i d, b j d -> b i j', q, k) * self.scale
109
+
110
+ if mask is not None:
111
+ mask = rearrange(mask, 'b ... -> b (...)')
112
+ max_neg_value = -torch.finfo(sim.dtype).max
113
+ mask = repeat(mask, 'b j -> (b h) () j', h=h)
114
+ mask = mask[:, None, :].repeat(h, 1, 1)
115
+ sim.masked_fill_(~mask, max_neg_value)
116
+
117
+ attn = sim.softmax(dim=-1)
118
+ # the only difference
119
+ out = editor(
120
+ q, k, v, sim, attn, is_cross, place_in_unet,
121
+ self.heads, scale=self.scale)
122
+
123
+ return to_out(out)
124
+
125
+ return forward
126
+
127
+ def register_editor(net, count, place_in_unet):
128
+ for name, subnet in net.named_children():
129
+ if net.__class__.__name__ == 'Attention': # spatial Transformer layer
130
+ net.forward = ca_forward(net, place_in_unet)
131
+ return count + 1
132
+ elif hasattr(net, 'children'):
133
+ count = register_editor(subnet, count, place_in_unet)
134
+ return count
135
+
136
+ cross_att_count = 0
137
+ for net_name, net in model.unet.named_children():
138
+ if "down" in net_name:
139
+ cross_att_count += register_editor(net, 0, "down")
140
+ elif "mid" in net_name:
141
+ cross_att_count += register_editor(net, 0, "mid")
142
+ elif "up" in net_name:
143
+ cross_att_count += register_editor(net, 0, "up")
144
+ editor.num_att_layers = cross_att_count
145
+
146
+
147
+ def regiter_attention_editor_ldm(model, editor: AttentionBase):
148
+ """
149
+ Register a attention editor to Stable Diffusion model, refer from [Prompt-to-Prompt]
150
+ """
151
+ def ca_forward(self, place_in_unet):
152
+ def forward(x, encoder_hidden_states=None, attention_mask=None, context=None, mask=None):
153
+ """
154
+ The attention is similar to the original implementation of LDM CrossAttention class
155
+ except adding some modifications on the attention
156
+ """
157
+ if encoder_hidden_states is not None:
158
+ context = encoder_hidden_states
159
+ if attention_mask is not None:
160
+ mask = attention_mask
161
+
162
+ to_out = self.to_out
163
+ if isinstance(to_out, nn.modules.container.ModuleList):
164
+ to_out = self.to_out[0]
165
+ else:
166
+ to_out = self.to_out
167
+
168
+ h = self.heads
169
+ q = self.to_q(x)
170
+ is_cross = context is not None
171
+ context = context if is_cross else x
172
+ k = self.to_k(context)
173
+ v = self.to_v(context)
174
+ q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
175
+
176
+ sim = torch.einsum('b i d, b j d -> b i j', q, k) * self.scale
177
+
178
+ if mask is not None:
179
+ mask = rearrange(mask, 'b ... -> b (...)')
180
+ max_neg_value = -torch.finfo(sim.dtype).max
181
+ mask = repeat(mask, 'b j -> (b h) () j', h=h)
182
+ mask = mask[:, None, :].repeat(h, 1, 1)
183
+ sim.masked_fill_(~mask, max_neg_value)
184
+
185
+ attn = sim.softmax(dim=-1)
186
+ # the only difference
187
+ out = editor(
188
+ q, k, v, sim, attn, is_cross, place_in_unet,
189
+ self.heads, scale=self.scale)
190
+
191
+ return to_out(out)
192
+
193
+ return forward
194
+
195
+ def register_editor(net, count, place_in_unet):
196
+ for name, subnet in net.named_children():
197
+ if net.__class__.__name__ == 'CrossAttention': # spatial Transformer layer
198
+ net.forward = ca_forward(net, place_in_unet)
199
+ return count + 1
200
+ elif hasattr(net, 'children'):
201
+ count = register_editor(subnet, count, place_in_unet)
202
+ return count
203
+
204
+ cross_att_count = 0
205
+ for net_name, net in model.model.diffusion_model.named_children():
206
+ if "input" in net_name:
207
+ cross_att_count += register_editor(net, 0, "input")
208
+ elif "middle" in net_name:
209
+ cross_att_count += register_editor(net, 0, "middle")
210
+ elif "output" in net_name:
211
+ cross_att_count += register_editor(net, 0, "output")
212
+ editor.num_att_layers = cross_att_count
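For reference, a minimal sketch of how the utilities in this file are wired together — only the AttentionStore and regiter_attention_editor_diffusers names come from the code above (the helper keeps its original "regiter_" spelling); the checkpoint and prompt are placeholders:

# Sketch only: hook an AttentionStore into the attention layers of a diffusers
# pipeline, run one generation, then read back the accumulated maps.
import torch
from diffusers import StableDiffusionPipeline
from utils.masactrl_utils import AttentionStore, regiter_attention_editor_diffusers

model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("cuda")

editor = AttentionStore(res=[32])
regiter_attention_editor_diffusers(model, editor)  # patches model.unet attention forwards in place

with torch.no_grad():
    model("a portrait photo of a person", num_inference_steps=50)

# self_attns / cross_attns hold per-layer maps summed over the stored steps.
print(len(editor.self_attns), len(editor.cross_attns), editor.valid_steps)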
utils/style_attn_control.py ADDED
@@ -0,0 +1,275 @@
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ import sys
+
+ import numpy as np
+
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ import torch.fft as fft
+
+ from einops import rearrange, repeat
+ from torchvision.utils import save_image
+
+ from diffusers.utils import deprecate, logging
+ from diffusers.utils.import_utils import is_xformers_available
+ # from masactrl.masactrl import MutualSelfAttentionControl
+
+ from .masactrl_utils import AttentionBase
+
+ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+ if is_xformers_available():
+     import xformers
+     import xformers.ops
+ else:
+     xformers = None
+
+
+ # AttentionBase is provided by .masactrl_utils (imported above).
+
+
+ class MaskPromptedStyleAttentionControl(AttentionBase):
+     def __init__(self, start_step=4, start_layer=10, style_attn_step=35, layer_idx=None, step_idx=None, total_steps=50, style_guidance=0.1,
+                  only_masked_region=False, guidance=0.0,
+                  style_mask=None, source_mask=None, de_bug=False):
+         """
+         Mask-prompted style attention control (MaskPromptedSAC).
+         Args:
+             start_step: the denoising step at which mutual self-attention control starts
+             start_layer: the U-Net attention layer at which mutual self-attention control starts
+             style_attn_step: the step from which the target branch uses its own query (qt) instead of the content query (qc)
+             layer_idx: list of the layers to apply mutual self-attention control to
+             step_idx: list of the steps to apply mutual self-attention control to
+             total_steps: the total number of denoising steps
+             style_guidance: scale applied to the (stylized - content) attention output
+             only_masked_region: if True, restrict style injection to the masked region only
+             guidance: extra guidance scale, kept as an attribute
+             style_mask: mask of the stylized region in the style image
+             source_mask: mask of the region to stylize in the source image
+             de_bug: drop into pdb at selected points for debugging
+         """
+         super().__init__()
+         self.total_steps = total_steps
+         self.total_layers = 16
+         self.start_step = start_step
+         self.start_layer = start_layer
+         self.layer_idx = layer_idx if layer_idx is not None else list(range(start_layer, self.total_layers))
+         self.step_idx = step_idx if step_idx is not None else list(range(start_step, total_steps))
+         print("using MaskPromptedStyleAttentionControl")
+         print("MaskedSAC at denoising steps: ", self.step_idx)
+         print("MaskedSAC at U-Net layers: ", self.layer_idx)
+
+         self.de_bug = de_bug
+         self.style_guidance = style_guidance
+         self.only_masked_region = only_masked_region
+         self.style_attn_step = style_attn_step
+         self.self_attns = []
+         self.cross_attns = []
+         self.guidance = guidance
+         self.style_mask = style_mask
+         self.source_mask = source_mask
+
+     def after_step(self):
+         self.self_attns = []
+         self.cross_attns = []
+
+     def attn_batch(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, q_mask, k_mask, **kwargs):
+         B = q.shape[0] // num_heads
+         H = W = int(np.sqrt(q.shape[1]))
+         q = rearrange(q, "(b h) n d -> h (b n) d", h=num_heads)
+         k = rearrange(k, "(b h) n d -> h (b n) d", h=num_heads)
+         v = rearrange(v, "(b h) n d -> h (b n) d", h=num_heads)
+
+         sim = torch.einsum("h i d, h j d -> h i j", q, k) * kwargs.get("scale")
+
+         if q_mask is not None:
+             sim = sim.masked_fill(q_mask.unsqueeze(0) == 0, -torch.finfo(sim.dtype).max)
+
+         if k_mask is not None:
+             sim = sim.masked_fill(k_mask.permute(1, 0).unsqueeze(0) == 0, -torch.finfo(sim.dtype).max)
+
+         attn = sim.softmax(-1) if attn is None else attn
+
+         if len(attn) == 2 * len(v):
+             v = torch.cat([v] * 2)
+         out = torch.einsum("h i j, h j d -> h i d", attn, v)
+         out = rearrange(out, "(h1 h) (b n) d -> (h1 b) n (h d)", b=B, h=num_heads)
+         return out
+
+     def attn_batch_fg_bg(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, q_mask, k_mask, **kwargs):
+         B = q.shape[0] // num_heads
+         H = W = int(np.sqrt(q.shape[1]))
+         q = rearrange(q, "(b h) n d -> h (b n) d", h=num_heads)
+         k = rearrange(k, "(b h) n d -> h (b n) d", h=num_heads)
+         v = rearrange(v, "(b h) n d -> h (b n) d", h=num_heads)
+         sim = torch.einsum("h i d, h j d -> h i j", q, k) * kwargs.get("scale")
+         # build separate foreground / background similarity maps from the masks
+         if q_mask is not None:
+             sim_fg = sim.masked_fill(q_mask.unsqueeze(0) == 0, -torch.finfo(sim.dtype).max)
+             sim_bg = sim.masked_fill(q_mask.unsqueeze(0) == 1, -torch.finfo(sim.dtype).max)
+         if k_mask is not None:
+             sim_fg = sim.masked_fill(k_mask.permute(1, 0).unsqueeze(0) == 0, -torch.finfo(sim.dtype).max)
+             sim_bg = sim.masked_fill(k_mask.permute(1, 0).unsqueeze(0) == 1, -torch.finfo(sim.dtype).max)
+         sim = torch.cat([sim_fg, sim_bg])
+         attn = sim.softmax(-1)
+
+         if len(attn) == 2 * len(v):
+             v = torch.cat([v] * 2)
+         out = torch.einsum("h i j, h j d -> h i d", attn, v)
+         out = rearrange(out, "(h1 h) (b n) d -> (h1 b) n (h d)", b=B, h=num_heads)
+         return out
+
+     def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+         """
+         Attention forward function; q/k/v hold the style, content and target branches
+         concatenated along the batch dimension.
+         """
+         if is_cross or self.cur_step not in self.step_idx or self.cur_att_layer // 2 not in self.layer_idx:
+             return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
+
+         B = q.shape[0] // num_heads // 2
+         H = W = int(np.sqrt(q.shape[1]))
+
+         if self.style_mask is not None and self.source_mask is not None:
+             # mask = self.aggregate_cross_attn_map(idx=self.cur_token_idx)  # (4, H, W)
+             height, width = self.style_mask.shape[-2:]
+             mask_style = self.style_mask    # (H, W)
+             mask_source = self.source_mask  # (H, W)
+             scale = int(np.sqrt(height * width / q.shape[1]))
+             # res = int(np.sqrt(q.shape[1]))
+             spatial_mask_source = F.interpolate(mask_source, (height // scale, width // scale)).reshape(-1, 1)
+             spatial_mask_style = F.interpolate(mask_style, (height // scale, width // scale)).reshape(-1, 1)
+         else:
+             spatial_mask_source = None
+             spatial_mask_style = None
+
+         if spatial_mask_style is None or spatial_mask_source is None:
+             out_s, out_c, out_t = self.style_attn_ctrl(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, spatial_mask_source, spatial_mask_style, **kwargs)
+         else:
+             if self.only_masked_region:
+                 out_s, out_c, out_t = self.mask_prompted_style_attn_ctrl(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, spatial_mask_source, spatial_mask_style, **kwargs)
+             else:
+                 out_s, out_c, out_t = self.separate_mask_prompted_style_attn_ctrl(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, spatial_mask_source, spatial_mask_style, **kwargs)
+
+         out = torch.cat([out_s, out_c, out_t], dim=0)
+         return out
+
+     def style_attn_ctrl(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, spatial_mask_source, spatial_mask_style, **kwargs):
+         if self.de_bug:
+             import pdb; pdb.set_trace()
+
+         qs, qc, qt = q.chunk(3)  # style, content and target queries
+
+         out_s = self.attn_batch(qs, k[:num_heads], v[:num_heads], sim[:num_heads], attn[:num_heads], is_cross, place_in_unet, num_heads, q_mask=None, k_mask=None, **kwargs)
+         out_c = self.attn_batch(qc, k[:num_heads], v[:num_heads], sim[:num_heads], None, is_cross, place_in_unet, num_heads, q_mask=None, k_mask=None, **kwargs)
+
+         if self.cur_step < self.style_attn_step:
+             out_t = self.attn_batch(qc, k[:num_heads], v[:num_heads], sim[:num_heads], None, is_cross, place_in_unet, num_heads, q_mask=None, k_mask=None, **kwargs)
+         else:
+             out_t = self.attn_batch(qt, k[:num_heads], v[:num_heads], sim[:num_heads], None, is_cross, place_in_unet, num_heads, q_mask=None, k_mask=None, **kwargs)
+             if self.style_guidance >= 0:
+                 out_t = out_c + (out_t - out_c) * self.style_guidance
+         return out_s, out_c, out_t
+
+     def mask_prompted_style_attn_ctrl(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, spatial_mask_source, spatial_mask_style, **kwargs):
+         qs, qc, qt = q.chunk(3)
+
+         out_s = self.attn_batch(qs, k[:num_heads], v[:num_heads], sim[:num_heads], attn[:num_heads], is_cross, place_in_unet, num_heads, q_mask=None, k_mask=None, **kwargs)
+         out_c = self.attn_batch(qc, k[num_heads: 2*num_heads], v[num_heads: 2*num_heads], sim[num_heads: 2*num_heads], attn[num_heads: 2*num_heads], is_cross, place_in_unet, num_heads, q_mask=None, k_mask=None, **kwargs)
+         out_c_new = self.attn_batch(qc, k[num_heads: 2*num_heads], v[num_heads: 2*num_heads], sim[num_heads: 2*num_heads], None, is_cross, place_in_unet, num_heads, q_mask=None, k_mask=None, **kwargs)
+
+         if self.de_bug:
+             import pdb; pdb.set_trace()
+
+         if self.cur_step < self.style_attn_step:
+             out_t = out_c  # self.attn_batch(qc, k[:num_heads], v[:num_heads], sim[:num_heads], attn, is_cross, place_in_unet, num_heads, q_mask=spatial_mask_source, k_mask=spatial_mask_style, **kwargs)
+         else:
+             out_t_fg = self.attn_batch(qt, k[:num_heads], v[:num_heads], sim[:num_heads], None, is_cross, place_in_unet, num_heads, q_mask=spatial_mask_source, k_mask=spatial_mask_style, **kwargs)
+             out_c_fg = self.attn_batch(qc, k[:num_heads], v[:num_heads], sim[:num_heads], None, is_cross, place_in_unet, num_heads, q_mask=spatial_mask_source, k_mask=spatial_mask_style, **kwargs)
+             if self.style_guidance >= 0:
+                 out_t = out_c_fg + (out_t_fg - out_c_fg) * self.style_guidance
+
+             # keep the stylized result inside the source mask, content everywhere else
+             out_t = out_t * spatial_mask_source + out_c * (1 - spatial_mask_source)
+
+         if self.de_bug:
+             import pdb; pdb.set_trace()
+
+         # print(torch.sum(out_t * (1 - spatial_mask_source) - out_c * (1 - spatial_mask_source)))
+         return out_s, out_c, out_t
+
+     def separate_mask_prompted_style_attn_ctrl(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, spatial_mask_source, spatial_mask_style, **kwargs):
+         if self.de_bug:
+             import pdb; pdb.set_trace()
+         # To prevent query confusion, render foreground and background separately according to the masks.
+         qs, qc, qt = q.chunk(3)
+         out_s = self.attn_batch(qs, k[:num_heads], v[:num_heads], sim[:num_heads], attn[:num_heads], is_cross, place_in_unet, num_heads, q_mask=None, k_mask=None, **kwargs)
+         if self.cur_step < self.style_attn_step:
+             out_c = self.attn_batch_fg_bg(qc, k[:num_heads], v[:num_heads], sim[:num_heads], attn, is_cross, place_in_unet, num_heads, q_mask=spatial_mask_source, k_mask=spatial_mask_style, **kwargs)
+             out_c_fg, out_c_bg = out_c.chunk(2)
+             out_t = out_c_fg * spatial_mask_source + out_c_bg * (1 - spatial_mask_source)
+         else:
+             out_t = self.attn_batch_fg_bg(qt, k[:num_heads], v[:num_heads], sim[:num_heads], attn, is_cross, place_in_unet, num_heads, q_mask=spatial_mask_source, k_mask=spatial_mask_style, **kwargs)
+             out_c = self.attn_batch_fg_bg(qc, k[:num_heads], v[:num_heads], sim[:num_heads], attn, is_cross, place_in_unet, num_heads, q_mask=spatial_mask_source, k_mask=spatial_mask_style, **kwargs)
+             out_t_fg, out_t_bg = out_t.chunk(2)
+             out_c_fg, out_c_bg = out_c.chunk(2)
+             if self.style_guidance >= 0:
+                 out_t_fg = out_c_fg + (out_t_fg - out_c_fg) * self.style_guidance
+                 out_t_bg = out_c_bg + (out_t_bg - out_c_bg) * self.style_guidance
+             out_t = out_t_fg * spatial_mask_source + out_t_bg * (1 - spatial_mask_source)
+
+         return out_s, out_t, out_t
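Finally, a minimal sketch of how MaskPromptedStyleAttentionControl is attached to a pipeline — the masks, step/layer settings and the generic StableDiffusionPipeline are placeholders; in this Space the controller is expected to be driven by the custom pipeline in pdiff/pdiff_pipeline.py, which would feed the UNet the style, content and target branches concatenated along the batch dimension:

# Sketch only: the mask tensors, sizes and registration target are assumptions.
import torch
from diffusers import StableDiffusionPipeline
from utils.masactrl_utils import regiter_attention_editor_diffusers
from utils.style_attn_control import MaskPromptedStyleAttentionControl

model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# Binary (1, 1, H, W) float masks marking the region to stylize in the style and
# source images; all-ones here as a placeholder.
style_mask = torch.ones(1, 1, 512, 512)
source_mask = torch.ones(1, 1, 512, 512)

controller = MaskPromptedStyleAttentionControl(
    start_step=4, start_layer=10, style_attn_step=35,
    total_steps=50, style_guidance=0.1,
    only_masked_region=False,
    style_mask=style_mask, source_mask=source_mask,
)
# Registration only needs model.unet, so any pipeline exposing a .unet works here.
regiter_attention_editor_diffusers(model, controller)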