import gradio as gr
import torch
from transformers import CLIPTextModelWithProjection, CLIPTokenizer
from diffusers import DiffusionPipeline

from src.priors.prior_transformer import (
    PriorTransformer,
)  # original huggingface prior transformer without time conditioning
from src.pipelines.pipeline_kandinsky_prior import KandinskyPriorPipeline

import spaces

# Prefer the GPU when one is available; fall back to the CPU otherwise.
__DEVICE__ = "cuda" if torch.cuda.is_available() else "cpu"


class Ours:
    def __init__(self, device):
        # Frozen OpenCLIP ViT-bigG/14 text encoder used by the ECLIPSE prior.
        text_encoder = (
            CLIPTextModelWithProjection.from_pretrained(
                "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
                projection_dim=1280,
                torch_dtype=torch.float16,
            )
            .eval()
            .requires_grad_(False)
        )
        tokenizer = CLIPTokenizer.from_pretrained(
            "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
        )

        # The ECLIPSE prior maps text embeddings to CLIP image embeddings.
        prior = PriorTransformer.from_pretrained(
            "ECLIPSE-Community/ECLIPSE_KandinskyV22_Prior",
            torch_dtype=torch.float16,
        )
        self.pipe_prior = KandinskyPriorPipeline.from_pretrained(
            "kandinsky-community/kandinsky-2-2-prior",
            prior=prior,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            torch_dtype=torch.float16,
        ).to(device)

        # The Kandinsky 2.2 decoder turns the image embeddings into pixels.
        self.pipe = DiffusionPipeline.from_pretrained(
            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
        ).to(device)

    def inference(self, text, negative_text, steps, guidance_scale, width, height):
        # Run the prior + decoder twice so the demo shows two candidate images.
        gen_images = []
        for _ in range(2):
            image_emb, negative_image_emb = self.pipe_prior(
                text, negative_prompt=negative_text
            ).to_tuple()
            image = self.pipe(
                image_embeds=image_emb,
                negative_image_embeds=negative_image_emb,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
                width=width,
                height=height,
            ).images
            gen_images.append(image[0])
        return gen_images


selected_model = Ours(device=__DEVICE__)


@spaces.GPU
def get_images(text, negative_text, steps, guidance_scale, width, height, fixed_res):
    # A resolution preset such as "512x512" overrides the manual width/height.
    if fixed_res != "manual":
        print(f"Using {fixed_res} resolution")
        width, height = fixed_res.split("x")
    return selected_model.inference(
        text,
        negative_text,
        steps,
        guidance_scale,
        width=int(width),
        height=int(height),
    )


with gr.Blocks() as demo:
    gr.Markdown(
        """