Spaces:

ECLIPSE-Community
/

ECLIPSE-Kandinsky-v2.2

Running on Zero

App Files Files Community

ECLIPSE-Kandinsky-v2.2 / app.py

mpatel57

latest gradio changes

04efe97 verified about 1 year ago

raw

history blame contribute delete

6.54 kB

	import gradio as gr
	from PIL import Image

	import torch

	from torchvision import transforms
	from transformers import (
	CLIPProcessor,
	CLIPModel,
	CLIPTokenizer,
	CLIPTextModelWithProjection,
	CLIPVisionModelWithProjection,
	CLIPFeatureExtractor,
	)

	import math
	from typing import List
	from PIL import Image, ImageChops
	import numpy as np
	import torch

	from diffusers import UnCLIPPipeline

	# from diffusers.utils.torch_utils import randn_tensor

	from transformers import CLIPTokenizer

	from src.priors.prior_transformer import (
	PriorTransformer,
	) # original huggingface prior transformer without time conditioning
	from src.pipelines.pipeline_kandinsky_prior import KandinskyPriorPipeline

	from diffusers import DiffusionPipeline
	import spaces


	# Probe for a visible CUDA device first, falling back to the CPU otherwise.
	__DEVICE__ = "cuda" if torch.cuda.is_available() else "cpu"
	# The unconditional assignment below overrides the probe result, so the
	# pipelines are always placed on "cuda".
	# NOTE(review): presumably needed because the GPU is only attached inside
	# @spaces.GPU calls on this Space - confirm before removing either line.
	__DEVICE__ = "cuda"

	class Ours:
	    """Text-to-image wrapper pairing the ECLIPSE prior with the Kandinsky 2.2 decoder.

	    ``__init__`` loads two pipelines and stores them as ``self.pipe_prior``
	    (text -> image embeddings) and ``self.pipe`` (image embeddings -> image).
	    """

	    def __init__(self, device):
	        """Load all model components in float16 and move both pipelines to ``device``.

	        Args:
	            device: Target handed to each pipeline's ``.to()`` call.
	        """
	        clip_repo = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"

	        # Frozen text encoder: eval mode, gradients disabled.
	        text_encoder = (
	            CLIPTextModelWithProjection.from_pretrained(
	                clip_repo,
	                projection_dim=1280,
	                torch_dtype=torch.float16,
	            )
	            .eval()
	            .requires_grad_(False)
	        )

	        tokenizer = CLIPTokenizer.from_pretrained(clip_repo)

	        prior = PriorTransformer.from_pretrained(
	            "ECLIPSE-Community/ECLIPSE_KandinskyV22_Prior",
	            torch_dtype=torch.float16,
	        )

	        # The stock Kandinsky prior pipeline is loaded with its prior, text
	        # encoder and tokenizer replaced by the components built above.
	        self.pipe_prior = KandinskyPriorPipeline.from_pretrained(
	            "kandinsky-community/kandinsky-2-2-prior",
	            prior=prior,
	            text_encoder=text_encoder,
	            tokenizer=tokenizer,
	            torch_dtype=torch.float16,
	        ).to(device)

	        self.pipe = DiffusionPipeline.from_pretrained(
	            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
	        ).to(device)

	    def inference(
	        self, text, negative_text, steps, guidance_scale, width, height, num_images=2
	    ):
	        """Generate ``num_images`` images for a single prompt.

	        Each image is produced by one prior call followed by one decoder call.

	        Args:
	            text: Prompt passed to the prior pipeline.
	            negative_text: Passed to the prior as ``negative_prompt``.
	            steps: Forwarded to the decoder as ``num_inference_steps``.
	            guidance_scale: Forwarded to the decoder unchanged.
	            width: Forwarded to the decoder unchanged.
	            height: Forwarded to the decoder unchanged.
	            num_images: How many prior+decoder rounds to run. Defaults to 2,
	                the count that was previously hard-coded.

	        Returns:
	            A list holding the first entry of ``.images`` from every decoder call.
	        """
	        gen_images = []
	        for _ in range(num_images):
	            # The prior is deliberately re-run for every image.
	            # NOTE(review): presumably each call samples fresh embeddings -
	            # confirm before hoisting it out of the loop.
	            image_emb, negative_image_emb = self.pipe_prior(
	                text, negative_prompt=negative_text
	            ).to_tuple()
	            decoded = self.pipe(
	                image_embeds=image_emb,
	                negative_image_embeds=negative_image_emb,
	                num_inference_steps=steps,
	                guidance_scale=guidance_scale,
	                width=width,
	                height=height,
	            ).images
	            gen_images.append(decoded[0])
	        return gen_images


	# Build the model wrapper once at import time; get_images() below reads this
	# module-level instance on every request instead of constructing a new one.
	selected_model = Ours(device=__DEVICE__)

	def _parse_dimension(raw, label):
	    """Return ``raw`` as a positive int, or raise ``gr.Error`` naming ``label``."""
	    try:
	        pixels = int(raw)
	    except (TypeError, ValueError) as err:
	        raise gr.Error(
	            f"{label} must be a whole number of pixels, got {raw!r}."
	        ) from err
	    if pixels <= 0:
	        raise gr.Error(f"{label} must be a positive number of pixels, got {pixels}.")
	    return pixels


	@spaces.GPU
	def get_images(text, negative_text, steps, guidance_scale, width, height, fixed_res):
	    """Gradio callback: generate images for the prompt at the requested size.

	    Args:
	        text: Prompt forwarded to ``selected_model.inference``.
	        negative_text: Negative prompt forwarded unchanged.
	        steps: Forwarded unchanged.
	        guidance_scale: Forwarded unchanged.
	        width: Manual width; used only when ``fixed_res`` is ``"manual"``.
	        height: Manual height; used only when ``fixed_res`` is ``"manual"``.
	        fixed_res: ``"manual"`` or a ``"<width>x<height>"`` preset that
	            replaces the manual values.

	    Returns:
	        The list of images returned by ``selected_model.inference``.

	    Raises:
	        gr.Error: If the width or height cannot be read as a positive integer.
	    """
	    if fixed_res != "manual":
	        print(f"Using {fixed_res} resolution")
	        width, height = fixed_res.split("x")

	    # width/height come from free-text boxes, so validate them here and give
	    # the user a readable message instead of a bare ValueError.
	    width_px = _parse_dimension(width, "Width")
	    height_px = _parse_dimension(height, "Height")

	    return selected_model.inference(
	        text, negative_text, steps, guidance_scale, width=width_px, height=height_px
	    )


	# Gradio UI definition.
	# NOTE(review): the container nesting below was reconstructed from a source
	# whose indentation had been stripped - confirm it matches the intended layout.
	with gr.Blocks() as demo:
	    # Page header with links to the project page and the paper.
	    gr.Markdown(
	        """<h1 style="text-align: center;"><b>[CVPR 2024] <i>ECLIPSE</i>: Revisiting the Text-to-Image Prior for Efficient Image Generation</b></h1>
	<h1 style='text-align: center;'><a href='https://eclipse-t2i.vercel.app/'>Project Page</a> | <a href='https://arxiv.org/abs/2312.04655'>Paper</a> </h1>

	"""
	    )

	    with gr.Group():
	        with gr.Row():
	            with gr.Column():
	                text = gr.Textbox(
	                    label="Enter your prompt",
	                    show_label=False,
	                    max_lines=1,
	                    placeholder="Enter your prompt",
	                    elem_id="prompt-text-input",
	                )

	        with gr.Row():
	            with gr.Column():
	                negative_text = gr.Textbox(
	                    label="Enter your negative prompt",
	                    show_label=False,
	                    max_lines=1,
	                    placeholder="Enter your negative prompt",
	                    # Distinct id: the prompt box above already uses
	                    # "prompt-text-input".
	                    elem_id="negative-prompt-text-input",
	                )

	        with gr.Row():
	            steps = gr.Slider(label="Steps", minimum=10, maximum=100, value=50, step=1)
	            guidance_scale = gr.Slider(
	                label="Guidance Scale", minimum=0, maximum=10, value=7.5, step=0.1
	            )

	        with gr.Row():
	            with gr.Group():
	                # Width and height are free-text boxes; get_images converts
	                # them with int().
	                width_inp = gr.Textbox(
	                    label="Please provide the width",
	                    value="512",
	                    max_lines=1,
	                )
	                height_inp = gr.Textbox(
	                    label="Please provide the height",
	                    max_lines=1,
	                    value="512",
	                )

	            # Any choice other than "manual" replaces the width/height boxes.
	            fixed_res = gr.Dropdown(
	                ["manual", "512x512", "1024x1024", "1920x1080", "1280x720"],
	                value="manual",
	                label="Predefined Resolution",
	                info="Either select one or manually define one!",
	            )

	        with gr.Row():
	            btn = gr.Button(value="Generate Image")

	    gallery = gr.Gallery(
	        label="Generated images",
	        show_label=False,
	        elem_id="gallery",
	        columns=[2],
	        rows=[1],
	        object_fit="contain",
	        height="auto",
	    )

	    # The button click and pressing Enter in either prompt box all run the
	    # same callback with the same inputs, so register them from one list.
	    generation_inputs = [
	        text,
	        negative_text,
	        steps,
	        guidance_scale,
	        width_inp,
	        height_inp,
	        fixed_res,
	    ]
	    for register_event in (btn.click, text.submit, negative_text.submit):
	        register_event(get_images, inputs=generation_inputs, outputs=gallery)

	    with gr.Accordion(label="Ethics & Privacy", open=False):
	        gr.HTML(
	            """<div class="acknowledgments">
	<p><h4>Privacy</h4>
	We do not collect any images or key data. This demo is designed with sole purpose of fun and reducing misuse of AI.
	<p><h4>Biases and content acknowledgment</h4>
	This model will have the same biases as pre-trained CLIP model. </div>
	"""
	        )

	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    # Enable request queuing (max_size=20) before launching the app.
	    queued_demo = demo.queue(max_size=20)
	    queued_demo.launch()