import gradio as gr
import torch
from transformers import CLIPTextModelWithProjection, CLIPTokenizer
from diffusers import DiffusionPipeline

from src.priors.prior_transformer import (
    PriorTransformer,
)  # original huggingface prior transformer without time conditioning
from src.pipelines.pipeline_kandinsky_prior import KandinskyPriorPipeline

import spaces

# Prefer the GPU when one is available; fall back to the CPU otherwise.
__DEVICE__ = "cuda" if torch.cuda.is_available() else "cpu"


class Ours:
    def __init__(self, device):
        # Frozen OpenCLIP ViT-bigG/14 text encoder used by the ECLIPSE prior.
        text_encoder = (
            CLIPTextModelWithProjection.from_pretrained(
                "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
                projection_dim=1280,
                torch_dtype=torch.float16,
            )
            .eval()
            .requires_grad_(False)
        )
        tokenizer = CLIPTokenizer.from_pretrained(
            "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
        )

        # The ECLIPSE prior maps text embeddings to CLIP image embeddings.
        prior = PriorTransformer.from_pretrained(
            "ECLIPSE-Community/ECLIPSE_KandinskyV22_Prior",
            torch_dtype=torch.float16,
        )
        self.pipe_prior = KandinskyPriorPipeline.from_pretrained(
            "kandinsky-community/kandinsky-2-2-prior",
            prior=prior,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            torch_dtype=torch.float16,
        ).to(device)

        # The Kandinsky 2.2 decoder turns the image embeddings into pixels.
        self.pipe = DiffusionPipeline.from_pretrained(
            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
        ).to(device)

    def inference(self, text, negative_text, steps, guidance_scale, width, height):
        # Run the prior + decoder twice so the demo shows two candidate images.
        gen_images = []
        for _ in range(2):
            image_emb, negative_image_emb = self.pipe_prior(
                text, negative_prompt=negative_text
            ).to_tuple()
            image = self.pipe(
                image_embeds=image_emb,
                negative_image_embeds=negative_image_emb,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
                width=width,
                height=height,
            ).images
            gen_images.append(image[0])
        return gen_images


selected_model = Ours(device=__DEVICE__)


@spaces.GPU
def get_images(text, negative_text, steps, guidance_scale, width, height, fixed_res):
    # A resolution preset such as "512x512" overrides the manual width/height.
    if fixed_res != "manual":
        print(f"Using {fixed_res} resolution")
        width, height = fixed_res.split("x")
    return selected_model.inference(
        text,
        negative_text,
        steps,
        guidance_scale,
        width=int(width),
        height=int(height),
    )


with gr.Blocks() as demo:
    gr.Markdown(
        """