import spaces import gradio as gr import torch from PIL import Image from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, pipeline from diffusers import DiffusionPipeline import random import numpy as np import os import subprocess from qwen_vl_utils import process_vision_info from threading import Thread import uuid import io # Initialize models device = "cuda" if torch.cuda.is_available() else "cpu" dtype = torch.bfloat16 huggingface_token = os.getenv("HUGGINGFACE_TOKEN") # FLUX.1-dev model pipe = DiffusionPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", torch_dtype=dtype, token=huggingface_token ).to(device) # Initialize Qwen2VL model qwen_model = Qwen2VLForConditionalGeneration.from_pretrained( "prithivMLmods/JSONify-Flux", trust_remote_code=True, torch_dtype=torch.float16 ).to(device).eval() qwen_processor = AutoProcessor.from_pretrained("prithivMLmods/JSONify-Flux", trust_remote_code=True) # Prompt Enhancer enhancer_long = pipeline("summarization", model="gokaygokay/Lamini-Prompt-Enchance-Long", device=device) MAX_SEED = np.iinfo(np.int32).max MAX_IMAGE_SIZE = 1024 # Reduced to prevent memory issues # Qwen2VL caption function @spaces.GPU def qwen_caption(image): # Convert image to PIL if it's not already if not isinstance(image, Image.Image): image = Image.fromarray(image) messages = [ { "role": "user", "content": [ {"type": "image", "image": image}, {"type": "text", "text": "Caption the image"}, ], } ] text = qwen_processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) image_inputs, video_inputs = process_vision_info(messages) inputs = qwen_processor( text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt", ).to(device) generated_ids = qwen_model.generate(**inputs, max_new_tokens=1024) generated_ids_trimmed = [ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) ] output_text = qwen_processor.batch_decode( generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False, )[0] return output_text # Prompt Enhancer function def enhance_prompt(input_prompt): result = enhancer_long("Enhance the description: " + input_prompt) enhanced_text = result[0]['summary_text'] return enhanced_text @spaces.GPU(duration=190) def process_workflow(image, text_prompt, use_enhancer, seed, randomize_seed, width, height, guidance_scale, num_inference_steps, progress=gr.Progress(track_tqdm=True)): if image is not None: # Convert image to PIL if it's not already if not isinstance(image, Image.Image): image = Image.fromarray(image) prompt = qwen_caption(image) print(prompt) else: prompt = text_prompt if use_enhancer: prompt = enhance_prompt(prompt) if randomize_seed: seed = random.randint(0, MAX_SEED) generator = torch.Generator(device=device).manual_seed(seed) # Reduce memory usage by clearing GPU cache torch.cuda.empty_cache() # Generate image with FLUX.1-dev try: image = pipe( prompt=prompt, generator=generator, num_inference_steps=num_inference_steps, width=width, height=height, guidance_scale=guidance_scale ).images[0] except RuntimeError as e: if "CUDA out of memory" in str(e): raise RuntimeError("CUDA out of memory. Try reducing image size or inference steps.") else: raise e return image, prompt, seed custom_css = """ .input-group, .output-group { border: 1px solid #e0e0e0; border-radius: 10px; padding: 20px; margin-bottom: 20px; background-color: #f9f9f9; } .submit-btn { background-color: #2980b9 !important; color: white !important; } .submit-btn:hover { background-color: #3498db !important; } """ title = """
Create long prompts from images or enhance your short prompts with prompt enhancer