import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
from datetime import datetime
import numpy as np
import base64
import os
# Load the model in half-precision on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "./Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("./Qwen2-VL-7B-Instruct")
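# Note: torch_dtype="auto" loads the checkpoint in its native precision
# (bfloat16 for Qwen2-VL), and device_map="auto" places layers across the
# available GPU(s)/CPU via accelerate.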
def array_to_image_path(image_array):
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    # Convert numpy array to PIL Image
    img = Image.fromarray(np.uint8(image_array))
    # Generate a unique filename using timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"image_{timestamp}.png"
    # Save the image
    img.save(filename)
    # Get the full path of the saved image
    full_path = os.path.abspath(filename)
    return full_path
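# Example: array_to_image_path(np.zeros((64, 64, 3), dtype=np.uint8)) writes
# image_<timestamp>.png to the working directory and returns its absolute path.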
def generate_embeddings(text):
    # Note: reloading the SentenceTransformer on every call is simple but slow;
    # loading it once at module level would avoid the repeated disk reads.
    embedding_model = SentenceTransformer('./all-MiniLM-L6-v2')
    embeddings = embedding_model.encode(text)
    return embeddings.tolist()  # lists are JSON-serializable for the gr.JSON output
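# Example (a sketch): generate_embeddings("a red bicycle") returns a
# 384-dimensional vector, the embedding width of all-MiniLM-L6-v2.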
def describe_image(image):
    # Convert the image to the format expected by the model
    image_path = array_to_image_path(image)
    pil_image = Image.open(image_path)
    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"data:image/png;base64,{image_b64}"},
                {"type": "text", "text": "Make a very detailed description of the image."},
            ],
        }
    ]
    text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Make a very detailed description of the image.<|im_end|>\n<|im_start|>assistant\n'
    inputs = processor(
        text=[text_prompt], images=[pil_image], padding=True, return_tensors="pt"
    )
    # Move inputs to the same device as the model (also works on CPU-only hosts)
    inputs = inputs.to(model.device)
    # Inference: generate, then strip the prompt tokens from each output sequence
    output_ids = model.generate(**inputs, max_new_tokens=128)  # raise max_new_tokens for longer descriptions
    generated_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, output_ids)
    ]
    # batch_decode returns a list; take the single description string
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]
    # Remove the temporary image file
    os.remove(image_path)
    return output_text, generate_embeddings(output_text)
# Create a Gradio interface
iface = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(),
    outputs=[gr.Textbox(label="Description"), gr.JSON(label="Embeddings")],
    title="Image Description with Qwen Model",
    description="Upload an image to get a detailed description using the Qwen2-VL-7B-Instruct model."
)

# Launch the app
# iface.launch(share=True)
iface.launch()
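# A plausible requirements.txt for this Space (a sketch; the pins are
# assumptions, not taken from the original repo):
#   gradio
#   transformers>=4.45.0    # Qwen2-VL support landed around 4.45
#   sentence-transformers
#   accelerate              # needed for device_map="auto"
#   torch
#   pillow
#   numpy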