import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
from datetime import datetime
import numpy as np
import base64
import os
# Load the model in half-precision on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "./Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("./Qwen2-VL-7B-Instruct")
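# Note: torch_dtype="auto" loads the checkpoint in its native precision
# (bfloat16 for Qwen2-VL), and device_map="auto" places layers across the
# available GPU(s)/CPU via accelerate.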
def array_to_image_path(image_array):
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    # Convert numpy array to PIL Image
    img = Image.fromarray(np.uint8(image_array))
    # Generate a unique filename using timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"image_{timestamp}.png"
    # Save the image
    img.save(filename)
    # Get the full path of the saved image
    full_path = os.path.abspath(filename)
    return full_path
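# Example: array_to_image_path(np.zeros((64, 64, 3), dtype=np.uint8)) writes
# image_<timestamp>.png to the working directory and returns its absolute path.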
def generate_embeddings(text):
    # Note: reloading the SentenceTransformer on every call is simple but slow;
    # loading it once at module level would avoid the repeated disk reads.
    embedding_model = SentenceTransformer('./all-MiniLM-L6-v2')
    embeddings = embedding_model.encode(text)
    return embeddings.tolist()  # lists are JSON-serializable for the gr.JSON output
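# Example (a sketch): generate_embeddings("a red bicycle") returns a
# 384-dimensional vector, the embedding width of all-MiniLM-L6-v2.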
def describe_image(image):
    # Convert the image to the format expected by the model
    image_path = array_to_image_path(image)
    pil_image = Image.open(image_path)
    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"data:image/png;base64,{image_b64}"},
                {"type": "text", "text": "Make a very detailed description of the image."},
            ],
        }
    ]
    text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Make a very detailed description of the image.<|im_end|>\n<|im_start|>assistant\n'
    inputs = processor(
        text=[text_prompt], images=[pil_image], padding=True, return_tensors="pt"
    )
    # Move inputs to the same device as the model (also works on CPU-only hosts)
    inputs = inputs.to(model.device)
    # Inference: generate, then strip the prompt tokens from each output sequence
    output_ids = model.generate(**inputs, max_new_tokens=128)  # raise max_new_tokens for longer descriptions
    generated_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, output_ids)
    ]
    # batch_decode returns a list; take the single description string
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]
    # Remove the temporary image file
    os.remove(image_path)
    return output_text, generate_embeddings(output_text)
# Create a Gradio interface
iface = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(),
    outputs=[gr.Textbox(label="Description"), gr.JSON(label="Embeddings")],
    title="Image Description with Qwen Model",
    description="Upload an image to get a detailed description using the Qwen2-VL-7B-Instruct model."
)

# Launch the app
# iface.launch(share=True)
iface.launch()
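# A plausible requirements.txt for this Space (a sketch; the pins are
# assumptions, not taken from the original repo):
#   gradio
#   transformers>=4.45.0    # Qwen2-VL support landed around 4.45
#   sentence-transformers
#   accelerate              # needed for device_map="auto"
#   torch
#   pillow
#   numpy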