import re

import spaces  # Hugging Face Spaces helper (provides the ZeroGPU `spaces.GPU` decorator)
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from PIL import Image, ImageDraw

def draw_bbox(image, bbox):
    """Draw a red rectangle for the given [x1, y1, x2, y2] box onto the image."""
    x1, y1, x2, y2 = bbox
    draw = ImageDraw.Draw(image)
    draw.rectangle((x1, y1, x2, y2), outline="red", width=5)
    return image

def extract_bbox_answer(content):
    """Parse the first 4-integer [x1, y1, x2, y2] list found inside a JSON-like answer."""
    bbox_pattern = r'\{.*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)]\s*.*\}'
    bbox_match = re.search(bbox_pattern, content)
    if bbox_match:
        bbox = [int(bbox_match.group(1)), int(bbox_match.group(2)), int(bbox_match.group(3)), int(bbox_match.group(4))]
        return bbox
    # Fall back to a degenerate box when the model output cannot be parsed
    return [0, 0, 0, 0]
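
# Illustrative only: for a model answer such as
#   '<answer>{"bbox": [12, 34, 56, 78]}</answer>'
# extract_bbox_answer returns [12, 34, 56, 78]. The exact JSON key is an
# assumption here; the regex only requires a 4-integer list inside braces.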

@spaces.GPU  # allocate a GPU for this call on ZeroGPU Spaces (no-op on other hardware)
def process_image_and_text(image, text):
    """Process image and text input, return thinking process and bbox."""
    question = f"Please provide the bounding box coordinate of the region this sentence describes: {text}."
    QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format."
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": QUESTION_TEMPLATE.format(Question=question)},
            ],
        }
    ]
    # Build the chat prompt, then tokenize it together with the image
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[prompt],
        images=image,
        return_tensors="pt",
        padding=True,
        padding_side="left",  # decoder-only models expect left padding for generation
        add_special_tokens=False,
    )
    inputs = inputs.to(device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
        # Strip the prompt tokens so only the newly generated answer is decoded
        generated_ids_trimmed = [
            out_ids[len(inputs.input_ids[0]):] for out_ids in generated_ids
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True
        )[0]
| print("output_text: ", output_text) | |
| # Extract thinking process | |
| think_match = re.search(r'<think>(.*?)</think>', output_text, re.DOTALL) | |
| thinking_process = think_match.group(1).strip() if think_match else "No thinking process found" | |
| # Get bbox and draw | |
| bbox = extract_bbox_answer(output_text) | |
| # Draw bbox on the image | |
| result_image = image.copy() | |
| result_image = draw_bbox(result_image, bbox) | |
| return thinking_process, result_image | |

if __name__ == "__main__":
    import gradio as gr

    # model_path = "/data/shz/project/vlm-r1/VLM-R1/output/Qwen2.5-VL-3B-GRPO-REC/checkpoint-500"
    model_path = "SZhanZ/Qwen2.5VL-VLM-R1-REC-step500"
    # device = "cuda" if torch.cuda.is_available() else "cpu"
    device = "cuda"

    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_path)

    def gradio_interface(image, text):
        thinking, result_image = process_image_and_text(image, text)
        return thinking, result_image
    demo = gr.Interface(
        fn=gradio_interface,
        inputs=[
            gr.Image(type="pil", label="Input Image"),
            gr.Textbox(label="Description Text"),
        ],
        outputs=[
            gr.Textbox(label="Thinking Process"),
            gr.Image(type="pil", label="Result with Bbox"),
        ],
        title="Visual Referring Expression Demo",
        description="Upload an image and enter a description; the system will return the thinking process and the region annotation. \n\nOur GitHub: [VLM-R1](https://github.com/om-ai-lab/VLM-R1/tree/main)",
        examples=[
            ["examples/image1.jpg", "person with blue shirt"],
            ["examples/image2.jpg", "food with the highest protein"],
            ["examples/image3.jpg", "the cheapest Apple laptop"],
        ],
        cache_examples=False,
        examples_per_page=10,
    )
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
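
# Local-run note (assumptions, not part of the original Space): this script expects a
# transformers build that includes Qwen2_5_VLForConditionalGeneration, plus torch,
# gradio, spaces and Pillow. When run directly (e.g. `python app.py`), the demo is
# served on port 7860; on Hugging Face Spaces the app is launched automatically.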