Spaces:

prithivMLmods
/

Qwen2.5-VL-Outpost

Running on Zero

App Files Files Community

Qwen2.5-VL-Outpost / app.py

prithivMLmods

update app (#6)

05f9a90 verified 25 days ago

raw

history blame

15 kB

	import os
	import random
	import uuid
	import json
	import time
	import asyncio
	from threading import Thread

	import gradio as gr
	import spaces
	import torch
	import numpy as np
	from PIL import Image
	import cv2

	from transformers import (
	Qwen2_5_VLForConditionalGeneration,
	AutoModel,
	AutoTokenizer,
	AutoProcessor,
	TextIteratorStreamer,
	)
	from transformers.image_utils import load_image

	# Token budgets for generation: the default and the ceiling offered by the
	# "Max new tokens" slider in the UI.
	DEFAULT_MAX_NEW_TOKENS = 1024
	MAX_MAX_NEW_TOKENS = 2048
	# Passed as `max_length` to the processors; overridable via the environment.
	MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

	# Run on the first CUDA device when one is visible, otherwise on the CPU.
	if torch.cuda.is_available():
	    device = torch.device("cuda:0")
	else:
	    device = torch.device("cpu")

	# --- Model Loading ---

	def _load_processor_and_model(repo_id):
	    """
	    Load the processor and the half-precision model for one checkpoint.

	    The model is moved to the module-level `device` and switched to eval
	    mode before being returned.

	    Returns:
	        A `(processor, model)` tuple for `repo_id`.
	    """
	    loaded_processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
	    loaded_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	        repo_id,
	        trust_remote_code=True,
	        torch_dtype=torch.float16,
	    )
	    return loaded_processor, loaded_model.to(device).eval()


	# Qwen2.5-VL-7B-Instruct
	MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
	processor_m, model_m = _load_processor_and_model(MODEL_ID_M)

	# Qwen2.5-VL-3B-Instruct
	MODEL_ID_X = "Qwen/Qwen2.5-VL-3B-Instruct"
	processor_x, model_x = _load_processor_and_model(MODEL_ID_X)

	# Qwen2.5-VL-7B-Abliterated-Caption-it
	MODEL_ID_Q = "prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it"
	processor_q, model_q = _load_processor_and_model(MODEL_ID_Q)

	# DeepCaption-VLA-7B
	MODEL_ID_DC = "prithivMLmods/DeepCaption-VLA-7B"
	processor_dc, model_dc = _load_processor_and_model(MODEL_ID_DC)


	# --- System Prompt for DeepCaption-VLA-7B ---
	# Instruction text that generate_image / generate_video prepend to the user's
	# query when "DeepCaption-VLA-7B" is selected. Leading and trailing whitespace
	# of the literal is removed by the .strip() call.
	CAPTION_SYSTEM_PROMPT = """
	You are an AI assistant that rigorously follows this response protocol:

	1. For every input image, your primary task is to write a precise caption. The caption must capture the essence of the image in clear, concise, and contextually accurate language.

	2. Along with the caption, provide a structured set of attributes that describe the visual elements. Attributes should include details such as objects, people, actions, colors, environment, mood, and other notable characteristics.

	3. Always include a class_name field. This must represent the core theme or main subject of the image in a compact format.
	- Use the syntax: `{class_name==write_the_core_theme}`
	- Example: `{class_name==dog_playing}` or `{class_name==city_sunset}`

	4. Maintain the following strict format in your output:
	- Caption: <one-sentence description>
	- Attributes: <comma-separated list of visual attributes>
	- {class_name==core_theme}

	5. Ensure captions are precise, neutral, and descriptive, avoiding unnecessary elaboration or subjective interpretation unless explicitly required.

	6. Do not reference the rules or instructions in the output. Only return the formatted caption, attributes, and class_name.
	""".strip()


	def downsample_video(video_path, num_frames=10):
	    """
	    Sample evenly spaced frames from a video file.

	    Args:
	        video_path: Path of the video, handed to `cv2.VideoCapture`.
	        num_frames: Upper bound on the number of frames to sample. Defaults
	            to 10, the value that was previously hard-coded.

	    Returns:
	        A list of `(PIL.Image, timestamp_in_seconds)` tuples in playback
	        order. Frames that fail to decode are skipped, so the list can be
	        shorter than `num_frames` and is empty when nothing can be read.

	    Raises:
	        ValueError: If `num_frames` is smaller than 1.
	    """
	    if num_frames < 1:
	        raise ValueError("num_frames must be at least 1")

	    frames = []
	    vidcap = cv2.VideoCapture(video_path)
	    try:
	        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
	        fps = vidcap.get(cv2.CAP_PROP_FPS)
	        # Clamp so a non-positive reported frame count cannot produce
	        # negative sample positions.
	        last_index = max(total_frames - 1, 0)
	        # linspace repeats indices when the clip has fewer frames than
	        # requested; np.unique keeps each position once, in ascending order.
	        frame_indices = np.unique(np.linspace(0, last_index, num_frames, dtype=int))
	        for raw_index in frame_indices:
	            index = int(raw_index)
	            vidcap.set(cv2.CAP_PROP_POS_FRAMES, index)
	            success, image = vidcap.read()
	            if not success:
	                continue
	            # OpenCV decodes to BGR; PIL expects RGB.
	            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
	            # A reported frame rate of 0 would otherwise yield an infinite
	            # timestamp; fall back to 0.0 when the rate is unusable.
	            timestamp = round(index / fps, 2) if fps > 0 else 0.0
	            frames.append((Image.fromarray(image), timestamp))
	    finally:
	        # Release the capture handle even if decoding raised.
	        vidcap.release()
	    return frames

	@spaces.GPU
	def generate_image(model_name: str, text: str, image: Image.Image,
	                   max_new_tokens: int = 1024,
	                   temperature: float = 0.6,
	                   top_p: float = 0.9,
	                   top_k: int = 50,
	                   repetition_penalty: float = 1.2):
	    """
	    Stream a response from the selected model for a single image.

	    Args:
	        model_name: One of the model labels offered by the UI radio group.
	        text: The user's query. For "DeepCaption-VLA-7B" the caption system
	            prompt is prepended to it.
	        image: The uploaded image, or None when nothing was uploaded.
	        max_new_tokens, temperature, top_p, top_k, repetition_penalty:
	            Sampling settings forwarded to `model.generate`.

	    Yields:
	        `(text_so_far, text_so_far)` tuples: the accumulated output, once
	        for the raw textbox and once for the Markdown panel. A single
	        message pair is yielded when the model name is unknown or no image
	        was provided.

	    Raises:
	        RuntimeError: If generation fails inside the worker thread.
	    """
	    checkpoints = {
	        "Qwen2.5-VL-7B-Instruct": (processor_m, model_m),
	        "Qwen2.5-VL-3B-Instruct": (processor_x, model_x),
	        "Qwen2.5-VL-7B-Abliterated-Caption-it": (processor_q, model_q),
	        "DeepCaption-VLA-7B": (processor_dc, model_dc),
	    }
	    if model_name not in checkpoints:
	        yield "Invalid model selected.", "Invalid model selected."
	        return
	    processor, model = checkpoints[model_name]
	    if model_name == "DeepCaption-VLA-7B":
	        # Prepend system prompt for this model
	        text = f"{CAPTION_SYSTEM_PROMPT}\n\n{text}"

	    if image is None:
	        yield "Please upload an image.", "Please upload an image."
	        return

	    messages = [{
	        "role": "user",
	        "content": [
	            {"type": "image", "image": image},
	            {"type": "text", "text": text},
	        ]
	    }]
	    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    inputs = processor(
	        text=[prompt_full],
	        images=[image],
	        return_tensors="pt",
	        padding=True,
	        truncation=False,
	        max_length=MAX_INPUT_TOKEN_LENGTH
	    ).to(device)

	    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
	    generation_kwargs = {
	        **inputs,
	        "streamer": streamer,
	        "max_new_tokens": max_new_tokens,
	        "temperature": temperature,
	        "top_p": top_p,
	        "top_k": top_k,
	        "repetition_penalty": repetition_penalty,
	        "do_sample": True,
	    }

	    worker_errors = []

	    def run_generation():
	        # If generate() raises, the streamer never receives its end-of-stream
	        # signal and the consuming loop below would block forever. Record
	        # the error and close the stream so it can be reported instead.
	        try:
	            model.generate(**generation_kwargs)
	        except Exception as exc:
	            worker_errors.append(exc)
	            streamer.end()

	    thread = Thread(target=run_generation)
	    thread.start()

	    buffer = ""
	    for new_text in streamer:
	        buffer += new_text
	        time.sleep(0.01)
	        yield buffer, buffer

	    thread.join()
	    if worker_errors:
	        raise RuntimeError(f"Generation failed: {worker_errors[0]}") from worker_errors[0]

	@spaces.GPU
	def generate_video(model_name: str, text: str, video_path: str,
	                   max_new_tokens: int = 1024,
	                   temperature: float = 0.6,
	                   top_p: float = 0.9,
	                   top_k: int = 50,
	                   repetition_penalty: float = 1.2):
	    """
	    Stream a response from the selected model for a video.

	    The video is reduced to sampled frames by `downsample_video`; each frame
	    is added to the user message preceded by a "Frame at <t>s:" text label.

	    Args:
	        model_name: One of the model labels offered by the UI radio group.
	        text: The user's query. For "DeepCaption-VLA-7B" the caption system
	            prompt is prepended to it.
	        video_path: Path of the uploaded video, or None when nothing was
	            uploaded.
	        max_new_tokens, temperature, top_p, top_k, repetition_penalty:
	            Sampling settings forwarded to `model.generate`.

	    Yields:
	        `(text_so_far, text_so_far)` tuples: the accumulated output, once
	        for the raw textbox and once for the Markdown panel. A single
	        message pair is yielded when the model name is unknown, no video was
	        provided, or no frame could be read from it.

	    Raises:
	        RuntimeError: If generation fails inside the worker thread.
	    """
	    checkpoints = {
	        "Qwen2.5-VL-7B-Instruct": (processor_m, model_m),
	        "Qwen2.5-VL-3B-Instruct": (processor_x, model_x),
	        "Qwen2.5-VL-7B-Abliterated-Caption-it": (processor_q, model_q),
	        "DeepCaption-VLA-7B": (processor_dc, model_dc),
	    }
	    if model_name not in checkpoints:
	        yield "Invalid model selected.", "Invalid model selected."
	        return
	    processor, model = checkpoints[model_name]
	    if model_name == "DeepCaption-VLA-7B":
	        # Prepend system prompt for this model
	        text = f"{CAPTION_SYSTEM_PROMPT}\n\n{text}"

	    if video_path is None:
	        yield "Please upload a video.", "Please upload a video."
	        return

	    frames = downsample_video(video_path)
	    if not frames:
	        # Without this guard the model would answer from the text alone,
	        # as if it had seen the video.
	        yield "Could not read any frames from the video.", "Could not read any frames from the video."
	        return

	    # Create the message structure with a system prompt and user query
	    user_content = [{"type": "text", "text": text}]
	    # Add each frame to the user content, labelled with its timestamp
	    for image, timestamp in frames:
	        user_content.append({"type": "text", "text": f"Frame at {timestamp}s:"})
	        user_content.append({"type": "image", "image": image})
	    messages = [
	        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
	        {"role": "user", "content": user_content}
	    ]

	    # Prepare inputs for the model
	    inputs = processor.apply_chat_template(
	        messages,
	        tokenize=True,
	        add_generation_prompt=True,
	        return_dict=True,
	        return_tensors="pt",
	        truncation=False,
	        max_length=MAX_INPUT_TOKEN_LENGTH
	    ).to(device)

	    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
	    generation_kwargs = {
	        **inputs,
	        "streamer": streamer,
	        "max_new_tokens": max_new_tokens,
	        "do_sample": True,
	        "temperature": temperature,
	        "top_p": top_p,
	        "top_k": top_k,
	        "repetition_penalty": repetition_penalty,
	    }

	    worker_errors = []

	    def run_generation():
	        # If generate() raises, the streamer never receives its end-of-stream
	        # signal and the consuming loop below would block forever. Record
	        # the error and close the stream so it can be reported instead.
	        try:
	            model.generate(**generation_kwargs)
	        except Exception as exc:
	            worker_errors.append(exc)
	            streamer.end()

	    thread = Thread(target=run_generation)
	    thread.start()

	    buffer = ""
	    for new_text in streamer:
	        buffer += new_text
	        time.sleep(0.01)
	        yield buffer, buffer

	    thread.join()
	    if worker_errors:
	        raise RuntimeError(f"Generation failed: {worker_errors[0]}") from worker_errors[0]


	# Sample inputs for the two tabs; every row is [query text, media file path].
	image_examples = [
	    [
	        "Provide a detailed caption for the image..",
	        "images/A.jpg",
	    ],
	    [
	        "Explain the pie-chart in detail.",
	        "images/2.jpg",
	    ],
	    [
	        "Jsonify Data.",
	        "images/1.jpg",
	    ],
	]

	video_examples = [
	    [
	        "Explain the ad in detail",
	        "videos/1.mp4",
	    ],
	    [
	        "Identify the main actions in the video",
	        "videos/2.mp4",
	    ],
	    [
	        "Identify the main scenes in the video",
	        "videos/3.mp4",
	    ],
	]

	# Custom stylesheet passed to gr.Blocks(css=...). It colors the buttons tagged
	# with the "submit-btn" class and draws a rounded border around the column
	# tagged with the "canvas-output" class.
	css = """
	.submit-btn {
	background-color: #2980b9 !important;
	color: white !important;
	}
	.submit-btn:hover {
	background-color: #3498db !important;
	}
	.canvas-output {
	border: 2px solid #4682B4;
	border-radius: 10px;
	padding: 20px;
	}
	"""

	# Create the Gradio Interface
	# Layout: the left column holds the image/video input tabs and the sampling
	# sliders; the right column holds the streamed output, the model selector and
	# the model descriptions. Both submit buttons write to the same two outputs.
	with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
	    gr.Markdown("# [Qwen2.5-VL-Outpost](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)")
	    with gr.Row():
	        with gr.Column():
	            with gr.Tabs():
	                with gr.TabItem("Image Inference"):
	                    image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
	                    image_upload = gr.Image(type="pil", label="Image")
	                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
	                    gr.Examples(
	                        examples=image_examples,
	                        inputs=[image_query, image_upload]
	                    )
	                with gr.TabItem("Video Inference"):
	                    video_query = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query here...")
	                    video_upload = gr.Video(label="Video")
	                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
	                    gr.Examples(
	                        examples=video_examples,
	                        inputs=[video_query, video_upload]
	                    )
	            with gr.Accordion("Advanced options", open=False):
	                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
	                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
	                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
	                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
	                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)

	        with gr.Column():
	            with gr.Column(elem_classes="canvas-output"):
	                gr.Markdown("## Output")
	                output = gr.Textbox(label="Raw Output", interactive=False, lines=2, scale=2)

	                with gr.Accordion("(Result.md)", open=False):
	                    markdown_output = gr.Markdown()

	            model_choice = gr.Radio(
	                choices=[
	                    "Qwen2.5-VL-7B-Instruct",
	                    "Qwen2.5-VL-3B-Instruct",
	                    "Qwen2.5-VL-7B-Abliterated-Caption-it",
	                    "DeepCaption-VLA-7B"
	                ],
	                label="Select Model",
	                value="Qwen2.5-VL-7B-Instruct"
	            )
	            # A plain "|" is used here: "\|" is not a valid escape sequence in
	            # a normal Python string literal, and Markdown renders both alike.
	            gr.Markdown("Model Info 💻 | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Qwen2.5-VL/discussions)")
	            gr.Markdown("> [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): The Qwen2.5-VL-7B-Instruct model is a multimodal AI model developed by Alibaba Cloud that excels at understanding both text and images. It's a Vision-Language Model (VLM) designed to handle various visual understanding tasks, including image understanding, video analysis, and even multilingual support.")
	            gr.Markdown("> [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct): Qwen2.5-VL-3B-Instruct is an instruction-tuned vision-language model from Alibaba Cloud, built upon the Qwen2-VL series. It excels at understanding and generating text related to both visual and textual inputs, making it capable of tasks like image captioning, visual question answering, and object localization. The model also supports long video understanding and structured data extraction")
	            gr.Markdown("> [Qwen2.5-VL-7B-Abliterated-Caption-it](https://huggingface.co/prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it): Qwen2.5-VL-7B-Abliterated-Caption-it is a fine-tuned version of Qwen2.5-VL-7B-Instruct, optimized for Abliterated Captioning / Uncensored Captioning. This model excels at generating detailed, context-rich, and high-fidelity captions across diverse image categories and variational aspect ratios, offering robust visual understanding without filtering or censorship.")
	            gr.Markdown("> [prithivMLmods/DeepCaption-VLA-7B](https://huggingface.co/prithivMLmods/DeepCaption-VLA-7B): DeepCaption-VLA-7B is a fine-tuned model based on Qwen2.5-VL, designed for generating precise, structured captions and attributes for images. It follows a strict protocol to provide a main caption, a list of visual attributes, and a core class name, making it ideal for detailed and organized visual analysis.")
	            gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")

	    # Both handlers are generators, so the two outputs update as text streams in.
	    image_submit.click(
	        fn=generate_image,
	        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
	        outputs=[output, markdown_output]
	    )
	    video_submit.click(
	        fn=generate_video,
	        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
	        outputs=[output, markdown_output]
	    )

	# Script entry point: enable the request queue (at most 50 waiting jobs),
	# then start the app with a share link and error display turned on.
	if __name__ == "__main__":
	    queued_demo = demo.queue(max_size=50)
	    queued_demo.launch(share=True, show_error=True)