|
|
import gradio as gr |
|
|
from PIL import Image |
|
|
from transformers import AutoModelForCausalLM |
|
|
from transformers import AutoProcessor |
|
|
from transformers import TextIteratorStreamer |
|
|
from threading import Thread |
|
|
import torch |
|
|
import spaces |
|
|
|
|
|
# Hugging Face model id for the 128k-context Phi-3 vision-language model.
model_id = "microsoft/Phi-3-vision-128k-instruct"

# Load the model fully on CPU in float32.
# trust_remote_code=True is required: Phi-3-vision ships custom modeling code
# on the Hub rather than living inside the transformers package.
# NOTE(review): "_attn_implementation" (leading underscore) appears to be a
# private kwarg consumed by the remote modeling code; "eager" avoids
# flash-attention, which is unavailable on CPU — confirm against the model's
# remote code before renaming it to the public "attn_implementation".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cpu",
    trust_remote_code=True,
    torch_dtype=torch.float32,
    _attn_implementation="eager"
)

# Processor bundles the image preprocessor and the tokenizer for this model.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
|
|
|
|
|
# NOTE(review): removed the @spaces.CPU decorator — the `spaces` package
# exposes spaces.GPU only, so @spaces.CPU raises AttributeError at import
# time. No decorator is needed here: the model is pinned to CPU via
# device_map="cpu" at load time.
def bot_streaming(message, history):
    """Stream a Phi-3-vision reply for a multimodal Gradio ChatInterface.

    Args:
        message: Multimodal message dict with "text" and "files" keys
            (requires ChatInterface(multimodal=True)).
        history: List of (user, assistant) text pairs from earlier turns.

    Yields:
        The accumulated response text, growing as tokens stream in.
        On any failure, yields a single "Error: ..." string so the UI
        shows the problem instead of crashing.
    """
    try:
        # Take the most recently uploaded file; some Gradio versions wrap
        # the path in a dict ({"path": ...}), others pass the path directly.
        files = message["files"]
        image_path = None
        if files:
            last = files[-1]
            image_path = last["path"] if isinstance(last, dict) else last

        if not image_path:
            raise ValueError("No image uploaded")

        # Rebuild the prior turns in the chat-template message format.
        conversation = []
        for user, assistant in history:
            conversation.append({"role": "user", "content": user})
            conversation.append({"role": "assistant", "content": assistant or ""})

        # Phi-3-vision expects the <|image_1|> placeholder inside the prompt
        # text; the processor pairs it with the PIL image below.
        conversation.append({"role": "user", "content": f"<|image_1|>\n{message['text']}"})

        prompt = processor.tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        image = Image.open(image_path)
        inputs = processor(prompt, image, return_tensors="pt")

        streamer = TextIteratorStreamer(
            processor, skip_special_tokens=True, skip_prompt=True
        )
        # FIX: dropped temperature=0.1 — it is ignored (and triggers a
        # transformers warning) when do_sample=False selects greedy decoding.
        generation_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=256,
            do_sample=False,
            eos_token_id=processor.tokenizer.eos_token_id,
        )

        # Run generate() on a worker thread so tokens can be yielded to the
        # UI as the streamer produces them.
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        buffer = ""
        for new_text in streamer:
            buffer += new_text
            yield buffer

        # FIX: reap the worker thread once the streamer is exhausted so any
        # generation error surfaces here instead of leaking a dangling thread.
        thread.join()

    except Exception as e:
        # Deliberate broad catch: surface the error in the chat window
        # rather than taking down the Gradio app.
        yield f"Error: {str(e)}"
|
|
|
|
|
# Build and launch the chat UI.
# FIX: multimodal=True is required — bot_streaming reads message["text"] and
# message["files"], and the examples use the multimodal dict format; with the
# default multimodal=False, ChatInterface passes a plain string and every
# turn would fail with a TypeError.
demo = gr.Blocks()

with demo:
    gr.ChatInterface(
        fn=bot_streaming,
        multimodal=True,
        title="Phi3 Vision 128K",
        description="Multimodal AI Vision Model",
        examples=[
            {"text": "Describe this image", "files": ["./example.jpg"]},
        ],
    )

# Enable request queuing (needed for streaming generators), then serve.
demo.queue()
demo.launch(debug=True)