import io
from urllib.request import urlopen

import gradio as gr
import soundfile as sf
import spaces
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
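
# Hugging Face Hub checkpoint to load.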
model_path = "microsoft/Phi-4-multimodal-instruct"
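
# trust_remote_code pulls in the checkpoint's custom modeling code; eager
# attention avoids a hard dependency on flash-attention.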
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
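
# The model card pairs this checkpoint with its own GenerationConfig; loading
# it here (an addition to the original script) picks up the intended decoding
# defaults and puts the GenerationConfig import to use.
generation_config = GenerationConfig.from_pretrained(model_path)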

# Chat-template markers expected by Phi-4's prompt format.
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'


@spaces.GPU
def process_input(input_type, file, question):
    """Answer a question about an uploaded image or audio file with Phi-4."""
    if not file or not question:
        return "Please upload a file and provide a question."

    if input_type == "Image":
        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
        image = Image.open(file)
        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
    elif input_type == "Audio":
        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
        audio, samplerate = sf.read(file)
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
    else:
        return "Invalid input type selected."

    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            generation_config=generation_config,
            num_logits_to_keep=0,
        )

    # Strip the prompt tokens so only the newly generated text is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return response


with gr.Blocks(
    title="Phi-4 Multimodal Demo",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    gr.Markdown(
        """
# Phi-4 Multimodal Demo
Upload an **image** or **audio** file, ask a question, and get a response from the model!

Built with the `microsoft/Phi-4-multimodal-instruct` model by Microsoft.

Credits: Grok from xAI helped me build this demo.
"""
    )
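
    # Layout: input controls on the left, model response on the right.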
    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload Your File",
                file_types=["image", "audio"],
            )
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
                lines=2,
            )
            submit_btn = gr.Button("Submit", variant="primary")

        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="Model Response",
                placeholder="Response will appear here...",
                lines=10,
                interactive=False,
            )

    with gr.Accordion("Examples", open=False):
        gr.Markdown("Try these examples:")
        gr.Examples(
            examples=[
                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
            ],
            inputs=[input_type, file_input, question_input],
            outputs=output_text,
            fn=process_input,
            cache_examples=False,
        )
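
    # Wire the submit button to the inference function.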
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input, question_input],
        outputs=output_text,
    )

demo.launch()