import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image
import os
import subprocess

# Install flash-attn at startup; FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE skips the
# slow CUDA source build. Merging os.environ keeps PATH visible to the pip
# subprocess (a bare replacement env can leave pip unresolvable).
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)
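
# Phi-3.5 uses a chat-style prompt; these delimiters mark the user turn,
# the assistant turn, and the end of each turn.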
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"
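
# Assembled with the image placeholder, a question like "Describe the scene"
# becomes:
#   <|user|>
#   <|image_1|>
#   Describe the scene<|end|>
#   <|assistant|>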
model_name = "microsoft/Phi-3.5-vision-instruct"
# Lazy-load the model and processor at runtime so the Space can start without
# a GPU attached; a module-level cache avoids reloading the weights on every
# request when the worker process is reused (and is a harmless no-op when each
# call runs in a fresh worker).
_model_cache = {}

def get_model_and_processor(model_id):
    if model_id not in _model_cache:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        ).cuda().eval()
        processor = AutoProcessor.from_pretrained(
            model_id,
            trust_remote_code=True,
        )
        _model_cache[model_id] = (model, processor)
    return _model_cache[model_id]
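
# spaces.GPU requests a ZeroGPU slot for the duration of each call, so all
# CUDA work (including the model load above) has to happen inside the
# decorated function rather than at import time.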
@spaces.GPU(memory=30)
def run_example(image, text_input=None, model_id=model_name):
    if image is None:
        return "Please upload an image first."
    model, processor = get_model_and_processor(model_id)
    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
    # Gradio delivers the upload as a numpy array; convert it to an RGB PIL image.
    image = Image.fromarray(image).convert("RGB")
    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )
    # Drop the prompt tokens so only the newly generated answer is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response
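
# A minimal sketch of calling run_example outside the UI (assumes a local CUDA
# GPU and that "image1.jpeg" from the examples below sits next to this script):
#   import numpy as np
#   img = np.array(Image.open("image1.jpeg"))
#   print(run_example(img, "Describe the scene in this picture."))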
css = """
#output {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
# Create the Gradio interface
with gr.Blocks(css=css) as demo:
    gr.Markdown("## Phi-3.5 Vision Instruct Demo with Example Inputs")
    with gr.Tab(label="Phi-3.5 Input"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                model_selector = gr.Dropdown(
                    choices=[model_name],
                    label="Model",
                    value=model_name,
                )
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                # elem_id="output" ties this box to the #output rule in the CSS above.
                output_text = gr.Textbox(label="Output Text", elem_id="output")
        examples = [
            ["image1.jpeg", "What does this painting tell us? Explain in detail."],
            ["image2.jpg", "What does this painting tell us? Explain in detail."],
            ["image3.jpg", "Describe the scene in this picture."],
        ]
        gr.Examples(
            examples=examples,
            inputs=[input_img, text_input],
            examples_per_page=3,
        )
        submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])
# Queue and launch the demo
demo.queue()
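# Binding to 0.0.0.0 makes the server reachable from outside the container;
# Gradio listens on its default port, 7860.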
demo.launch(server_name="0.0.0.0")