# MiniCPM-V-4_5 multimodal chatbot: a Hugging Face Space running on ZeroGPU
import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import spaces

# Initialize model and tokenizer
torch.manual_seed(100)
model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-V-4_5',
    trust_remote_code=True,
    attn_implementation='sdpa',
    torch_dtype=torch.bfloat16
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
    'openbmb/MiniCPM-V-4_5',
    trust_remote_code=True
)
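
# On ZeroGPU Spaces, the @spaces.GPU decorator below allocates a GPU only
# while the decorated function runs; duration=120 raises the per-call time
# limit to 120 seconds.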
@spaces.GPU(duration=120)
def respond(message, history, enable_thinking):
    """
    Process the user message and generate a response.
    """
    # Build the conversation history in the format expected by the model
    msgs = []

    # Add previous conversation turns
    for user_msg, assistant_msg in history:
        # Parse the user message for images and text. Gradio serializes
        # tuples to lists on the round trip through the frontend, so
        # accept both.
        if isinstance(user_msg, (tuple, list)):
            # The user message contains an image (and optional text)
            img_path, text = user_msg
            img = Image.open(img_path).convert('RGB')
            user_content = [img, text] if text else [img]
        else:
            # Text-only message
            user_content = [user_msg]
        msgs.append({"role": "user", "content": user_content})
        if assistant_msg:
            msgs.append({"role": "assistant", "content": [assistant_msg]})

    # Add the current message
    if isinstance(message, dict):
        # Multimodal input from gr.MultimodalTextbox
        current_content = []
        if message.get("files"):
            for file_path in message["files"]:
                img = Image.open(file_path).convert('RGB')
                current_content.append(img)
        if message.get("text"):
            current_content.append(message["text"])
    else:
        # Text-only input
        current_content = [message]
    msgs.append({"role": "user", "content": current_content})

    # Generate the response
    try:
        answer = model.chat(
            msgs=msgs,
            tokenizer=tokenizer,
            enable_thinking=enable_thinking
        )
        return answer
    except Exception as e:
        return f"Error: {str(e)}"

# Create the Gradio interface
with gr.Blocks(title="MiniCPM-V Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 MiniCPM-V Multimodal Chatbot
        Upload images and ask questions about them, or have a text conversation.
        The model supports multi-turn conversations with context memory.
        """
    )
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                height=500,
                show_label=False,
                container=True,
                type="tuples"
            )
            with gr.Row():
                msg = gr.MultimodalTextbox(
                    interactive=True,
                    file_types=["image"],
                    placeholder="Type a message or upload an image...",
                    show_label=False,
                    container=False
                )
            with gr.Row():
                clear = gr.Button("🗑️ Clear", size="sm")
                submit = gr.Button("📤 Send", variant="primary", size="sm")
        with gr.Column(scale=1):
            gr.Markdown("### Settings")
            enable_thinking = gr.Checkbox(
                label="Enable Thinking Mode",
                value=False,
                info="Enable the model's thinking process"
            )
            gr.Markdown(
                """
                ### Examples
                - Upload an image and ask "What is in this picture?"
                - Ask "What are the main objects visible?"
                - Follow up with "What should I pay attention to here?"
                """
            )
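
    # Note: the chatbot uses the legacy "tuples" history format, where each
    # turn is a (user_message, assistant_message) pair and a user message
    # with an attached image is itself a (file_path, text) tuple.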

    # Handle message submission
    def user_submit(message, history, enable_thinking):
        # Format the user message for display in the chatbot
        if isinstance(message, dict) and message.get("files"):
            # With a file attached, use the (file, text) tuple format
            user_msg = (message["files"][0], message.get("text", ""))
        else:
            user_msg = message.get("text", "") if isinstance(message, dict) else message

        # Add the user message to the history
        history = history + [(user_msg, None)]
        # Generate the response (pass the history without the new turn)
        response = respond(message, history[:-1], enable_thinking)
        # Fill in the assistant side of the last turn
        history[-1] = (history[-1][0], response)
        # Return None to clear the multimodal input box (text and files)
        return None, history

    # Event handlers
    msg.submit(
        user_submit,
        inputs=[msg, chatbot, enable_thinking],
        outputs=[msg, chatbot]
    )
    submit.click(
        user_submit,
        inputs=[msg, chatbot, enable_thinking],
        outputs=[msg, chatbot]
    )
    clear.click(
        lambda: (None, []),
        outputs=[msg, chatbot]
    )

if __name__ == "__main__":
    demo.launch(share=True)
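
# Assumed local usage (not part of the source): with a CUDA GPU available,
# install the dependencies and run this file directly:
#   pip install gradio torch transformers pillow spaces
#   python app.py   # filename assumed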