import os
import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
# Notes:
# - This demo runs on CPU for broader compatibility. It may be slow compared to GPU.
# - If you have a GPU, you can set device="cuda" and possibly use torch_dtype=torch.bfloat16.
# - MiniCPM-V-4_5 uses trust_remote_code; ensure you trust the source.
# - The model expects multi-modal messages in a chat-like format: [{'role': 'user', 'content': [image, text]}]
# - For multi-turn chat, we persist history in Gradio state and pass it back to model.chat.
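# Example of the msgs structure this app builds and passes to model.chat (illustrative values):
#   msgs = [
#       {'role': 'user', 'content': [Image.open('example.jpg'), 'What is in this picture?']},
#       {'role': 'assistant', 'content': ['A short description of the picture ...']},
#       {'role': 'user', 'content': ['And what color is it?']},
#   ]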
MODEL_ID = os.environ.get("MINICPM_MODEL_ID", "openbmb/MiniCPM-V-4_5")
DEVICE = "cpu" # Force CPU per user request
DTYPE = torch.float32 # CPU-friendly dtype
# Lazy global variables (loaded on first launch)
_tokenizer = None
_model = None
def load_model():
    global _tokenizer, _model
    if _model is None or _tokenizer is None:
        # Some platforms require setting no_mmap or local_files_only as needed; adjust if necessary.
        _model = AutoModel.from_pretrained(
            MODEL_ID,
            trust_remote_code=True,
            attn_implementation="sdpa",  # sdpa is fine on CPU; avoid eager per model note
            torch_dtype=DTYPE
        )
        _model = _model.eval().to(DEVICE)
        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    return _model, _tokenizer
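# Illustrative GPU variant (an assumption, not used by this CPU demo): on a CUDA machine
# the model could instead be loaded roughly like this:
#   model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True,
#                                     attn_implementation="sdpa",
#                                     torch_dtype=torch.bfloat16).eval().cuda()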
def format_history(history):
"""
Convert Gradio-style chat history into model's expected message format.
history: list of tuples (user_text, assistant_text) where user_text may have an <image> placeholder handled separately.
We will store messages in a structured way in state to retain images explicitly instead of parsing text.
This function is not used directly; we keep the raw message structure in state for fidelity.
"""
return history
def predict(image, user_message, history_state, enable_thinking=False, stream=False):
"""
image: PIL.Image or None
user_message: str
history_state: list of dicts in MiniCPM format [{'role': 'user'|'assistant', 'content':[...]}]
"""
model, tokenizer = load_model()
# Initialize history if empty
msgs = history_state if isinstance(history_state, list) else []
# Build the current user content payload
# The model expects a list mixing image(s) and text; include only provided items.
content = []
if image is not None:
if image.mode != "RGB":
image = image.convert("RGB")
content.append(image)
if user_message and user_message.strip():
content.append(user_message.strip())
if len(content) == 0:
return gr.update(), msgs, "Please provide an image and/or a message."
msgs = msgs + [{'role': 'user', 'content': content}]
# Run generation
try:
# model.chat returns either an iterator (when stream=True) or a string
answer = model.chat(
msgs=msgs,
tokenizer=tokenizer,
enable_thinking=bool(enable_thinking),
stream=bool(stream)
)
if stream:
# Concatenate streamed text
generated = []
for chunk in answer:
generated.append(chunk)
yield "\n".join(["".join(generated)]), msgs, None
final_text = "".join(generated)
else:
final_text = answer
# Append assistant message back into msgs
msgs = msgs + [{"role": "assistant", "content": [final_text]}]
# Return final
yield final_text, msgs, None
except Exception as e:
yield gr.update(), msgs, f"Error: {e}"
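# Illustrative, non-Gradio use of predict() (assumes a local file "example.jpg" exists):
#   for text, msgs, err in predict(Image.open("example.jpg"), "Describe the image", []):
#       print(text if err is None else err)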
def clear_state():
    return None, [], None
with gr.Blocks(title="MiniCPM-V-4_5 CPU Gradio Demo") as demo:
gr.Markdown("# MiniCPM-V-4_5 (CPU) Demo")
gr.Markdown("Upload an image (optional) and ask a question. Multi-turn chat is supported. Running on CPU may be slow.")
with gr.Row():
with gr.Column(scale=1):
image_in = gr.Image(type="pil", label="Image (optional)")
user_in = gr.Textbox(label="Your Message", placeholder="Ask a question about the image or general query...", lines=3)
with gr.Row():
think_chk = gr.Checkbox(label="Enable Thinking Mode", value=False)
stream_chk = gr.Checkbox(label="Stream Output", value=False)
with gr.Row():
submit_btn = gr.Button("Send", variant="primary")
clear_btn = gr.Button("Clear")
with gr.Column(scale=2):
chat_out = gr.Chatbot(label="Chat", type="messages", height=450, avatar_images=(None, None))
status_box = gr.Markdown("", visible=True)
# Hidden state: we store the raw MiniCPM messages, not just text pairs
state_msgs = gr.State([])
    def on_submit(image, message, enable_thinking, stream, msgs):
        # Kick off generation and, on every yield, convert the raw MiniCPM msgs into
        # the (user, assistant) tuple pairs that the Chatbot displays.
        def format_for_chatbot(msgs_local):
            chat_pairs = []
            # Collect pairs by scanning msgs in order
            user_tmp = None
            for m in msgs_local:
                if m["role"] == "user":
                    # Convert content to a displayable string for the Chatbot
                    parts = []
                    for c in m["content"]:
                        if isinstance(c, Image.Image):
                            parts.append("[Image]")
                        else:
                            parts.append(str(c))
                    user_tmp = " ".join(parts).strip() or "[Image]"
                elif m["role"] == "assistant":
                    assistant_text = " ".join([str(x) for x in m["content"]]) if m["content"] else ""
                    chat_pairs.append((user_tmp, assistant_text))
                    user_tmp = None
            # Flush a dangling user turn (no assistant reply yet) so it stays visible while streaming
            if user_tmp is not None:
                chat_pairs.append((user_tmp, ""))
            return chat_pairs
        gen = predict(image, message, msgs, enable_thinking, stream)
        if stream:
            for partial_text, updated_msgs, err in gen:
                # Build the display history from updated_msgs plus the current partial response
                chat_history = format_for_chatbot(updated_msgs)
                if chat_history and isinstance(partial_text, str) and partial_text:
                    if not chat_history[-1][1]:
                        # Fill the pending assistant slot with the streamed text so far
                        u, _ = chat_history[-1]
                        chat_history[-1] = (u, partial_text)
                    # Otherwise this is the final yield and the assistant message is already
                    # present in updated_msgs; do nothing so it is not duplicated.
                status = "" if not err else f"{err}"
                yield chat_history, updated_msgs, status, gr.update(value=None), gr.update(value=None)
        else:
            for _final_text, updated_msgs, err in gen:
                # Build the chat history from updated_msgs using the same helper
                chat_history = format_for_chatbot(updated_msgs)
                status = "" if not err else f"{err}"
                yield chat_history, updated_msgs, status, gr.update(value=None), gr.update(value=None)
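    # Illustrative example (assumed values) of the tuple pairs on_submit yields to the Chatbot:
    #   [("[Image] What is in this picture?", "The photo shows ..."),
    #    ("And what color is it?", "It appears to be red.")]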
    submit_btn.click(
        on_submit,
        inputs=[image_in, user_in, think_chk, stream_chk, state_msgs],
        outputs=[chat_out, state_msgs, status_box, user_in, image_in]
    )
    clear_btn.click(
        fn=clear_state,
        inputs=[],
        outputs=[user_in, state_msgs, status_box]
    ).then(
        lambda: [],
        inputs=None,
        outputs=chat_out
    )
    # Preload model on app start (optional; keeps UI responsive on first query)
    demo.load(lambda: "Model loading on CPU... Please wait a moment.", outputs=status_box).then(
        lambda: (load_model() or True) and "Model loaded. Ready!",
        outputs=status_box
    )
if __name__ == "__main__":
    # Set server_name="0.0.0.0" to expose externally if needed.
    # Note: concurrency_count was removed from queue() in Gradio 4; default_concurrency_limit
    # plays the same role there.
    demo.queue(max_size=8, default_concurrency_limit=1).launch()
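    # Typical local usage: run `python app.py` and open the printed URL
    # (by default http://127.0.0.1:7860) in a browser.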