import os
import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
MODEL_ID = os.environ.get("MINICPM_MODEL_ID", "openbmb/MiniCPM-V-4_5")
# Best practice: set a deterministic seed for reproducibility
torch.manual_seed(100)
def load_model(precision_mode="int4"):
"""
Load MiniCPM-V-4_5 model on CPU with chosen precision.
- precision_mode: "int4" (default) quantized or "fp16" half precision emulation.
Note: True FP16 tensors are not supported on CPU; we use bfloat16 or float32 fallback.
"""
kwargs = dict(trust_remote_code=True, attn_implementation="sdpa")
if precision_mode == "int4":
        # bitsandbytes-style 4-bit quantization is not reliably available on CPU-only machines,
        # but MiniCPM's trust_remote_code path may accept load_in_4bit, so we try it here and
        # fall back to bfloat16 below if it is unsupported.
try:
model = AutoModel.from_pretrained(
MODEL_ID,
load_in_4bit=True,
device_map="cpu",
**kwargs,
)
dtype_used = "int4"
except Exception:
            # Fallback: 4-bit loading is unsupported in this environment, so load the weights
            # in bfloat16 instead (CPU-friendly; fp32 math is used where bf16 kernels are missing).
            model = AutoModel.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.bfloat16,
                device_map="cpu",
                **kwargs,
            )
            dtype_used = "bf16_fallback"
    else:
        # "fp16" requested: native fp16 is poorly supported on CPU, so load bfloat16 instead.
        # CPUs with bf16 acceleration (e.g. AVX-512 BF16 / AMX) benefit directly; older CPUs
        # still run the model, falling back to fp32 math internally.
        model = AutoModel.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="cpu",
            **kwargs,
        )
        dtype_used = "bf16_on_cpu_for_fp16_request"
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
return model, tokenizer, dtype_used
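
# Optional sanity check (a sketch added here, not part of the original app): the dtype_used
# strings above record intent only, so this helper reports what the loaded weights actually
# are. The function name is illustrative, not a Transformers API.
def summarize_param_dtypes(model):
    """Return a {dtype: parameter count} summary for the loaded model."""
    from collections import Counter
    counts = Counter()
    for p in model.parameters():
        counts[str(p.dtype)] += p.numel()
    return dict(counts)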
# Global cache to avoid reloading each time
_state = {"model": None, "tokenizer": None, "mode": None, "dtype_used": None}
def ensure_model(mode):
if _state["model"] is None or _state["mode"] != mode:
_state["model"], _state["tokenizer"], _state["dtype_used"] = load_model(mode)
_state["mode"] = mode
def chat_infer(image: Image.Image, message: str, history, mode: str, enable_thinking: bool):
    # First turn with neither an image nor a text message: nothing to infer on yet.
    if image is None and not (message and message.strip()) and not history:
        return history or [], "Please upload an image or enter a message."
ensure_model(mode)
model, tokenizer = _state["model"], _state["tokenizer"]
    # Rebuild the msgs list expected by MiniCPM's chat() from the Gradio history
    # (a list of (user_text, assistant_text) pairs) plus the current user turn.
    msgs = []
for user_msg, assistant_msg in history or []:
if user_msg:
# history may not contain images; only text
msgs.append({"role": "user", "content": [user_msg]})
if assistant_msg:
msgs.append({"role": "assistant", "content": [assistant_msg]})
# Add current user turn
user_content = []
if image is not None:
# Ensure RGB
if image.mode != "RGB":
image = image.convert("RGB")
user_content.append(image)
if message and message.strip():
user_content.append(message.strip())
if not user_content:
return history or [], "Please provide text or image."
msgs.append({"role": "user", "content": user_content})
try:
answer = model.chat(
msgs=msgs,
tokenizer=tokenizer,
enable_thinking=enable_thinking,
)
except Exception as e:
return history or [], f"Inference error: {e}"
# Update history for Gradio chat UI: append the latest pair
history = (history or []) + [(message or "[Image]", answer)]
sys_info = f"Mode: {mode} | Loaded dtype: {_state['dtype_used']} | Device: CPU"
return history, sys_info
def clear_history():
return [], ""
with gr.Blocks(title="MiniCPM-V-4_5 CPU (int4 default, fp16 optional)", fill_height=True) as demo:
gr.Markdown("# MiniCPM-V-4_5 CPU Deployment\n- Modes: int4 (default) and fp16\n- Running on CPU")
with gr.Row():
with gr.Column(scale=2):
chatbox = gr.Chatbot(height=420, label="Chat")
with gr.Row():
img = gr.Image(type="pil", label="Image (optional)")
msg = gr.Textbox(placeholder="Ask a question about the image or general query...", lines=3)
with gr.Row():
send_btn = gr.Button("Send", variant="primary")
clear_btn = gr.Button("Clear")
with gr.Column(scale=1):
mode = gr.Radio(
choices=["int4", "fp16"],
value="int4",
label="Precision Mode (CPU)",
info="int4 as default. fp16 uses bf16/fp32 on CPU."
)
thinking = gr.Checkbox(label="Enable Thinking Mode", value=False)
sys_out = gr.Markdown("")
def on_send(message, image, history, mode, thinking):
return chat_infer(image, message, history, mode, thinking)
send_btn.click(
fn=on_send,
inputs=[msg, img, chatbox, mode, thinking],
outputs=[chatbox, sys_out],
show_progress=True,
)
# Submit on Enter
msg.submit(
fn=on_send,
inputs=[msg, img, chatbox, mode, thinking],
outputs=[chatbox, sys_out],
show_progress=True,
)
clear_btn.click(fn=clear_history, outputs=[chatbox, sys_out])
if __name__ == "__main__":
# For CPU environments with many threads, you may limit to reduce contention:
torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "4")))
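    # Optional (an assumption, not in the original script): also cap inter-op parallelism so
    # PyTorch's thread pools and Gradio's worker threads don't oversubscribe the same cores.
    # Tune TORCH_NUM_INTEROP_THREADS per machine or remove this line entirely.
    torch.set_num_interop_threads(int(os.environ.get("TORCH_NUM_INTEROP_THREADS", "1")))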
demo.launch()