import os

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

MODEL_ID = os.environ.get("MINICPM_MODEL_ID", "openbmb/MiniCPM-V-4_5")

# Best practice: set a deterministic seed for reproducibility
torch.manual_seed(100)


def load_model(precision_mode="int4"):
    """
    Load MiniCPM-V-4_5 on CPU with the chosen precision.

    - precision_mode: "int4" (default, quantized) or "fp16" (half-precision emulation).
      Note: true FP16 inference is not supported on CPU; we load bfloat16 instead,
      which falls back to fp32 math on CPUs without bf16 acceleration.
    """
    kwargs = dict(trust_remote_code=True, attn_implementation="sdpa")
    if precision_mode == "int4":
        # bitsandbytes 4-bit loading is not consistently available on CPU-only setups
        # through Transformers' AutoModel, but MiniCPM provides CPU-friendly quantization
        # via trust_remote_code, so we attempt load_in_4bit and fall back if it fails.
        try:
            model = AutoModel.from_pretrained(
                MODEL_ID,
                load_in_4bit=True,
                device_map="cpu",
                **kwargs,
            )
            dtype_used = "int4"
        except Exception:
            # Fallback: load in bf16 if 4-bit isn't supported in this environment
            model = AutoModel.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.bfloat16,
                device_map="cpu",
                **kwargs,
            )
            dtype_used = "fallback_bf16"
    else:
        # "fp16" requested: CPU cannot run native fp16, so we load bfloat16 instead.
        # Many recent Intel/AMD CPUs accelerate bf16; if not, the math still runs
        # (more slowly) through fp32 kernels.
        model = AutoModel.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="cpu",
            **kwargs,
        )
        dtype_used = "bf16_on_cpu_for_fp16_request"

    model = model.eval()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    return model, tokenizer, dtype_used


# Global cache to avoid reloading the model on every request
_state = {"model": None, "tokenizer": None, "mode": None, "dtype_used": None}


def ensure_model(mode):
    if _state["model"] is None or _state["mode"] != mode:
        _state["model"], _state["tokenizer"], _state["dtype_used"] = load_model(mode)
        _state["mode"] = mode


def chat_infer(image: Image.Image, message: str, history, mode: str, enable_thinking: bool):
    # Nothing to do if this turn has neither an image nor text
    if image is None and not (message and message.strip()):
        return history or [], "Please upload an image or enter a message."

    ensure_model(mode)
    model, tokenizer = _state["model"], _state["tokenizer"]

    # Rebuild msgs from the chat history plus the current inputs.
    # Each history item is a (user, assistant) pair of plain strings,
    # so past turns carry text only (images are not persisted).
    msgs = []
    for user_msg, assistant_msg in history or []:
        if user_msg:
            msgs.append({"role": "user", "content": [user_msg]})
        if assistant_msg:
            msgs.append({"role": "assistant", "content": [assistant_msg]})

    # Add the current user turn
    user_content = []
    if image is not None:
        # Ensure RGB
        if image.mode != "RGB":
            image = image.convert("RGB")
        user_content.append(image)
    if message and message.strip():
        user_content.append(message.strip())
    if not user_content:
        return history or [], "Please provide text or an image."
    msgs.append({"role": "user", "content": user_content})

    try:
        answer = model.chat(
            msgs=msgs,
            tokenizer=tokenizer,
            enable_thinking=enable_thinking,
        )
    except Exception as e:
        return history or [], f"Inference error: {e}"

    # Append the latest (user, assistant) pair for the Gradio chat UI
    history = (history or []) + [(message or "[Image]", answer)]
    sys_info = f"Mode: {mode} | Loaded dtype: {_state['dtype_used']} | Device: CPU"
    return history, sys_info


def clear_history():
    return [], ""


with gr.Blocks(title="MiniCPM-V-4_5 CPU (int4 default, fp16 optional)", fill_height=True) as demo:
    gr.Markdown(
        "# MiniCPM-V-4_5 CPU Deployment\n"
        "- Modes: int4 (default) and fp16\n"
        "- Running on CPU"
    )
    with gr.Row():
        with gr.Column(scale=2):
            chatbox = gr.Chatbot(height=420, label="Chat")
            with gr.Row():
                img = gr.Image(type="pil", label="Image (optional)")
                msg = gr.Textbox(
                    placeholder="Ask a question about the image or a general query...",
                    lines=3,
                )
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear")
        with gr.Column(scale=1):
            mode = gr.Radio(
                choices=["int4", "fp16"],
                value="int4",
                label="Precision Mode (CPU)",
                info="int4 is the default. fp16 uses bf16/fp32 on CPU.",
            )
            thinking = gr.Checkbox(label="Enable Thinking Mode", value=False)
            sys_out = gr.Markdown("")

    def on_send(message, image, history, mode, thinking):
        return chat_infer(image, message, history, mode, thinking)

    send_btn.click(
        fn=on_send,
        inputs=[msg, img, chatbox, mode, thinking],
        outputs=[chatbox, sys_out],
        show_progress=True,
    )
    # Also submit on Enter
    msg.submit(
        fn=on_send,
        inputs=[msg, img, chatbox, mode, thinking],
        outputs=[chatbox, sys_out],
        show_progress=True,
    )
    clear_btn.click(fn=clear_history, outputs=[chatbox, sys_out])


if __name__ == "__main__":
    # On CPU hosts with many cores, cap the thread count to reduce contention
    torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "4")))
    demo.launch()
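
# Example invocation (a minimal sketch; "app.py" is a hypothetical filename for this
# script, and the host is assumed to be CPU-only). Both environment variables are
# optional: MINICPM_MODEL_ID defaults to "openbmb/MiniCPM-V-4_5" and TORCH_NUM_THREADS
# defaults to 4, matching the defaults read above.
#
#   MINICPM_MODEL_ID=openbmb/MiniCPM-V-4_5 TORCH_NUM_THREADS=8 python app.py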