Spaces:

CGQN
/

MiniCPM-V-4_5-from_gpt5

Running

App Files Files Community

CGQN committed 5 days ago

Commit

e7b1930

verified ·

1 Parent(s): e4be608

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +156 -0

app.py ADDED Viewed

	@@ -0,0 +1,156 @@

+import os
+import gradio as gr
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoTokenizer
+MODEL_ID = os.environ.get("MINICPM_MODEL_ID", "openbmb/MiniCPM-V-4_5")  # model repo id; the MINICPM_MODEL_ID environment variable overrides the default
+# Best practice: set a deterministic seed for reproducibility (runs once, at import time)
+torch.manual_seed(100)
+def load_model(precision_mode="int4"):
+    """
+    Load MiniCPM-V-4_5 model on CPU with chosen precision.
+    - precision_mode: "int4" (default) quantized or "fp16" half precision emulation.
+    Note: True FP16 tensors are not supported on CPU; we use bfloat16 or float32 fallback.
+    """
+    kwargs = dict(trust_remote_code=True, attn_implementation="sdpa")
+    if precision_mode == "int4":
+        # BitsAndBytes is not available for CPU only in Transformers' AutoModel consistently across archs,
+        # but MiniCPM provides CPU-friendly quantization via trust_remote_code. We'll pass load_in_4bit if supported.
+        try:
+            model = AutoModel.from_pretrained(
+                MODEL_ID,
+                load_in_4bit=True,
+                device_map="cpu",
+                **kwargs,
+            )
+            dtype_used = "int4"
+        except Exception:
+            # Fallback: load in 8-bit or bf16 if 4-bit isn't supported in environment
+            model = AutoModel.from_pretrained(
+                MODEL_ID,
+                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+                device_map="cpu",
+                **kwargs,
+            )
+            dtype_used = "fallback_bf16_or_fp32"
+    else:
+        # "fp16" requested: CPU cannot run native fp16; we emulate with bfloat16 if available, otherwise float32
+        # Many Intel/AMD CPUs support bfloat16 acceleration; if not, it will still run in fp32 math.
+        model = AutoModel.from_pretrained(
+            MODEL_ID,
+            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+            device_map="cpu",
+            **kwargs,
+        )
+        dtype_used = "bf16_or_fp32_on_cpu_for_fp16_request"
+    model = model.eval()
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+    return model, tokenizer, dtype_used
+# Global cache to avoid reloading each time; ensure_model() fills it and reloads only when the requested mode changes
+_state = {"model": None, "tokenizer": None, "mode": None, "dtype_used": None}  # "mode" = precision mode last loaded; "dtype_used" = label returned by load_model()
+def ensure_model(mode):
+    if _state["model"] is None or _state["mode"] != mode:
+        _state["model"], _state["tokenizer"], _state["dtype_used"] = load_model(mode)
+        _state["mode"] = mode
+def chat_infer(image: Image.Image, message: str, history, mode: str, enable_thinking: bool):
+    if image is None and (not history or all((h[0] or "") == "" and (h[1] or "") == "" for h in history)):
+        return history or [], "Please upload an image or enter a message."
+    ensure_model(mode)
+    model, tokenizer = _state["model"], _state["tokenizer"]
+    # Build msgs from history and current inputs
+    msgs = []
+    # Convert history into msgs
+    # Each item in history is (user, assistant)
+    for user_msg, assistant_msg in history or []:
+        if user_msg:
+            # history may not contain images; only text
+            msgs.append({"role": "user", "content": [user_msg]})
+        if assistant_msg:
+            msgs.append({"role": "assistant", "content": [assistant_msg]})
+    # Add current user turn
+    user_content = []
+    if image is not None:
+        # Ensure RGB
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        user_content.append(image)
+    if message and message.strip():
+        user_content.append(message.strip())
+    if not user_content:
+        return history or [], "Please provide text or image."
+    msgs.append({"role": "user", "content": user_content})
+    try:
+        answer = model.chat(
+            msgs=msgs,
+            tokenizer=tokenizer,
+            enable_thinking=enable_thinking,
+        )
+    except Exception as e:
+        return history or [], f"Inference error: {e}"
+    # Update history for Gradio chat UI: append the latest pair
+    history = (history or []) + [(message or "[Image]", answer)]
+    sys_info = f"Mode: {mode} | Loaded dtype: {_state['dtype_used']} | Device: CPU"
+    return history, sys_info
+def clear_history():
+    return [], ""
+with gr.Blocks(title="MiniCPM-V-4_5 CPU (int4 default, fp16 optional)", fill_height=True) as demo:
+    gr.Markdown("# MiniCPM-V-4_5 CPU Deployment\n- Modes: int4 (default) and fp16\n- Running on CPU")
+    with gr.Row():
+        with gr.Column(scale=2):
+            chatbox = gr.Chatbot(height=420, label="Chat")
+            with gr.Row():
+                img = gr.Image(type="pil", label="Image (optional)")
+            msg = gr.Textbox(placeholder="Ask a question about the image or general query...", lines=3)
+            with gr.Row():
+                send_btn = gr.Button("Send", variant="primary")
+                clear_btn = gr.Button("Clear")
+        with gr.Column(scale=1):
+            mode = gr.Radio(
+                choices=["int4", "fp16"],
+                value="int4",
+                label="Precision Mode (CPU)",
+                info="int4 as default. fp16 uses bf16/fp32 on CPU."
+            )
+            thinking = gr.Checkbox(label="Enable Thinking Mode", value=False)
+            sys_out = gr.Markdown("")
+    def on_send(message, image, history, mode, thinking):
+        return chat_infer(image, message, history, mode, thinking)
+    send_btn.click(
+        fn=on_send,
+        inputs=[msg, img, chatbox, mode, thinking],
+        outputs=[chatbox, sys_out],
+        show_progress=True,
+    )
+    # Submit on Enter
+    msg.submit(
+        fn=on_send,
+        inputs=[msg, img, chatbox, mode, thinking],
+        outputs=[chatbox, sys_out],
+        show_progress=True,
+    )
+    clear_btn.click(fn=clear_history, outputs=[chatbox, sys_out])
+if __name__ == "__main__":
+    # For CPU environments with many threads, you may limit to reduce contention:
+    torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "4")))
+    demo.launch()