import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import spaces
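
# `spaces` is the Hugging Face Spaces ZeroGPU helper; functions decorated
# with @spaces.GPU below are allocated a GPU on demand when deployed there.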

# Seed for reproducible sampling
torch.manual_seed(100)

# Initialize model and tokenizer
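# trust_remote_code=True pulls MiniCPM-V's custom modeling and chat code
# from the Hub repo; attn_implementation='sdpa' selects PyTorch's fused
# scaled-dot-product attention kernel.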

model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-V-4_5', 
    trust_remote_code=True,
    attn_implementation='sdpa', 
    torch_dtype=torch.bfloat16
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
    'openbmb/MiniCPM-V-4_5', 
    trust_remote_code=True
)

@spaces.GPU(duration=120)
def respond(message, history, enable_thinking):
    """
    Process user message and generate response
    """
    # Build conversation history in the format expected by the model
    msgs = []
    
    # Add previous conversation history
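    # `history` arrives in Gradio "tuples" format: a list of
    # (user_message, assistant_message) pairs; a user turn that included
    # an image appears as a (file_path, text) tuple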
    for h in history:
        user_msg = h[0]
        assistant_msg = h[1]
        
        # Parse user message for images and text. Gradio can round-trip
        # tuples through the frontend as lists, so accept both.
        user_content = []
        if isinstance(user_msg, (tuple, list)):
            # User message contains an image path and optional caption
            img_path, text = user_msg
            img = Image.open(img_path).convert('RGB')
            user_content = [img, text] if text else [img]
        else:
            # Text only message
            user_content = [user_msg]
        
        msgs.append({"role": "user", "content": user_content})
        if assistant_msg:
            msgs.append({"role": "assistant", "content": [assistant_msg]})
    
    # Add current message
    current_content = []
    if isinstance(message, dict):
        # Handle multimodal input
        if message.get("files"):
            for file_path in message["files"]:
                img = Image.open(file_path).convert('RGB')
                current_content.append(img)
        if message.get("text"):
            current_content.append(message["text"])
    else:
        # Handle text-only input
        current_content = [message]
    
    msgs.append({"role": "user", "content": current_content})
    
    # Generate response
    try:
        answer = model.chat(
            msgs=msgs,
            tokenizer=tokenizer,
            enable_thinking=enable_thinking
        )
        return answer
    except Exception as e:
        return f"Error: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="MiniCPM-V Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 MiniCPM-V Multimodal Chatbot
        
        Upload images and ask questions about them, or have a text conversation.
        The model supports multi-turn conversations with context memory.
        """
    )
    
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                height=500,
                show_label=False,
                container=True,
                type="tuples"  # (user, assistant) pair format, matching respond()
            )
            
            with gr.Row():
                msg = gr.MultimodalTextbox(
                    interactive=True,
                    file_types=["image"],
                    placeholder="Type a message or upload an image...",
                    show_label=False,
                    container=False
                )
            
            with gr.Row():
                clear = gr.Button("🗑️ Clear", size="sm")
                submit = gr.Button("📤 Send", variant="primary", size="sm")
        
        with gr.Column(scale=1):
            gr.Markdown("### Settings")
            enable_thinking = gr.Checkbox(
                label="Enable Thinking Mode",
                value=False,
                info="Enable the model's thinking process"
            )
            
            gr.Markdown(
                """
                ### Examples
                - Upload an image and ask "What is in this picture?"
                - Ask "What are the main objects visible?"
                - Follow up with "What should I pay attention to here?"
                """
            )
    
    # Handle message submission
    def user_submit(message, history, enable_thinking):
        # Format the user message for display
        if isinstance(message, dict) and message.get("files"):
            # If there are files, create tuple format for chatbot display;
            # only the first image is echoed in the chat window, though
            # respond() receives every uploaded file
            user_msg = (message["files"][0], message.get("text", ""))
        else:
            user_msg = message.get("text", "") if isinstance(message, dict) else message
        
        # Add user message to history
        history = history + [(user_msg, None)]
        
        # Generate response
        response = respond(message, history[:-1], enable_thinking)
        
        # Update history with response
        history[-1] = (history[-1][0], response)
        
        return "", history
    
    # Event handlers
    msg.submit(
        user_submit,
        inputs=[msg, chatbot, enable_thinking],
        outputs=[msg, chatbot]
    )
    
    submit.click(
        user_submit,
        inputs=[msg, chatbot, enable_thinking],
        outputs=[msg, chatbot]
    )
    
    clear.click(
        lambda: (None, []),
        outputs=[msg, chatbot]
    )

if __name__ == "__main__":
    # share=True is not supported on Hugging Face Spaces, so launch directly
    demo.launch()