import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import spaces
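
# `spaces` is the Hugging Face Spaces ZeroGPU helper; functions decorated
# with @spaces.GPU below are allocated a GPU on demand when deployed there.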

# Seed for reproducible sampling
torch.manual_seed(100)

# Initialize model and tokenizer
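# trust_remote_code=True pulls MiniCPM-V's custom modeling and chat code
# from the Hub repo; attn_implementation='sdpa' selects PyTorch's fused
# scaled-dot-product attention kernel.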

model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-V-4_5', 
    trust_remote_code=True,
    attn_implementation='sdpa', 
    torch_dtype=torch.bfloat16
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
    'openbmb/MiniCPM-V-4_5', 
    trust_remote_code=True
)

@spaces.GPU(duration=120)
def respond(message, history, enable_thinking):
    """
    Process user message and generate response
    """
    # Build conversation history in the format expected by the model
    msgs = []
    
    # Add previous conversation history
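    # `history` arrives in Gradio "tuples" format: a list of
    # (user_message, assistant_message) pairs; a user turn that included
    # an image appears as a (file_path, text) tuple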
    for h in history:
        user_msg = h[0]
        assistant_msg = h[1]
        
        # Parse user message for images and text. Gradio can round-trip
        # tuples through the frontend as lists, so accept both.
        user_content = []
        if isinstance(user_msg, (tuple, list)):
            # User message contains an image path and optional caption
            img_path, text = user_msg
            img = Image.open(img_path).convert('RGB')
            user_content = [img, text] if text else [img]
        else:
            # Text only message
            user_content = [user_msg]
        
        msgs.append({"role": "user", "content": user_content})
        if assistant_msg:
            msgs.append({"role": "assistant", "content": [assistant_msg]})
    
    # Add current message
    current_content = []
    if isinstance(message, dict):
        # Handle multimodal input
        if message.get("files"):
            for file_path in message["files"]:
                img = Image.open(file_path).convert('RGB')
                current_content.append(img)
        if message.get("text"):
            current_content.append(message["text"])
    else:
        # Handle text-only input
        current_content = [message]
    
    msgs.append({"role": "user", "content": current_content})
    
    # Generate response
    try:
        answer = model.chat(
            msgs=msgs,
            tokenizer=tokenizer,
            enable_thinking=enable_thinking
        )
        return answer
    except Exception as e:
        return f"Error: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="MiniCPM-V Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 MiniCPM-V Multimodal Chatbot
        
        Upload images and ask questions about them, or have a text conversation.
        The model supports multi-turn conversations with context memory.
        """
    )
    
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                height=500,
                show_label=False,
                container=True,
                type="tuples"  # (user, assistant) pair format, matching respond()
            )
            
            with gr.Row():
                msg = gr.MultimodalTextbox(
                    interactive=True,
                    file_types=["image"],
                    placeholder="Type a message or upload an image...",
                    show_label=False,
                    container=False
                )
            
            with gr.Row():
                clear = gr.Button("🗑️ Clear", size="sm")
                submit = gr.Button("📤 Send", variant="primary", size="sm")
        
        with gr.Column(scale=1):
            gr.Markdown("### Settings")
            enable_thinking = gr.Checkbox(
                label="Enable Thinking Mode",
                value=False,
                info="Enable the model's thinking process"
            )
            
            gr.Markdown(
                """
                ### Examples
                - Upload an image and ask "What is in this picture?"
                - Ask "What are the main objects visible?"
                - Follow up with "What should I pay attention to here?"
                """
            )
    
    # Handle message submission
    def user_submit(message, history, enable_thinking):
        # Format the user message for display
        if isinstance(message, dict) and message.get("files"):
            # If there are files, create tuple format for chatbot display;
            # only the first image is echoed in the chat window, though
            # respond() receives every uploaded file
            user_msg = (message["files"][0], message.get("text", ""))
        else:
            user_msg = message.get("text", "") if isinstance(message, dict) else message
        
        # Add user message to history
        history = history + [(user_msg, None)]
        
        # Generate response
        response = respond(message, history[:-1], enable_thinking)
        
        # Update history with response
        history[-1] = (history[-1][0], response)
        
        return "", history
    
    # Event handlers
    msg.submit(
        user_submit,
        inputs=[msg, chatbot, enable_thinking],
        outputs=[msg, chatbot]
    )
    
    submit.click(
        user_submit,
        inputs=[msg, chatbot, enable_thinking],
        outputs=[msg, chatbot]
    )
    
    clear.click(
        lambda: (None, []),
        outputs=[msg, chatbot]
    )

if __name__ == "__main__":
    # share=True is not supported on Hugging Face Spaces, so launch directly
    demo.launch()