orrzxz committed on
Commit 159c520 · verified · 1 Parent(s): 8811c0b

Create app.py

Files changed (1)
  app.py +328 -0
app.py ADDED
@@ -0,0 +1,328 @@
import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import numpy as np
import os
from decord import VideoReader, cpu
from scipy.spatial import cKDTree
import math
import warnings

warnings.filterwarnings("ignore")

# Global variables for the model and tokenizer, loaded once and reused
model = None
tokenizer = None
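
# NOTE (editor's sketch, not part of the commit): the imports above imply the
# following dependencies; versions are an assumption, nothing is pinned here.
# device_map="auto" in load_model() additionally requires `accelerate`.
#
#   pip install gradio torch transformers accelerate pillow numpy decord scipy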
def load_model():
    """Load the MiniCPM-V-4.5 model and tokenizer (on first use only)."""
    global model, tokenizer

    if model is None:
        print("Loading MiniCPM-V-4.5 model...")
        model = AutoModel.from_pretrained(
            'openbmb/MiniCPM-V-4_5',
            trust_remote_code=True,
            attn_implementation='sdpa',
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
        model = model.eval()

        tokenizer = AutoTokenizer.from_pretrained(
            'openbmb/MiniCPM-V-4_5',
            trust_remote_code=True
        )
        print("Model loaded successfully!")

    return model, tokenizer
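
# NOTE (editor's sketch, an assumption not in the commit): on GPUs without
# bfloat16 support, a float16 fallback keeps load_model() usable, e.g.:
#
#   dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
#   model = AutoModel.from_pretrained('openbmb/MiniCPM-V-4_5', torch_dtype=dtype, ...)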
def map_to_nearest_scale(values, scale):
    """Map values to the nearest point on `scale` (used for temporal IDs)."""
    tree = cKDTree(np.asarray(scale)[:, None])
    _, indices = tree.query(np.asarray(values)[:, None])
    return np.asarray(scale)[indices]

def group_array(arr, size):
    """Group array into chunks of the specified size."""
    return [arr[i:i + size] for i in range(0, len(arr), size)]

def uniform_sample(l, n):
    """Uniformly sample n items from list l."""
    gap = len(l) / n
    idxs = [int(i * gap + gap / 2) for i in range(n)]
    return [l[i] for i in idxs]
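
# NOTE (editor's sketch, not part of the commit): how the three helpers combine
# for a hypothetical 2 s clip at 30 fps, sampling 4 frames on a 0.1 s grid.
def _example_temporal_ids():
    idxs = uniform_sample(list(range(60)), 4)   # -> [7, 22, 37, 52]
    ts = np.array(idxs) / 30.0                  # frame timestamps in seconds
    # Snap timestamps onto the 0.1 s grid, then express them as integer IDs
    ids = (map_to_nearest_scale(ts, np.arange(0, 2, 0.1)) / 0.1).astype(np.int32)
    return group_array(ids, 2)                  # two groups of two IDs each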
def encode_video(video_path, choose_fps=3, max_frames=180, max_packing=3, time_scale=0.1):
    """Encode video frames with temporal IDs for the model."""
    vr = VideoReader(video_path, ctx=cpu(0))
    fps = vr.get_avg_fps()
    video_duration = len(vr) / fps

    if choose_fps * int(video_duration) <= max_frames:
        packing_nums = 1
        choose_frames = round(min(choose_fps, round(fps)) * min(max_frames, video_duration))
    else:
        packing_nums = math.ceil(video_duration * choose_fps / max_frames)
        if packing_nums <= max_packing:
            choose_frames = round(video_duration * choose_fps)
        else:
            choose_frames = round(max_frames * max_packing)
            packing_nums = max_packing

    frame_idx = list(range(len(vr)))
    frame_idx = np.array(uniform_sample(frame_idx, choose_frames))

    print(f'Video duration: {video_duration:.2f}s, frames: {len(frame_idx)}, packing: {packing_nums}')

    frames = vr.get_batch(frame_idx).asnumpy()
    frame_idx_ts = frame_idx / fps
    scale = np.arange(0, video_duration, time_scale)
    frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / time_scale
    frame_ts_id = frame_ts_id.astype(np.int32)

    frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
    frame_ts_id_group = group_array(frame_ts_id, packing_nums)

    return frames, frame_ts_id_group
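
# NOTE (editor's sketch, not part of the commit): a hypothetical 30 s clip at
# 24 fps with choose_fps=3 stays under max_frames (3 * 30 = 90 <= 180), so
# packing_nums is 1 and encode_video returns ~90 frames with one ID per group.
def _example_encode_video(path="sample.mp4"):   # hypothetical file path
    frames, ts_groups = encode_video(path, choose_fps=3)
    print(len(frames), len(ts_groups))          # equal when packing_nums == 1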
def process_input(
    file_input,
    user_prompt,
    system_prompt,
    fps,
    context_size,
    temperature,
    enable_thinking
):
    """Process user input and generate a response."""
    try:
        # Load the model if it is not already loaded
        model, tokenizer = load_model()

        if file_input is None:
            return "Please upload an image or video file."

        if not user_prompt or not user_prompt.strip():
            return "Please enter a question or prompt."

        # Determine whether the input is an image or a video
        file_path = file_input.name if hasattr(file_input, 'name') else file_input
        file_ext = os.path.splitext(file_path)[1].lower()

        is_video = file_ext in ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.m4v']

        # Prepare messages
        msgs = []

        # Add the system prompt if provided
        if system_prompt and system_prompt.strip():
            msgs.append({'role': 'system', 'content': system_prompt.strip()})

        if is_video:
            # Process video: sampled frames plus the text prompt in one user turn
            frames, frame_ts_id_group = encode_video(file_path, choose_fps=fps)
            msgs.append({'role': 'user', 'content': frames + [user_prompt]})

            # Generate a response for the video
            answer = model.chat(
                msgs=msgs,
                tokenizer=tokenizer,
                use_image_id=False,
                max_slice_nums=1,
                temporal_ids=frame_ts_id_group,
                enable_thinking=enable_thinking,
                max_new_tokens=context_size,
                temperature=temperature
            )
        else:
            # Process image
            image = Image.open(file_path).convert('RGB')
            msgs.append({'role': 'user', 'content': [image, user_prompt]})

            # Generate a response for the image
            answer = model.chat(
                msgs=msgs,
                tokenizer=tokenizer,
                enable_thinking=enable_thinking,
                max_new_tokens=context_size,
                temperature=temperature
            )

        return answer

    except Exception as e:
        return f"Error processing input: {str(e)}"
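
# NOTE (editor's sketch, not part of the commit): the handler can be exercised
# headlessly; "cat.jpg" is a hypothetical local file, and a plain filepath works
# because the hasattr(file_input, 'name') check falls through to the string.
def _example_process_image(path="cat.jpg"):
    return process_input(
        file_input=path,
        user_prompt="What objects do you see?",
        system_prompt="",
        fps=5,              # ignored for still images
        context_size=512,
        temperature=0.7,
        enable_thinking=False,
    )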
def create_interface():
    """Create and configure the Gradio interface."""

    with gr.Blocks(title="MiniCPM-V-4.5 Multimodal Chat", theme=gr.themes.Soft()) as iface:
        gr.Markdown("""
        # 🚀 MiniCPM-V-4.5 Multimodal Chat

        A powerful 8B-parameter multimodal model that understands images and videos with GPT-4V-level performance.

        **Features:**
        - 📸 Single/multi-image understanding
        - 🎥 High-refresh-rate video understanding (up to 10 FPS)
        - 📄 Strong OCR and document parsing
        - 🧠 Controllable fast/deep thinking mode
        - 🌍 Multilingual support (30+ languages)
        """)

        with gr.Row():
            with gr.Column(scale=1):
                # File input
                file_input = gr.File(
                    label="Upload Image or Video",
                    file_types=["image", "video"],
                    type="filepath"
                )

                # Video FPS setting (capped at 10 to match the model's stated
                # high-refresh-rate limit)
                fps_slider = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Video FPS",
                    info="Frames per second for video processing (only applies to videos)"
                )

                # Context size
                context_size = gr.Slider(
                    minimum=512,
                    maximum=4096,
                    value=2048,
                    step=256,
                    label="Max Output Tokens",
                    info="Maximum number of tokens to generate"
                )

                # Temperature
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Controls randomness in generation"
                )

                # Thinking mode
                enable_thinking = gr.Checkbox(
                    label="Enable Deep Thinking",
                    value=False,
                    info="Enable deep thinking mode for complex problem solving"
                )

            with gr.Column(scale=2):
                # System prompt
                system_prompt = gr.Textbox(
                    label="System Prompt (Optional)",
                    placeholder="Enter system instructions here...",
                    lines=3,
                    info="Set the behavior and context for the model"
                )

                # User prompt
                user_prompt = gr.Textbox(
                    label="Your Question",
                    placeholder="Describe what you see in the image/video, or ask a specific question...",
                    lines=4
                )

                # Submit button
                submit_btn = gr.Button("🚀 Generate Response", variant="primary", size="lg")

                # Output
                output = gr.Textbox(
                    label="Model Response",
                    lines=15,
                    max_lines=25,
                    show_copy_button=True
                )

        # Examples
        gr.Markdown("## 💡 Example Prompts")
        gr.Examples(
            examples=[
                ["What objects do you see in this image?"],
                ["Describe the scene in detail."],
                ["What is the main action happening in this video?"],
                ["Read and transcribe any text visible in the image."],
                ["What emotions or mood does this image convey?"],
                ["Analyze the composition and visual elements."],
                ["What might happen next in this sequence?"]
            ],
            inputs=[user_prompt],
            label="Click any example to use it"
        )

        # Event handlers
        submit_btn.click(
            fn=process_input,
            inputs=[
                file_input,
                user_prompt,
                system_prompt,
                fps_slider,
                context_size,
                temperature,
                enable_thinking
            ],
            outputs=output,
            show_progress="full"
        )

        # Also allow Enter-key submission
        user_prompt.submit(
            fn=process_input,
            inputs=[
                file_input,
                user_prompt,
                system_prompt,
                fps_slider,
                context_size,
                temperature,
                enable_thinking
            ],
            outputs=output,
            show_progress="full"
        )

        # Information section
        with gr.Accordion("📋 Model Information", open=False):
            gr.Markdown("""
            ### MiniCPM-V-4.5 Specifications

            - **Parameters**: 8B (Qwen3-8B + SigLIP2-400M)
            - **Video Compression**: 96x compression rate (6 frames → 64 tokens)
            - **Max Resolution**: Up to 1.8M pixels (1344×1344)
            - **Languages**: 30+ languages supported
            - **Performance**: Surpasses GPT-4o-latest on multiple benchmarks

            ### Usage Tips

            1. **For Images**: Upload any supported image format and ask about content, objects, text, or analysis
            2. **For Videos**: Adjust FPS to the content (higher FPS for fast action, lower for static scenes)
            3. **System Prompt**: Use it to set a role, e.g. "You are an expert art critic" or "Analyze this from a medical perspective"
            4. **Deep Thinking**: Enable for complex reasoning, analysis, or problem-solving tasks
            5. **Temperature**: Lower (0.1-0.3) for factual responses, higher (0.7-1.0) for creative outputs

            ### Supported Formats
            - **Images**: JPG, PNG, JPEG, BMP, GIF, WEBP
            - **Videos**: MP4, AVI, MOV, MKV, WEBM, M4V
            """)

    return iface
if __name__ == "__main__":
    # Create and launch the interface
    demo = create_interface()
    demo.queue(max_size=20)
    demo.launch(
        share=True,  # ignored on Hugging Face Spaces; only affects local runs
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )
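
To try the app locally, run `python app.py` and open http://localhost:7860. The first request takes noticeably longer than later ones because load_model() loads MiniCPM-V-4.5 lazily on first use.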