tc-mb committed on
Commit
c13aabe
·
1 Parent(s): 3acc1cf

Initial commit: MiniCPM-V-4_5 model

Browse files
Files changed (1) hide show
  1. README.md +105 -0
README.md CHANGED
@@ -132,6 +132,111 @@ When traveling to a karst landscape like this, here are some important tips:
132
  By following these guidelines, you'll have a safe and enjoyable trip while appreciating the stunning natural beauty of places such as Guilin’s karst mountains.
133
  ```
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
  ## License
137
  #### Model License
 
132
  By following these guidelines, you'll have a safe and enjoyable trip while appreciating the stunning natural beauty of places such as Guilin’s karst mountains.
133
  ```
134
 
135
+ </details>
136
+
137
+ #### Chat with Video
138
+ <details>
139
+ <summary> Click to view Python code running MiniCPM-V-4_5 with video input and 3D-Resampler. </summary>
140
+
141
+ ```python
142
+ # The 3D-Resampler compresses multiple frames into 64 tokens by introducing temporal_ids.
143
+ # To achieve this, you need to organize your video data into two corresponding sequences:
144
+ # frames: List[Image]
145
+ # temporal_ids: List[List[Int]].
146
+
147
+ import torch
148
+ from PIL import Image
149
+ from transformers import AutoModel, AutoTokenizer
150
+ from decord import VideoReader, cpu # pip install decord
151
+ from scipy.spatial import cKDTree
152
+ import numpy as np
153
+ import math
154
+
155
# Load the checkpoint in bfloat16, switch it to inference mode and move it to the GPU.
model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-V-4_5',  # or openbmb/MiniCPM-o-2_6
    trust_remote_code=True,
    attn_implementation='sdpa',  # sdpa or flash_attention_2, no eager
    torch_dtype=torch.bfloat16,
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
    'openbmb/MiniCPM-V-4_5',  # or openbmb/MiniCPM-o-2_6
    trust_remote_code=True,
)

# Maximum number of frames received after the video is packed. The actual maximum
# number of valid (sampled) frames is MAX_NUM_FRAMES * MAX_NUM_PACKING.
MAX_NUM_FRAMES = 180
# Maximum number of video frames packed together. Valid range: 1-6.
MAX_NUM_PACKING = 3
# Step of the time grid that frame timestamps (frame index / fps) are snapped to
# when temporal ids are computed in encode_video.
TIME_SCALE = 0.1
163
+
164
def map_to_nearest_scale(values, scale):
    """Snap every entry of ``values`` to the closest entry of ``scale``.

    Args:
        values: 1-D sequence of numbers to snap.
        scale: 1-D sequence of candidate grid values; must not be empty.

    Returns:
        A NumPy array with one element per entry of ``values``, each taken
        from ``scale``.

    Raises:
        ValueError: if ``scale`` is empty, since there is nothing to snap to.
    """
    # Convert once and reuse for both the tree and the final lookup.
    scale_arr = np.asarray(scale)
    if scale_arr.size == 0:
        raise ValueError('scale must contain at least one value')

    # cKDTree works on 2-D point sets, so the 1-D inputs become column vectors.
    tree = cKDTree(scale_arr[:, None])
    _, indices = tree.query(np.asarray(values)[:, None])
    return scale_arr[indices]
168
+
169
+
170
def group_array(arr, size):
    """Split ``arr`` into consecutive chunks of at most ``size`` items.

    Args:
        arr: any sliceable sequence with a length (list, NumPy array, ...).
        size: chunk length; must be at least 1.

    Returns:
        A list of slices of ``arr``. The last chunk is shorter when
        ``len(arr)`` is not a multiple of ``size``; an empty ``arr`` gives ``[]``.

    Raises:
        ValueError: if ``size`` is smaller than 1. A negative step would
            otherwise silently produce no chunks and drop every element.
    """
    if size < 1:
        raise ValueError(f'size must be >= 1, got {size}')
    return [arr[start:start + size] for start in range(0, len(arr), size)]
172
+
173
def encode_video(video_path, choose_fps=3, force_packing=None):
    """Sample frames from a video and compute a temporal id for each of them.

    Frames are sampled uniformly over the whole clip. The number of sampled
    frames and the packing number (how many consecutive frames share one
    temporal-id group) are derived from the clip duration, ``choose_fps``,
    ``MAX_NUM_FRAMES`` and ``MAX_NUM_PACKING``.

    Args:
        video_path: path of the video file, opened with decord's VideoReader.
        choose_fps: target sampling rate in frames per second.
        force_packing: if truthy, overrides the computed packing number; the
            value is clamped to the range ``1..MAX_NUM_PACKING``.

    Returns:
        A tuple ``(frames, frame_ts_id_group)``: ``frames`` is a list of RGB
        PIL images, and ``frame_ts_id_group`` is the list of per-frame
        temporal ids split into consecutive groups of ``packing_nums`` ids.

    Raises:
        ValueError: if the video has no frames or reports a non-positive fps.
    """
    def uniform_sample(l, n):
        # Take n evenly spaced items, each from the middle of its segment.
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]

    vr = VideoReader(video_path, ctx=cpu(0))
    fps = vr.get_avg_fps()
    total_frames = len(vr)
    # Fail with a clear message instead of a ZeroDivisionError further down.
    if total_frames == 0 or fps <= 0:
        raise ValueError(
            f'cannot sample frames from {video_path!r}: frames={total_frames}, fps={fps}'
        )
    video_duration = total_frames / fps

    if choose_fps * int(video_duration) <= MAX_NUM_FRAMES:
        # Short clip: no packing needed, and never sample faster than the source fps.
        packing_nums = 1
        choose_frames = round(min(choose_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
    else:
        packing_nums = math.ceil(video_duration * choose_fps / MAX_NUM_FRAMES)
        if packing_nums <= MAX_NUM_PACKING:
            choose_frames = round(video_duration * choose_fps)
        else:
            # Too long even at maximum packing: cap the total frame budget.
            choose_frames = round(MAX_NUM_FRAMES * MAX_NUM_PACKING)
            packing_nums = MAX_NUM_PACKING

    # Very short clips can round down to zero frames, which would make
    # uniform_sample divide by zero; always keep at least one frame.
    choose_frames = max(1, choose_frames)
    frame_idx = np.array(uniform_sample(range(total_frames), choose_frames))

    if force_packing:
        # Clamp to at least 1 so a negative value cannot yield empty groups.
        packing_nums = max(1, min(force_packing, MAX_NUM_PACKING))

    print(video_path, ' duration:', video_duration)
    print(f'get video frames={len(frame_idx)}, packing_nums={packing_nums}')

    frames = vr.get_batch(frame_idx).asnumpy()

    # Timestamp of every sampled frame, snapped to a grid with step TIME_SCALE.
    frame_idx_ts = frame_idx / fps
    scale = np.arange(0, video_duration, TIME_SCALE)

    frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / TIME_SCALE
    # Round before casting: the division is inexact in floating point, and plain
    # truncation could turn a quotient just below an integer into the previous id.
    frame_ts_id = np.rint(frame_ts_id).astype(np.int32)

    assert len(frames) == len(frame_ts_id)

    frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
    frame_ts_id_group = group_array(frame_ts_id, packing_nums)

    return frames, frame_ts_id_group
217
+
218
+
219
video_path = "video_test.mp4"
fps = 5  # sampling rate, passed to encode_video as choose_fps
# Set force_packing to a number to force 3D packing; when left as None,
# encode_video picks the packing number dynamically from the video duration.
force_packing = None
frames, frame_ts_id_group = encode_video(video_path, fps, force_packing=force_packing)

# A single user turn: all sampled frames followed by the text question.
question = "Describe the video"
msgs = [{'role': 'user', 'content': frames + [question]}]

answer = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    use_image_id=False,
    max_slice_nums=1,
    temporal_ids=frame_ts_id_group,
)
print(answer)
238
+ ```
239
+ </details>
240
 
241
  ## License
242
  #### Model License