tc-mb
committed on
Commit
·
c13aabe
1
Parent(s):
3acc1cf
Initial commit: MiniCPM-V-4_5 model
Browse files
README.md
CHANGED
@@ -132,6 +132,111 @@ When traveling to a karst landscape like this, here are some important tips:
|
|
132 |
By following these guidelines, you'll have a safe and enjoyable trip while appreciating the stunning natural beauty of places such as Guilin’s karst mountains.
|
133 |
```
|
134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
## License
|
137 |
#### Model License
|
|
|
132 |
By following these guidelines, you'll have a safe and enjoyable trip while appreciating the stunning natural beauty of places such as Guilin’s karst mountains.
|
133 |
```
|
134 |
|
135 |
+
</details>
|
136 |
+
|
137 |
+
#### Chat with Video
|
138 |
+
<details>
|
139 |
+
<summary> Click to view Python code running MiniCPM-V-4_5 with video input and 3D-Resampler. </summary>
|
140 |
+
|
141 |
+
```python
|
142 |
+
# The 3D-Resampler compresses multiple frames into 64 tokens by introducing temporal_ids.
|
143 |
+
# To achieve this, you need to organize your video data into two corresponding sequences:
|
144 |
+
# frames: List[Image]
|
145 |
+
# temporal_ids: List[List[int]].
|
146 |
+
|
147 |
+
import torch
|
148 |
+
from PIL import Image
|
149 |
+
from transformers import AutoModel, AutoTokenizer
|
150 |
+
from decord import VideoReader, cpu # pip install decord
|
151 |
+
from scipy.spatial import cKDTree
|
152 |
+
import numpy as np
|
153 |
+
import math
|
154 |
+
|
155 |
+
# Load the model with an optimized attention kernel and bf16 weights, then
# switch to inference mode on the GPU.
model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-V-4_5',  # or openbmb/MiniCPM-o-2_6
    trust_remote_code=True,
    attn_implementation='sdpa',  # sdpa or flash_attention_2, no eager
    torch_dtype=torch.bfloat16,
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
    'openbmb/MiniCPM-V-4_5',  # or openbmb/MiniCPM-o-2_6
    trust_remote_code=True,
)
|
159 |
+
|
160 |
+
# Maximum number of frames received after the video frames are packed. The
# actual maximum number of valid frames is MAX_NUM_FRAMES * MAX_NUM_PACKING.
MAX_NUM_FRAMES = 180
# Maximum packing number of video frames. Valid range: 1-6.
MAX_NUM_PACKING = 3
# Step of the time grid that frame timestamps are snapped to in encode_video.
TIME_SCALE = 0.1
|
163 |
+
|
164 |
+
def map_to_nearest_scale(values, scale):
    """Snap each entry of *values* to the closest entry of *scale*.

    Args:
        values: 1-D sequence of numbers to snap.
        scale: 1-D, non-empty sequence of candidate grid values.

    Returns:
        A NumPy array with one element per input value, each taken from
        *scale*.

    Raises:
        ValueError: if *scale* is empty (there is nothing to snap to).
    """
    grid = np.asarray(scale)
    if grid.size == 0:
        raise ValueError('scale must contain at least one value')
    # cKDTree works on 2-D point sets, so treat each scalar as a 1-D point.
    tree = cKDTree(grid[:, None])
    _, indices = tree.query(np.asarray(values)[:, None])
    return grid[indices]
|
168 |
+
|
169 |
+
|
170 |
+
def group_array(arr, size):
    """Split *arr* into consecutive chunks of at most *size* items.

    Args:
        arr: any sliceable sequence (list, NumPy array, ...).
        size: chunk length; must be a positive integer.

    Returns:
        A list of slices of *arr*; the last one may be shorter than *size*.

    Raises:
        ValueError: if *size* is smaller than 1.
    """
    if size < 1:
        # A negative step would silently produce no groups at all.
        raise ValueError(f'size must be >= 1, got {size}')
    return [arr[start:start + size] for start in range(0, len(arr), size)]
|
172 |
+
|
173 |
+
def encode_video(video_path, choose_fps=3, force_packing=None):
    """Sample frames from a video and compute grouped temporal ids for them.

    The number of sampled frames is derived from the video duration and
    *choose_fps*, capped at MAX_NUM_FRAMES * MAX_NUM_PACKING. Each sampled
    frame's timestamp is snapped to a grid with step TIME_SCALE and expressed
    as an integer grid index; these ids are then chunked into groups of
    ``packing_nums`` consecutive frames.

    Args:
        video_path: path of the video, passed to decord's VideoReader.
        choose_fps: desired sampling rate in frames per second.
        force_packing: if truthy, overrides the automatically chosen packing
            number (limited to the range 1..MAX_NUM_PACKING).

    Returns:
        A tuple ``(frames, frame_ts_id_group)`` where ``frames`` is a list of
        RGB PIL images and ``frame_ts_id_group`` is a list of int32 arrays of
        temporal ids, one array per packing group.

    Raises:
        ValueError: if the video has no frames or reports a non-positive fps.
    """
    def uniform_sample(seq, n):
        # Take n items located at the centres of n equal-width bins over seq.
        gap = len(seq) / n
        return [seq[int(i * gap + gap / 2)] for i in range(n)]

    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
    fps = vr.get_avg_fps()
    if total_frames == 0 or fps <= 0:
        raise ValueError(
            f'cannot sample frames from {video_path!r}: '
            f'frames={total_frames}, fps={fps}'
        )
    video_duration = total_frames / fps

    if choose_fps * int(video_duration) <= MAX_NUM_FRAMES:
        # Short enough to fit without packing.
        packing_nums = 1
        choose_frames = round(min(choose_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
    else:
        packing_nums = math.ceil(video_duration * choose_fps / MAX_NUM_FRAMES)
        if packing_nums <= MAX_NUM_PACKING:
            choose_frames = round(video_duration * choose_fps)
        else:
            # Too long even at maximum packing: cap the frame budget.
            choose_frames = round(MAX_NUM_FRAMES * MAX_NUM_PACKING)
            packing_nums = MAX_NUM_PACKING

    # Very short clips can round down to zero frames, which would make
    # uniform_sample divide by zero; always keep at least one frame.
    choose_frames = max(1, choose_frames)

    frame_idx = np.array(uniform_sample(range(total_frames), choose_frames))

    if force_packing:
        # Keep the forced value inside the supported range.
        packing_nums = max(1, min(force_packing, MAX_NUM_PACKING))

    print(video_path, ' duration:', video_duration)
    print(f'get video frames={len(frame_idx)}, packing_nums={packing_nums}')

    frames = vr.get_batch(frame_idx).asnumpy()

    # Timestamp (seconds) of every sampled frame, snapped to the time grid.
    frame_idx_ts = frame_idx / fps
    scale = np.arange(0, video_duration, TIME_SCALE)

    frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / TIME_SCALE
    # Round before the integer cast: the float division can land a hair
    # below the intended grid index, and a plain cast would truncate it.
    frame_ts_id = np.rint(frame_ts_id).astype(np.int32)

    assert len(frames) == len(frame_ts_id)

    frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
    frame_ts_id_group = group_array(frame_ts_id, packing_nums)

    return frames, frame_ts_id_group
|
217 |
+
|
218 |
+
|
219 |
+
video_path = "video_test.mp4"
fps = 5  # sampling rate requested from the video (passed as choose_fps)
# Set force_packing to a number to forcibly enable 3D packing with that
# packing quantity; when it is None, encode_video dynamically sets the
# packing quantity based on the duration.
force_packing = None
frames, frame_ts_id_group = encode_video(video_path, fps, force_packing=force_packing)

question = "Describe the video"
msgs = [
    {'role': 'user', 'content': frames + [question]},
]

answer = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    use_image_id=False,
    max_slice_nums=1,
    temporal_ids=frame_ts_id_group,
)
print(answer)
|
238 |
+
```
|
239 |
+
</details>
|
240 |
|
241 |
## License
|
242 |
#### Model License
|