AnasKAN committed
Commit 05f57ab · 1 Parent(s): fa81297

Initial commit: Add Gradio app + cleaned demo videos (via Git LFS) + preset cache + requirements

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.avi filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.mov filter=lfs diff=lfs merge=lfs -text
+*.mkv filter=lfs diff=lfs merge=lfs -text
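These `.gitattributes` lines are what `git lfs track` emits. A minimal sketch (an assumption, not part of this commit; requires `git-lfs` installed and initialized in the repo) of generating them from Python rather than editing the file by hand:

```python
# Hypothetical helper: register the demo video extensions with Git LFS
# so the clips are committed as pointer files instead of raw binaries.
import subprocess

for pattern in ("*.avi", "*.mp4", "*.mov", "*.mkv"):
    # `git lfs track <pattern>` appends the matching filter line to .gitattributes
    subprocess.run(["git", "lfs", "track", pattern], check=True)
```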
.gitignore ADDED
@@ -0,0 +1,6 @@
+# ignore raw video dumps by default
+assets/videos_raw/
+# common temp dirs
+tmp/
+.cache/
+__pycache__/
app.py ADDED
@@ -0,0 +1,374 @@
+
+import os, time, json, hashlib, glob
+import cv2, numpy as np
+import torch, gradio as gr
+from transformers import AutoProcessor, AutoModelForImageTextToText
+
+# ----------------------------------------------------------------------------------
+# Config
+# ----------------------------------------------------------------------------------
+MODEL_PATH = "AnasKAN/SmolVLM2-500M-Video-Instruct-video-feedback"
+PROCESSOR_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+
+# Directories (resolved to absolute paths so path checks work from any CWD on HF Spaces)
+REPO_ROOT = os.path.dirname(os.path.abspath(__file__))
+PRESET_CACHE_DIR = os.path.join(REPO_ROOT, "preset_cache")
+PRELOADED_CLEAN_DIR = os.path.join(REPO_ROOT, "assets", "videos_clean")
+RUNTIME_CACHE_DIR = "/tmp/moraqeb_cache"
+RUNTIME_CLEAN_DIR = "/tmp/clean_videos"
+
+os.makedirs(RUNTIME_CACHE_DIR, exist_ok=True)
+os.makedirs(RUNTIME_CLEAN_DIR, exist_ok=True)
+
+# Device / dtype
+has_cuda = torch.cuda.is_available()
+device = torch.device("cuda" if has_cuda else "cpu")
+dtype = torch.bfloat16 if has_cuda else torch.float32
+
+# Video cleaning config (used only for uploaded videos)
+target_size = (640, 360)  # (w, h)
+target_frames = 128
+output_fps = 24
+fourcc = cv2.VideoWriter_fourcc(*"XVID")
+
+def sample_indices(n_frames_src, n_target):
+    if n_frames_src <= 0: return []
+    idxs = np.linspace(0, n_frames_src - 1, min(n_target, n_frames_src), dtype=int)
+    return idxs.tolist()
+
+def clean_single_video(video_path, save_dir=RUNTIME_CLEAN_DIR, overwrite=True):
+    os.makedirs(save_dir, exist_ok=True)
+    base = os.path.splitext(os.path.basename(video_path))[0]
+    dst_path = os.path.join(save_dir, f"{base}_clean.avi")
+    if os.path.exists(dst_path) and not overwrite:
+        print(f"[SKIP] {dst_path} exists")
+        return dst_path
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        print(f"[ERROR] Cannot open {video_path}")
+        return None
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    idxs = sample_indices(total_frames, target_frames)
+    frames, grabbed, cur = [], 0, 0
+    next_targets = set(idxs)
+    while True:
+        ret, frame = cap.read()
+        if not ret: break
+        if cur in next_targets:
+            frame = cv2.resize(frame, target_size, interpolation=cv2.INTER_AREA)
+            frames.append(frame); grabbed += 1
+            if grabbed == len(idxs): break
+        cur += 1
+    cap.release()
+    if not frames:
+        print(f"[ERROR] No frames from {video_path}")
+        return None
+    while len(frames) < target_frames:
+        frames.append(frames[-1])
+    out = cv2.VideoWriter(dst_path, fourcc, output_fps, target_size)
+    for f in frames: out.write(f)
+    out.release()
+    print(f"[OK] Cleaned {video_path} -> {dst_path} ({len(frames)} frames @ {output_fps} FPS)")
+    return dst_path
+
+# ----------------------------------------------------------------------------------
+# Caching helpers
+# ----------------------------------------------------------------------------------
+def _sha1_file(path, chunk=1024*1024):
+    h = hashlib.sha1()
+    with open(path, "rb") as f:
+        while True:
+            b = f.read(chunk)
+            if not b: break
+            h.update(b)
+    return h.hexdigest()
+
+def _make_cache_key(cleaned_path, question, gen_kwargs):
+    vid_hash = _sha1_file(cleaned_path)
+    payload = {
+        "model": MODEL_PATH,
+        "video_hash": vid_hash,
+        "question": (question or "").strip(),
+        "gen": {
+            "do_sample": bool(gen_kwargs.get("do_sample", False)),
+            "max_new_tokens": int(gen_kwargs.get("max_new_tokens", 128)),
+            "temperature": float(gen_kwargs.get("temperature", 0.7)) if gen_kwargs.get("do_sample") else None,
+            "top_p": float(gen_kwargs.get("top_p", 0.9)) if gen_kwargs.get("do_sample") else None,
+        },
+    }
+    raw = json.dumps(payload, sort_keys=True).encode("utf-8")
+    return hashlib.sha1(raw).hexdigest()
+
+def _cache_path(dir_, key): return os.path.join(dir_, f"{key}.json")
+
+def preset_cache_get(key):
+    p = _cache_path(PRESET_CACHE_DIR, key)
+    if os.path.exists(p):
+        try:
+            with open(p, "r", encoding="utf-8") as f:
+                print("[PRESET] HIT", os.path.basename(p))
+                return json.load(f)
+        except Exception as e:
+            print(f"[PRESET] Failed to read {p}: {e}")
+    return None
+
+def runtime_cache_get(key):
+    p = _cache_path(RUNTIME_CACHE_DIR, key)
+    if os.path.exists(p):
+        try:
+            with open(p, "r", encoding="utf-8") as f:
+                print("[CACHE] HIT", os.path.basename(p))
+                return json.load(f)
+        except Exception as e:
+            print(f"[CACHE] Failed to read {p}: {e}")
+    return None
+
+def runtime_cache_put(key, data):
+    p = _cache_path(RUNTIME_CACHE_DIR, key)
+    try:
+        with open(p, "w", encoding="utf-8") as f:
+            json.dump(data, f, ensure_ascii=False)
+        return True
+    except Exception as e:
+        print(f"[CACHE] Failed to write {p}: {e}")
+        return False
+
+def cache_clear():
+    n = 0
+    for fp in glob.glob(os.path.join(RUNTIME_CACHE_DIR, "*.json")):
+        try: os.remove(fp); n += 1
+        except OSError: pass  # file may already be gone; don't count it
+    return n
+
+# ----------------------------------------------------------------------------------
+# Lazy pipeline (loaded only on cache miss)
+# ----------------------------------------------------------------------------------
+_model = None
+_processor = None
+
+def load_pipeline():
+    global _model, _processor
+    if _model is not None and _processor is not None:
+        print("[INFO] Pipeline already loaded")
+        return _processor, _model
+    print(f"[INFO] Loading processor: {PROCESSOR_ID}")
+    _processor = AutoProcessor.from_pretrained(PROCESSOR_ID)
+    print(f"[INFO] Loading model: {MODEL_PATH}")
+    _model = AutoModelForImageTextToText.from_pretrained(
+        MODEL_PATH,
+        torch_dtype=dtype,
+        _attn_implementation="eager",
+        low_cpu_mem_usage=True,
+    ).to(device).eval()
+    print("[INFO] Pipeline ready ✅")
+    return _processor, _model
+
+def build_messages(video_path: str, question: str):
+    q = (question or "").strip() or "Caption the video."
+    return [{
+        "role": "user",
+        "content": [
+            {"type": "video", "path": video_path},
+            {"type": "text", "text": q},
+        ],
+    }]
+
+# ----------------------------------------------------------------------------------
+# Inference handlers
+# ----------------------------------------------------------------------------------
+def _maybe_clean_uploaded(video_path):
+    if video_path and os.path.commonpath([os.path.abspath(video_path), PRELOADED_CLEAN_DIR]) == PRELOADED_CLEAN_DIR:
+        print("[INFO] Using preloaded cleaned video:", video_path)
+        return video_path
+    print(f"[INFO] Cleaning uploaded video {video_path} ...")
+    return clean_single_video(video_path, save_dir=RUNTIME_CLEAN_DIR, overwrite=True)
+
+def _infer_core(cleaned_path, question, gen_kwargs, use_cache=True):
+    key = _make_cache_key(cleaned_path, question, gen_kwargs)
+
+    if use_cache:
+        preset = preset_cache_get(key)
+        if preset:
+            return preset["answer"], preset.get("latency_s", 0.0), preset.get("tokens_per_s", 0.0), preset.get("new_tokens", 0)
+        cached = runtime_cache_get(key)
+        if cached:
+            return cached["answer"], cached["latency_s"], cached["tokens_per_s"], cached["new_tokens"]
+
+    processor, model = load_pipeline()
+    print("[INFO] Tokenizing ...")
+    messages = build_messages(cleaned_path, question)
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+        video_load_backend="decord",
+    ).to(device, dtype=dtype)
+
+    print(f"[INFO] Generating with {gen_kwargs}")
+    start = time.time()
+    with torch.inference_mode():
+        generated_ids = model.generate(**inputs, **gen_kwargs)
+    elapsed = time.time() - start
+    print(f"[INFO] Generation took {elapsed:.2f}s")
+
+    raw_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    out_text = raw_text.split("Assistant:", 1)[1].strip() if "Assistant:" in raw_text else raw_text.strip()
+
+    new_tokens = int(generated_ids.shape[-1] - inputs["input_ids"].shape[-1])
+    tps = (new_tokens / elapsed) if elapsed > 0 else 0.0
+    print(f"[INFO] Decoded {new_tokens} tokens @ {tps:.2f} tok/s")
+
+    if use_cache:
+        runtime_cache_put(key, {
+            "answer": out_text,
+            "latency_s": round(elapsed, 3),
+            "tokens_per_s": round(tps, 2),
+            "new_tokens": new_tokens,
+        })
+    return out_text, round(elapsed, 3), round(tps, 2), new_tokens
+
+# 1) For uploaded videos
+def infer_upload(video, question, max_new_tokens, do_sample, temperature, top_p, use_cache):
+    if video is None:
+        return "Please upload a video.", 0.0, 0.0, 0
+    cleaned_path = _maybe_clean_uploaded(video)
+    if cleaned_path is None:
+        return "Video cleaning failed.", 0.0, 0.0, 0
+    gen_kwargs = dict(do_sample=bool(do_sample), max_new_tokens=int(max_new_tokens))
+    if do_sample:
+        gen_kwargs.update(temperature=float(temperature), top_p=float(top_p))
+    return _infer_core(cleaned_path, question, gen_kwargs, use_cache=bool(use_cache))
+
+# 2) For preloaded cleaned videos (no cleaning)
+def infer_preloaded(preloaded_name, question, max_new_tokens, do_sample, temperature, top_p, use_cache):
+    if not preloaded_name:
+        return "Pick a preloaded video.", 0.0, 0.0, 0
+    cleaned_path = os.path.join(PRELOADED_CLEAN_DIR, preloaded_name)
+    if not os.path.exists(cleaned_path):
+        return f"Video not found: {cleaned_path}", 0.0, 0.0, 0
+    gen_kwargs = dict(do_sample=bool(do_sample), max_new_tokens=int(max_new_tokens))
+    if do_sample:
+        gen_kwargs.update(temperature=float(temperature), top_p=float(top_p))
+    return _infer_core(cleaned_path, question, gen_kwargs, use_cache=bool(use_cache))
+
+def _ui_clear_cache():
+    removed = cache_clear()
+    return gr.update(value=f"Cleared {removed} runtime cached items.")
+
+# ----------------------------------------------------------------------------------
+# UI helpers
+# ----------------------------------------------------------------------------------
+def _list_preloaded_clean():
+    if not os.path.isdir(PRELOADED_CLEAN_DIR):
+        return []
+    exts = (".avi", ".mp4", ".mov", ".mkv")
+    return [f for f in sorted(os.listdir(PRELOADED_CLEAN_DIR)) if f.lower().endswith(exts)]
+
+def _resolve_preloaded_path(name):
+    if not name: return None
+    path = os.path.join(PRELOADED_CLEAN_DIR, name)
+    return path if os.path.exists(path) else None
+
+# ----------------------------------------------------------------------------------
+# UI
+# ----------------------------------------------------------------------------------
+with gr.Blocks(fill_height=True, theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🧪 Moraqeb — Distilled VLM Demo")
+    gr.Markdown(
+        "Choose a **preloaded cleaned video** (fast, uses preset cache if available) or **upload your own**. "
+        "On a cache hit, the app returns instantly **without loading the model**."
+    )
+
+    with gr.Tabs():
+        # --- Preloaded cleaned videos tab ---
+        with gr.Tab("Preloaded (recommended)"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    preloaded = gr.Dropdown(
+                        choices=_list_preloaded_clean(),
+                        label="Pick a cleaned video (assets/videos_clean/)",
+                        interactive=True
+                    )
+                    # 👇 Video preview that updates when you pick a file
+                    preview = gr.Video(label="Preview", autoplay=False)
+
+                    # When dropdown changes, update the video preview source to the actual file path
+                    preloaded.change(
+                        fn=_resolve_preloaded_path,
+                        inputs=preloaded,
+                        outputs=preview,
+                    )
+                with gr.Column(scale=1):
+                    question_pre = gr.Textbox(
+                        label="Question",
+                        value="Can you describe the entire video in detail from start to finish?",
+                        lines=3,
+                    )
+                    max_new_tokens_pre = gr.Slider(8, 512, value=128, step=8, label="Max new tokens")
+                    do_sample_pre = gr.Checkbox(value=False, label="do_sample (enable sampling)")
+                    temperature_pre = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="temperature")
+                    top_p_pre = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
+                    use_cache_pre = gr.Checkbox(value=True, label="Use cache")
+                    run_btn_pre = gr.Button("Run on Preloaded Video", variant="primary")
+
+            with gr.Row():
+                answer_pre = gr.Textbox(label="Answer", lines=8)
+            with gr.Row():
+                latency_pre = gr.Number(label="Latency (s)")
+                tps_pre = gr.Number(label="Speed (tokens/sec)")
+                ntoks_pre = gr.Number(label="New tokens")
+
+        # --- Upload tab ---
+        with gr.Tab("Upload your video"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    video = gr.Video(label="Upload video", sources=["upload"], autoplay=False)
+                with gr.Column(scale=1):
+                    question = gr.Textbox(
+                        label="Question",
+                        value="Can you describe the entire video in detail from start to finish?",
+                        lines=3,
+                    )
+                    max_new_tokens = gr.Slider(8, 512, value=128, step=8, label="Max new tokens")
+                    do_sample = gr.Checkbox(value=False, label="do_sample (enable sampling)")
+                    temperature = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="temperature")
+                    top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
+                    use_cache = gr.Checkbox(value=True, label="Use cache")
+                    run_btn = gr.Button("Run Inference", variant="primary")
+
+            with gr.Row():
+                answer = gr.Textbox(label="Answer", lines=8)
+            with gr.Row():
+                latency = gr.Number(label="Latency (s)")
+                tps = gr.Number(label="Speed (tokens/sec)")
+                ntoks = gr.Number(label="New tokens")
+
+    # Cache controls
+    with gr.Row():
+        clear_cache_btn = gr.Button("Clear RUNTIME Cache")
+        clear_cache_status = gr.Textbox(label="Cache status", interactive=False)
+
+    # Wiring
+    run_btn_pre.click(
+        fn=infer_preloaded,
+        inputs=[preloaded, question_pre, max_new_tokens_pre, do_sample_pre, temperature_pre, top_p_pre, use_cache_pre],
+        outputs=[answer_pre, latency_pre, tps_pre, ntoks_pre],
+        api_name="infer_preloaded",
+        queue=True,
+    )
+
+    run_btn.click(
+        fn=infer_upload,
+        inputs=[video, question, max_new_tokens, do_sample, temperature, top_p, use_cache],
+        outputs=[answer, latency, tps, ntoks],
+        api_name="infer_upload",
+        queue=True,
+    )

+    clear_cache_btn.click(_ui_clear_cache, inputs=[], outputs=[clear_cache_status])
+    gr.Markdown(f"Hardware: **{'CUDA' if has_cuda else 'CPU'}**, dtype **{dtype}**")
+
+if __name__ == "__main__":
+    demo.launch(server_port=7860, share=True)
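Because both run buttons are exposed with `api_name`, the app can also be driven programmatically. A minimal sketch using `gradio_client` (the URL is a placeholder matching `demo.launch(server_port=7860)`; the positional inputs mirror the `infer_preloaded` signature above):

```python
# Hypothetical client script, not part of this commit.
from gradio_client import Client

client = Client("http://localhost:7860")  # or the deployed Space URL

answer, latency_s, tokens_per_s, new_tokens = client.predict(
    "D001_01_clean.avi",  # a file from assets/videos_clean/
    "Can you describe the entire video in detail from start to finish?",
    128,    # max_new_tokens
    False,  # do_sample
    0.7,    # temperature (ignored when do_sample is False)
    0.9,    # top_p (ignored when do_sample is False)
    True,   # use_cache
    api_name="/infer_preloaded",
)
print(answer)
```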
assets/videos_clean/.gitkeep ADDED
File without changes
assets/videos_clean/D001_01_clean.avi ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:576701c174d369c5068846afec3206ac0400d56a01881663ee2f2427744c9200
+size 1290430
assets/videos_clean/D001_01_clean.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8404e646b70d5f7fecd0731a297aabbbe569c245ae830fe2a16bc7397e53ce7b
+size 391808
assets/videos_clean/D001_02_clean.avi ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ab1c1f9e5f35842bffb0e2be74a21090ab63a1c5a2b7d8dc963851333338c1f
+size 1568782
assets/videos_clean/D001_02_clean.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b6303b268ec2694c641e2ccd3582dd41844aeaba5c0cf90c12ef37731ab2749
+size 449352
assets/videos_clean/D001_03_clean.avi ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e899475ce01c39141dbe7677220061072c6d79ca4efa1afafff6b215586f5b2
+size 1835904
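The entries above are Git LFS pointer files, not the videos themselves; the binaries are fetched on checkout. A small sketch (assuming the pointer text is what is on disk, i.e. before LFS replaces it with the actual video) for sanity-checking that a clip was committed as a pointer:

```python
# Hypothetical check, not part of this commit: parse a Git LFS pointer file.
def read_lfs_pointer(path):
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields  # expected keys: version, oid, size

ptr = read_lfs_pointer("assets/videos_clean/D001_01_clean.avi")
assert ptr["version"] == "https://git-lfs.github.com/spec/v1"
print(ptr["oid"], ptr["size"])  # e.g. sha256:5767... 1290430
```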
preset_cache/.gitkeep ADDED
File without changes
preset_cache/13f046ecf2c58fc9a18344662b0a8cc0af902402.json ADDED
@@ -0,0 +1 @@
+{"answer": "The video begins with a view of a gated area, likely a parking lot or a restricted zone, featuring a black metal gate with yellow and white striped pillars. In the foreground, there is a red and black motorbike parked near the gate. A person in a white shirt and dark pants is standing near the gate, facing away from the camera. The background shows a busy street with various vehicles, including a green bus and a white car, and a few pedestrians walking on the sidewalk. The scene remains static, with no significant changes in the environment or the actions of the individuals. The video then transitions to a similar view of", "latency_s": 5.581, "tokens_per_s": 22.93, "new_tokens": 128}
preset_cache/278b7615758af40ae3ba459bf15661daac74e5e4.json ADDED
@@ -0,0 +1 @@
+{"answer": "The video begins with a view of a gated area, likely a parking lot or a restricted zone, featuring a black metal gate with decorative elements. In the foreground, a red and black motorbike is parked on the right side, and a person in a white shirt and dark pants is standing near the gate, possibly waiting or observing. The background shows a busy street with various vehicles, including a white car, a green bus, and a bus with a blue roof. A person in a white shirt and dark pants walks from the right side of the frame towards the center, passing by the motorbike. The scene then shifts", "latency_s": 5.837, "tokens_per_s": 21.93, "new_tokens": 128}
preset_cache/a06907c817bf41ccf1add0d9d72c245a63f64947.json ADDED
@@ -0,0 +1 @@
+{"answer": "The video begins with a view of a gated area, likely a parking lot or a restricted zone, featuring a black metal gate with yellow and white stripes. In the foreground, there is a red and black motorbike parked on a concrete surface. To the left of the gate, there is a small building with a blue door and a window. Several vehicles are parked in the background, including a white car, a black SUV, and a green bus. A person in a red shirt and dark pants is seen walking towards the gate from the right side of the frame. Another individual, dressed in a dark outfit, approaches the", "latency_s": 5.746, "tokens_per_s": 22.28, "new_tokens": 128}
preset_cache/manifest.csv ADDED
@@ -0,0 +1,4 @@
+cache_file
+13f046ecf2c58fc9a18344662b0a8cc0af902402.json
+278b7615758af40ae3ba459bf15661daac74e5e4.json
+a06907c817bf41ccf1add0d9d72c245a63f64947.json
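The manifest is a one-column listing of the preset files. A minimal sketch (an assumption, not shipped in this commit) of regenerating it after adding new presets:

```python
# Hypothetical maintenance script: rebuild preset_cache/manifest.csv.
import csv, glob, os

names = sorted(os.path.basename(p) for p in glob.glob("preset_cache/*.json"))
with open("preset_cache/manifest.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["cache_file"])
    for name in names:
        writer.writerow([name])
```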
requirements.txt ADDED
@@ -0,0 +1,11 @@
+gradio>=4.40.0
+transformers==4.54.0
+accelerate
+bitsandbytes
+decord
+av
+huggingface_hub
+torch
+num2words
+opencv-python
+numpy
runtime.txt ADDED
@@ -0,0 +1 @@
+python-3.10