Initial commit: Add Gradio app + cleaned demo videos (via Git LFS) + preset cache + requirements
- .gitattributes +4 -0
- .gitignore +6 -0
- app.py +374 -0
- assets/videos_clean/.gitkeep +0 -0
- assets/videos_clean/D001_01_clean.avi +3 -0
- assets/videos_clean/D001_01_clean.mp4 +3 -0
- assets/videos_clean/D001_02_clean.avi +3 -0
- assets/videos_clean/D001_02_clean.mp4 +3 -0
- assets/videos_clean/D001_03_clean.avi +3 -0
- preset_cache/.gitkeep +0 -0
- preset_cache/13f046ecf2c58fc9a18344662b0a8cc0af902402.json +1 -0
- preset_cache/278b7615758af40ae3ba459bf15661daac74e5e4.json +1 -0
- preset_cache/a06907c817bf41ccf1add0d9d72c245a63f64947.json +1 -0
- preset_cache/manifest.csv +4 -0
- requirements.txt +11 -0
- runtime.txt +1 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.avi filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.mov filter=lfs diff=lfs merge=lfs -text
+*.mkv filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,6 @@
+# ignore raw video dumps by default
+assets/videos_raw/
+# common temp dirs
+tmp/
+.cache/
+__pycache__/
app.py
ADDED
@@ -0,0 +1,374 @@
+
+import os, time, json, hashlib, glob
+import cv2, numpy as np
+import torch, gradio as gr
+from transformers import AutoProcessor, AutoModelForImageTextToText
+
+# ----------------------------------------------------------------------------------
+# Config
+# ----------------------------------------------------------------------------------
+MODEL_PATH = "AnasKAN/SmolVLM2-500M-Video-Instruct-video-feedback"
+PROCESSOR_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+
+# Directories (relative to repo root when running in HF Spaces)
+REPO_ROOT = os.path.dirname(__file__)
+PRESET_CACHE_DIR = os.path.join(REPO_ROOT, "preset_cache")
+PRELOADED_CLEAN_DIR = os.path.join(REPO_ROOT, "assets", "videos_clean")
+RUNTIME_CACHE_DIR = "/tmp/moraqeb_cache"
+RUNTIME_CLEAN_DIR = "/tmp/clean_videos"
+
+os.makedirs(RUNTIME_CACHE_DIR, exist_ok=True)
+os.makedirs(RUNTIME_CLEAN_DIR, exist_ok=True)
+
+# Device / dtype
+has_cuda = torch.cuda.is_available()
+device = torch.device("cuda" if has_cuda else "cpu")
+dtype = torch.bfloat16 if has_cuda else torch.float32
+
+# Video cleaning config (used only for uploaded videos)
+target_size = (640, 360)  # (w, h)
+target_frames = 128
+output_fps = 24
+fourcc = cv2.VideoWriter_fourcc(*"XVID")
+
+def sample_indices(n_frames_src, n_target):
+    if n_frames_src <= 0: return []
+    idxs = np.linspace(0, n_frames_src - 1, min(n_target, n_frames_src), dtype=int)
+    return idxs.tolist()
+
+def clean_single_video(video_path, save_dir=RUNTIME_CLEAN_DIR, overwrite=True):
+    os.makedirs(save_dir, exist_ok=True)
+    base = os.path.splitext(os.path.basename(video_path))[0]
+    dst_path = os.path.join(save_dir, f"{base}_clean.avi")
+    if os.path.exists(dst_path) and not overwrite:
+        print(f"[SKIP] {dst_path} exists")
+        return dst_path
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        print(f"[ERROR] Cannot open {video_path}")
+        return None
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    idxs = sample_indices(total_frames, target_frames)
+    frames, grabbed, cur = [], 0, 0
+    next_targets = set(idxs)
+    while True:
+        ret, frame = cap.read()
+        if not ret: break
+        if cur in next_targets:
+            frame = cv2.resize(frame, target_size, interpolation=cv2.INTER_AREA)
+            frames.append(frame); grabbed += 1
+            if grabbed == len(idxs): break
+        cur += 1
+    cap.release()
+    if not frames:
+        print(f"[ERROR] No frames from {video_path}")
+        return None
+    while len(frames) < target_frames:
+        frames.append(frames[-1])
+    out = cv2.VideoWriter(dst_path, fourcc, output_fps, target_size)
+    for f in frames: out.write(f)
+    out.release()
+    print(f"[OK] Cleaned {video_path} -> {dst_path} ({len(frames)} frames @ {output_fps} FPS)")
+    return dst_path
+
+# ----------------------------------------------------------------------------------
+# Caching helpers
+# ----------------------------------------------------------------------------------
+def _sha1_file(path, chunk=1024*1024):
+    h = hashlib.sha1()
+    with open(path, "rb") as f:
+        while True:
+            b = f.read(chunk)
+            if not b: break
+            h.update(b)
+    return h.hexdigest()
+
+def _make_cache_key(cleaned_path, question, gen_kwargs):
+    vid_hash = _sha1_file(cleaned_path)
+    payload = {
+        "model": MODEL_PATH,
+        "video_hash": vid_hash,
+        "question": (question or "").strip(),
+        "gen": {
+            "do_sample": bool(gen_kwargs.get("do_sample", False)),
+            "max_new_tokens": int(gen_kwargs.get("max_new_tokens", 128)),
+            "temperature": float(gen_kwargs.get("temperature", 0.7)) if gen_kwargs.get("do_sample") else None,
+            "top_p": float(gen_kwargs.get("top_p", 0.9)) if gen_kwargs.get("do_sample") else None,
+        },
+    }
+    raw = json.dumps(payload, sort_keys=True).encode("utf-8")
+    return hashlib.sha1(raw).hexdigest()
+
+def _cache_path(dir_, key): return os.path.join(dir_, f"{key}.json")
+
+def preset_cache_get(key):
+    p = _cache_path(PRESET_CACHE_DIR, key)
+    if os.path.exists(p):
+        try:
+            with open(p, "r", encoding="utf-8") as f:
+                print("[PRESET] HIT", os.path.basename(p))
+                return json.load(f)
+        except Exception as e:
+            print(f"[PRESET] Failed to read {p}: {e}")
+    return None
+
+def runtime_cache_get(key):
+    p = _cache_path(RUNTIME_CACHE_DIR, key)
+    if os.path.exists(p):
+        try:
+            with open(p, "r", encoding="utf-8") as f:
+                print("[CACHE] HIT", os.path.basename(p))
+                return json.load(f)
+        except Exception as e:
+            print(f"[CACHE] Failed to read {p}: {e}")
+    return None
+
+def runtime_cache_put(key, data):
+    p = _cache_path(RUNTIME_CACHE_DIR, key)
+    try:
+        with open(p, "w", encoding="utf-8") as f:
+            json.dump(data, f, ensure_ascii=False)
+        return True
+    except Exception as e:
+        print(f"[CACHE] Failed to write {p}: {e}")
+        return False
+
+def cache_clear():
+    n = 0
+    for fp in glob.glob(os.path.join(RUNTIME_CACHE_DIR, "*.json")):
+        try: os.remove(fp); n += 1
+        except: pass
+    return n
+
+# ----------------------------------------------------------------------------------
+# Lazy pipeline (loaded only on cache miss)
+# ----------------------------------------------------------------------------------
+_model = None
+_processor = None
+
+def load_pipeline():
+    global _model, _processor
+    if _model is not None and _processor is not None:
+        print("[INFO] Pipeline already loaded")
+        return _processor, _model
+    print(f"[INFO] Loading processor: {PROCESSOR_ID}")
+    _processor = AutoProcessor.from_pretrained(PROCESSOR_ID)
+    print(f"[INFO] Loading model: {MODEL_PATH}")
+    _model = AutoModelForImageTextToText.from_pretrained(
+        MODEL_PATH,
+        torch_dtype=dtype,
+        _attn_implementation="eager",
+        low_cpu_mem_usage=True,
+    ).to(device).eval()
+    print("[INFO] Pipeline ready ✅")
+    return _processor, _model
+
+def build_messages(video_path: str, question: str):
+    q = (question or "").strip() or "Caption the video."
+    return [{
+        "role": "user",
+        "content": [
+            {"type": "video", "path": video_path},
+            {"type": "text", "text": q},
+        ],
+    }]
+
+# ----------------------------------------------------------------------------------
+# Inference handlers
+# ----------------------------------------------------------------------------------
+def _maybe_clean_uploaded(video_path):
+    if video_path and os.path.commonpath([os.path.abspath(video_path), PRELOADED_CLEAN_DIR]) == PRELOADED_CLEAN_DIR:
+        print("[INFO] Using preloaded cleaned video:", video_path)
+        return video_path
+    print(f"[INFO] Cleaning uploaded video {video_path} ...")
+    return clean_single_video(video_path, save_dir=RUNTIME_CLEAN_DIR, overwrite=True)
+
+def _infer_core(cleaned_path, question, gen_kwargs, use_cache=True):
+    key = _make_cache_key(cleaned_path, question, gen_kwargs)
+
+    if use_cache:
+        preset = preset_cache_get(key)
+        if preset:
+            return preset["answer"], preset.get("latency_s", 0.0), preset.get("tokens_per_s", 0.0), preset.get("new_tokens", 0)
+        cached = runtime_cache_get(key)
+        if cached:
+            return cached["answer"], cached["latency_s"], cached["tokens_per_s"], cached["new_tokens"]
+
+    processor, model = load_pipeline()
+    print("[INFO] Tokenizing ...")
+    messages = build_messages(cleaned_path, question)
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+        video_load_backend="decord",
+    ).to(device, dtype=dtype)
+
+    print(f"[INFO] Generating with {gen_kwargs}")
+    start = time.time()
+    with torch.inference_mode():
+        generated_ids = model.generate(**inputs, **gen_kwargs)
+    elapsed = time.time() - start
+    print(f"[INFO] Generation took {elapsed:.2f}s")
+
+    raw_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    out_text = raw_text.split("Assistant:", 1)[1].strip() if "Assistant:" in raw_text else raw_text.strip()
+
+    new_tokens = int(generated_ids.shape[-1] - inputs["input_ids"].shape[-1])
+    tps = (new_tokens / elapsed) if elapsed > 0 else 0.0
+    print(f"[INFO] Decoded {new_tokens} tokens @ {tps:.2f} tok/s")
+
+    if use_cache:
+        runtime_cache_put(key, {
+            "answer": out_text,
+            "latency_s": round(elapsed, 3),
+            "tokens_per_s": round(tps, 2),
+            "new_tokens": new_tokens,
+        })
+    return out_text, round(elapsed, 3), round(tps, 2), new_tokens
+
+# 1) For uploaded videos
+def infer_upload(video, question, max_new_tokens, do_sample, temperature, top_p, use_cache):
+    if video is None:
+        return "Please upload a video.", 0.0, 0.0, 0
+    cleaned_path = _maybe_clean_uploaded(video)
+    if cleaned_path is None:
+        return "Video cleaning failed.", 0.0, 0.0, 0
+    gen_kwargs = dict(do_sample=bool(do_sample), max_new_tokens=int(max_new_tokens))
+    if do_sample:
+        gen_kwargs.update(temperature=float(temperature), top_p=float(top_p))
+    return _infer_core(cleaned_path, question, gen_kwargs, use_cache=bool(use_cache))
+
+# 2) For preloaded cleaned videos (no cleaning)
+def infer_preloaded(preloaded_name, question, max_new_tokens, do_sample, temperature, top_p, use_cache):
+    if not preloaded_name:
+        return "Pick a preloaded video.", 0.0, 0.0, 0
+    cleaned_path = os.path.join(PRELOADED_CLEAN_DIR, preloaded_name)
+    if not os.path.exists(cleaned_path):
+        return f"Video not found: {cleaned_path}", 0.0, 0.0, 0
+    gen_kwargs = dict(do_sample=bool(do_sample), max_new_tokens=int(max_new_tokens))
+    if do_sample:
+        gen_kwargs.update(temperature=float(temperature), top_p=float(top_p))
+    return _infer_core(cleaned_path, question, gen_kwargs, use_cache=bool(use_cache))
+
+def _ui_clear_cache():
+    removed = cache_clear()
+    return gr.update(value=f"Cleared {removed} runtime cached items.")
+
+# ----------------------------------------------------------------------------------
+# UI helpers
+# ----------------------------------------------------------------------------------
+def _list_preloaded_clean():
+    if not os.path.isdir(PRELOADED_CLEAN_DIR):
+        return []
+    exts = (".avi", ".mp4", ".mov", ".mkv")
+    return [f for f in sorted(os.listdir(PRELOADED_CLEAN_DIR)) if f.lower().endswith(exts)]
+
+def _resolve_preloaded_path(name):
+    if not name: return None
+    path = os.path.join(PRELOADED_CLEAN_DIR, name)
+    return path if os.path.exists(path) else None
+
+# ----------------------------------------------------------------------------------
+# UI
+# ----------------------------------------------------------------------------------
+with gr.Blocks(fill_height=True, theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🧪 Moraqeb — Distilled VLM Demo")
+    gr.Markdown(
+        "Choose a **preloaded cleaned video** (fast, uses preset cache if available) or **upload your own**. "
+        "On a cache hit, the app returns instantly **without loading the model**."
+    )
+
+    with gr.Tabs():
+        # --- Preloaded cleaned videos tab ---
+        with gr.Tab("Preloaded (recommended)"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    preloaded = gr.Dropdown(
+                        choices=_list_preloaded_clean(),
+                        label="Pick a cleaned video (assets/videos_clean/)",
+                        interactive=True
+                    )
+                    # 👇 Video preview that updates when you pick a file
+                    preview = gr.Video(label="Preview", autoplay=False)
+
+                    # When dropdown changes, update the video preview source to the actual file path
+                    preloaded.change(
+                        fn=_resolve_preloaded_path,
+                        inputs=preloaded,
+                        outputs=preview,
+                    )
+                with gr.Column(scale=1):
+                    question_pre = gr.Textbox(
+                        label="Question",
+                        value="Can you describe the entire video in detail from start to finish?",
+                        lines=3,
+                    )
+                    max_new_tokens_pre = gr.Slider(8, 512, value=128, step=8, label="Max new tokens")
+                    do_sample_pre = gr.Checkbox(value=False, label="do_sample (enable sampling)")
+                    temperature_pre = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="temperature")
+                    top_p_pre = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
+                    use_cache_pre = gr.Checkbox(value=True, label="Use cache")
+                    run_btn_pre = gr.Button("Run on Preloaded Video", variant="primary")
+
+            with gr.Row():
+                answer_pre = gr.Textbox(label="Answer", lines=8)
+            with gr.Row():
+                latency_pre = gr.Number(label="Latency (s)")
+                tps_pre = gr.Number(label="Speed (tokens/sec)")
+                ntoks_pre = gr.Number(label="New tokens")
+
+        # --- Upload tab ---
+        with gr.Tab("Upload your video"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    video = gr.Video(label="Upload video", sources=["upload"], autoplay=False)
+                with gr.Column(scale=1):
+                    question = gr.Textbox(
+                        label="Question",
+                        value="Can you describe the entire video in detail from start to finish?",
+                        lines=3,
+                    )
+                    max_new_tokens = gr.Slider(8, 512, value=128, step=8, label="Max new tokens")
+                    do_sample = gr.Checkbox(value=False, label="do_sample (enable sampling)")
+                    temperature = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="temperature")
+                    top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
+                    use_cache = gr.Checkbox(value=True, label="Use cache")
+                    run_btn = gr.Button("Run Inference", variant="primary")
+
+            with gr.Row():
+                answer = gr.Textbox(label="Answer", lines=8)
+            with gr.Row():
+                latency = gr.Number(label="Latency (s)")
+                tps = gr.Number(label="Speed (tokens/sec)")
+                ntoks = gr.Number(label="New tokens")
+
+    # Cache controls
+    with gr.Row():
+        clear_cache_btn = gr.Button("Clear RUNTIME Cache")
+        clear_cache_status = gr.Textbox(label="Cache status", interactive=False)
+
+    # Wiring
+    run_btn_pre.click(
+        fn=infer_preloaded,
+        inputs=[preloaded, question_pre, max_new_tokens_pre, do_sample_pre, temperature_pre, top_p_pre, use_cache_pre],
+        outputs=[answer_pre, latency_pre, tps_pre, ntoks_pre],
+        api_name="infer_preloaded",
+        queue=True,
+    )
+
+    run_btn.click(
+        fn=infer_upload,
+        inputs=[video, question, max_new_tokens, do_sample, temperature, top_p, use_cache],
+        outputs=[answer, latency, tps, ntoks],
+        api_name="infer_upload",
+        queue=True,
+    )
+
+    clear_cache_btn.click(_ui_clear_cache, inputs=[], outputs=[clear_cache_status])
+    gr.Markdown(f"Hardware: **{'CUDA' if has_cuda else 'CPU'}**, dtype **{dtype}**")
+
+if __name__ == "__main__":
+    demo.launch(server_port=7860, share=True)
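
Since the two click handlers above register named endpoints (api_name="infer_preloaded" and api_name="infer_upload"), the Space can be driven programmatically. Below is a minimal client-side sketch, not part of this commit: the Space id "AnasKAN/moraqeb-demo" is a placeholder, and the positional arguments mirror the inputs list wired to run_btn_pre.

# Hypothetical client usage via gradio_client; "AnasKAN/moraqeb-demo" is a placeholder Space id.
from gradio_client import Client

client = Client("AnasKAN/moraqeb-demo")
answer, latency_s, tokens_per_s, new_tokens = client.predict(
    "D001_01_clean.avi",  # a file name from assets/videos_clean/
    "Can you describe the entire video in detail from start to finish?",
    128,    # max_new_tokens
    False,  # do_sample
    0.7,    # temperature (ignored while do_sample=False)
    0.9,    # top_p (ignored while do_sample=False)
    True,   # use_cache -> a preset/runtime cache hit answers without loading the model
    api_name="/infer_preloaded",
)
print(answer, latency_s, tokens_per_s, new_tokens)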
assets/videos_clean/.gitkeep
ADDED
File without changes
assets/videos_clean/D001_01_clean.avi
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:576701c174d369c5068846afec3206ac0400d56a01881663ee2f2427744c9200
+size 1290430
assets/videos_clean/D001_01_clean.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8404e646b70d5f7fecd0731a297aabbbe569c245ae830fe2a16bc7397e53ce7b
+size 391808
assets/videos_clean/D001_02_clean.avi
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ab1c1f9e5f35842bffb0e2be74a21090ab63a1c5a2b7d8dc963851333338c1f
+size 1568782
assets/videos_clean/D001_02_clean.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b6303b268ec2694c641e2ccd3582dd41844aeaba5c0cf90c12ef37731ab2749
+size 449352
assets/videos_clean/D001_03_clean.avi
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e899475ce01c39141dbe7677220061072c6d79ca4efa1afafff6b215586f5b2
+size 1835904
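
The five video entries above are committed as Git LFS pointers: a clone without `git lfs pull` leaves three-line text stubs like the one just shown in place of the actual videos, and the demo dropdown would then list unplayable files. A minimal sanity-check sketch, assuming it runs from the repo root (an illustration, not part of the commit):

# Detect whether the cleaned demo videos were actually smudged by Git LFS.
import os

CLEAN_DIR = os.path.join("assets", "videos_clean")
LFS_HEADER = b"version https://git-lfs.github.com/spec/v1"

for name in sorted(os.listdir(CLEAN_DIR)):
    if not name.lower().endswith((".avi", ".mp4", ".mov", ".mkv")):
        continue
    with open(os.path.join(CLEAN_DIR, name), "rb") as f:
        head = f.read(len(LFS_HEADER))
    status = "still an LFS pointer (run `git lfs pull`)" if head == LFS_HEADER else "real video"
    print(f"{name}: {status}")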
preset_cache/.gitkeep
ADDED
File without changes
preset_cache/13f046ecf2c58fc9a18344662b0a8cc0af902402.json
ADDED
@@ -0,0 +1 @@
+{"answer": "The video begins with a view of a gated area, likely a parking lot or a restricted zone, featuring a black metal gate with yellow and white striped pillars. In the foreground, there is a red and black motorbike parked near the gate. A person in a white shirt and dark pants is standing near the gate, facing away from the camera. The background shows a busy street with various vehicles, including a green bus and a white car, and a few pedestrians walking on the sidewalk. The scene remains static, with no significant changes in the environment or the actions of the individuals. The video then transitions to a similar view of", "latency_s": 5.581, "tokens_per_s": 22.93, "new_tokens": 128}
preset_cache/278b7615758af40ae3ba459bf15661daac74e5e4.json
ADDED
@@ -0,0 +1 @@
+{"answer": "The video begins with a view of a gated area, likely a parking lot or a restricted zone, featuring a black metal gate with decorative elements. In the foreground, a red and black motorbike is parked on the right side, and a person in a white shirt and dark pants is standing near the gate, possibly waiting or observing. The background shows a busy street with various vehicles, including a white car, a green bus, and a bus with a blue roof. A person in a white shirt and dark pants walks from the right side of the frame towards the center, passing by the motorbike. The scene then shifts", "latency_s": 5.837, "tokens_per_s": 21.93, "new_tokens": 128}
preset_cache/a06907c817bf41ccf1add0d9d72c245a63f64947.json
ADDED
@@ -0,0 +1 @@
+{"answer": "The video begins with a view of a gated area, likely a parking lot or a restricted zone, featuring a black metal gate with yellow and white stripes. In the foreground, there is a red and black motorbike parked on a concrete surface. To the left of the gate, there is a small building with a blue door and a window. Several vehicles are parked in the background, including a white car, a black SUV, and a green bus. A person in a red shirt and dark pants is seen walking towards the gate from the right side of the frame. Another individual, dressed in a dark outfit, approaches the", "latency_s": 5.746, "tokens_per_s": 22.28, "new_tokens": 128}
preset_cache/manifest.csv
ADDED
@@ -0,0 +1,4 @@
+cache_file
+13f046ecf2c58fc9a18344662b0a8cc0af902402.json
+278b7615758af40ae3ba459bf15661daac74e5e4.json
+a06907c817bf41ccf1add0d9d72c245a63f64947.json
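
Each JSON file listed in the manifest pairs a SHA-1 cache key with a precomputed answer, so first visits hit preset_cache_get instead of loading the model. A rough offline sketch of how such entries could be regenerated, reusing the helpers from app.py; the file list and workflow here are assumptions for illustration, not part of the commit:

# Assumed precompute script; it reuses app.py's own key derivation so the keys
# match what the Space computes at request time.
import csv, json, os
from app import PRESET_CACHE_DIR, _infer_core, _make_cache_key

QUESTION = "Can you describe the entire video in detail from start to finish?"
GEN_KWARGS = {"do_sample": False, "max_new_tokens": 128}
VIDEOS = [  # assumed to be the demo files shipped in this commit
    "assets/videos_clean/D001_01_clean.avi",
    "assets/videos_clean/D001_02_clean.avi",
    "assets/videos_clean/D001_03_clean.avi",
]

rows = []
for vid in VIDEOS:
    key = _make_cache_key(vid, QUESTION, GEN_KWARGS)
    answer, latency_s, tps, new_tokens = _infer_core(vid, QUESTION, GEN_KWARGS, use_cache=False)
    with open(os.path.join(PRESET_CACHE_DIR, f"{key}.json"), "w", encoding="utf-8") as f:
        json.dump({"answer": answer, "latency_s": latency_s,
                   "tokens_per_s": tps, "new_tokens": new_tokens}, f, ensure_ascii=False)
    rows.append(key + ".json")

with open(os.path.join(PRESET_CACHE_DIR, "manifest.csv"), "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["cache_file"])
    writer.writerows([r] for r in rows)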
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+gradio>=4.40.0
+transformers==4.54.0
+accelerate
+bitsandbytes
+decord
+av
+huggingface_hub
+torch
+num2words
+opencv-python
+numpy
runtime.txt
ADDED
@@ -0,0 +1 @@
+python-3.10