Initial commit: Add Gradio app + cleaned demo videos (via Git LFS) + preset cache + requirements
- .gitattributes +4 -0
- .gitignore +6 -0
- app.py +374 -0
- assets/videos_clean/.gitkeep +0 -0
- assets/videos_clean/D001_01_clean.avi +3 -0
- assets/videos_clean/D001_01_clean.mp4 +3 -0
- assets/videos_clean/D001_02_clean.avi +3 -0
- assets/videos_clean/D001_02_clean.mp4 +3 -0
- assets/videos_clean/D001_03_clean.avi +3 -0
- preset_cache/.gitkeep +0 -0
- preset_cache/13f046ecf2c58fc9a18344662b0a8cc0af902402.json +1 -0
- preset_cache/278b7615758af40ae3ba459bf15661daac74e5e4.json +1 -0
- preset_cache/a06907c817bf41ccf1add0d9d72c245a63f64947.json +1 -0
- preset_cache/manifest.csv +4 -0
- requirements.txt +11 -0
- runtime.txt +1 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.avi filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.mov filter=lfs diff=lfs merge=lfs -text
+*.mkv filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,6 @@
+# ignore raw video dumps by default
+assets/videos_raw/
+# common temp dirs
+tmp/
+.cache/
+__pycache__/
app.py
ADDED
@@ -0,0 +1,374 @@
+
+import os, time, json, hashlib, glob
+import cv2, numpy as np
+import torch, gradio as gr
+from transformers import AutoProcessor, AutoModelForImageTextToText
+
+# ----------------------------------------------------------------------------------
+# Config
+# ----------------------------------------------------------------------------------
+MODEL_PATH = "AnasKAN/SmolVLM2-500M-Video-Instruct-video-feedback"
+PROCESSOR_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+
+# Directories (relative to repo root when running in HF Spaces)
+REPO_ROOT = os.path.dirname(__file__)
+PRESET_CACHE_DIR = os.path.join(REPO_ROOT, "preset_cache")
+PRELOADED_CLEAN_DIR = os.path.join(REPO_ROOT, "assets", "videos_clean")
+RUNTIME_CACHE_DIR = "/tmp/moraqeb_cache"
+RUNTIME_CLEAN_DIR = "/tmp/clean_videos"
+
+os.makedirs(RUNTIME_CACHE_DIR, exist_ok=True)
+os.makedirs(RUNTIME_CLEAN_DIR, exist_ok=True)
+
+# Device / dtype
+has_cuda = torch.cuda.is_available()
+device = torch.device("cuda" if has_cuda else "cpu")
+dtype = torch.bfloat16 if has_cuda else torch.float32
+
+# Video cleaning config (used only for uploaded videos)
+target_size = (640, 360)  # (w, h)
+target_frames = 128
+output_fps = 24
+fourcc = cv2.VideoWriter_fourcc(*"XVID")
+
+def sample_indices(n_frames_src, n_target):
+    if n_frames_src <= 0: return []
+    idxs = np.linspace(0, n_frames_src - 1, min(n_target, n_frames_src), dtype=int)
+    return idxs.tolist()
+
+def clean_single_video(video_path, save_dir=RUNTIME_CLEAN_DIR, overwrite=True):
+    os.makedirs(save_dir, exist_ok=True)
+    base = os.path.splitext(os.path.basename(video_path))[0]
+    dst_path = os.path.join(save_dir, f"{base}_clean.avi")
+    if os.path.exists(dst_path) and not overwrite:
+        print(f"[SKIP] {dst_path} exists")
+        return dst_path
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        print(f"[ERROR] Cannot open {video_path}")
+        return None
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    idxs = sample_indices(total_frames, target_frames)
+    frames, grabbed, cur = [], 0, 0
+    next_targets = set(idxs)
+    while True:
+        ret, frame = cap.read()
+        if not ret: break
+        if cur in next_targets:
+            frame = cv2.resize(frame, target_size, interpolation=cv2.INTER_AREA)
+            frames.append(frame); grabbed += 1
+            if grabbed == len(idxs): break
+        cur += 1
+    cap.release()
+    if not frames:
+        print(f"[ERROR] No frames from {video_path}")
+        return None
+    while len(frames) < target_frames:
+        frames.append(frames[-1])
+    out = cv2.VideoWriter(dst_path, fourcc, output_fps, target_size)
+    for f in frames: out.write(f)
+    out.release()
+    print(f"[OK] Cleaned {video_path} -> {dst_path} ({len(frames)} frames @ {output_fps} FPS)")
+    return dst_path
+
+# ----------------------------------------------------------------------------------
+# Caching helpers
+# ----------------------------------------------------------------------------------
+def _sha1_file(path, chunk=1024*1024):
+    h = hashlib.sha1()
+    with open(path, "rb") as f:
+        while True:
+            b = f.read(chunk)
+            if not b: break
+            h.update(b)
+    return h.hexdigest()
+
+def _make_cache_key(cleaned_path, question, gen_kwargs):
+    vid_hash = _sha1_file(cleaned_path)
+    payload = {
+        "model": MODEL_PATH,
+        "video_hash": vid_hash,
+        "question": (question or "").strip(),
+        "gen": {
+            "do_sample": bool(gen_kwargs.get("do_sample", False)),
+            "max_new_tokens": int(gen_kwargs.get("max_new_tokens", 128)),
+            "temperature": float(gen_kwargs.get("temperature", 0.7)) if gen_kwargs.get("do_sample") else None,
+            "top_p": float(gen_kwargs.get("top_p", 0.9)) if gen_kwargs.get("do_sample") else None,
+        },
+    }
+    raw = json.dumps(payload, sort_keys=True).encode("utf-8")
+    return hashlib.sha1(raw).hexdigest()
+
+def _cache_path(dir_, key): return os.path.join(dir_, f"{key}.json")
+
+def preset_cache_get(key):
+    p = _cache_path(PRESET_CACHE_DIR, key)
+    if os.path.exists(p):
+        try:
+            with open(p, "r", encoding="utf-8") as f:
+                print("[PRESET] HIT", os.path.basename(p))
+                return json.load(f)
+        except Exception as e:
+            print(f"[PRESET] Failed to read {p}: {e}")
+    return None
+
+def runtime_cache_get(key):
+    p = _cache_path(RUNTIME_CACHE_DIR, key)
+    if os.path.exists(p):
+        try:
+            with open(p, "r", encoding="utf-8") as f:
+                print("[CACHE] HIT", os.path.basename(p))
+                return json.load(f)
+        except Exception as e:
+            print(f"[CACHE] Failed to read {p}: {e}")
+    return None
+
+def runtime_cache_put(key, data):
+    p = _cache_path(RUNTIME_CACHE_DIR, key)
+    try:
+        with open(p, "w", encoding="utf-8") as f:
+            json.dump(data, f, ensure_ascii=False)
+        return True
+    except Exception as e:
+        print(f"[CACHE] Failed to write {p}: {e}")
+        return False
+
+def cache_clear():
+    n = 0
+    for fp in glob.glob(os.path.join(RUNTIME_CACHE_DIR, "*.json")):
+        try: os.remove(fp); n += 1
+        except: pass
+    return n
+
+# ----------------------------------------------------------------------------------
+# Lazy pipeline (loaded only on cache miss)
+# ----------------------------------------------------------------------------------
+_model = None
+_processor = None
+
+def load_pipeline():
+    global _model, _processor
+    if _model is not None and _processor is not None:
+        print("[INFO] Pipeline already loaded")
+        return _processor, _model
+    print(f"[INFO] Loading processor: {PROCESSOR_ID}")
+    _processor = AutoProcessor.from_pretrained(PROCESSOR_ID)
+    print(f"[INFO] Loading model: {MODEL_PATH}")
+    _model = AutoModelForImageTextToText.from_pretrained(
+        MODEL_PATH,
+        torch_dtype=dtype,
+        _attn_implementation="eager",
+        low_cpu_mem_usage=True,
+    ).to(device).eval()
+    print("[INFO] Pipeline ready ✅")
+    return _processor, _model
+
+def build_messages(video_path: str, question: str):
+    q = (question or "").strip() or "Caption the video."
+    return [{
+        "role": "user",
+        "content": [
+            {"type": "video", "path": video_path},
+            {"type": "text", "text": q},
+        ],
+    }]
+
+# ----------------------------------------------------------------------------------
+# Inference handlers
+# ----------------------------------------------------------------------------------
+def _maybe_clean_uploaded(video_path):
+    if video_path and os.path.commonpath([os.path.abspath(video_path), PRELOADED_CLEAN_DIR]) == PRELOADED_CLEAN_DIR:
+        print("[INFO] Using preloaded cleaned video:", video_path)
+        return video_path
+    print(f"[INFO] Cleaning uploaded video {video_path} ...")
+    return clean_single_video(video_path, save_dir=RUNTIME_CLEAN_DIR, overwrite=True)
+
+def _infer_core(cleaned_path, question, gen_kwargs, use_cache=True):
+    key = _make_cache_key(cleaned_path, question, gen_kwargs)
+
+    if use_cache:
+        preset = preset_cache_get(key)
+        if preset:
+            return preset["answer"], preset.get("latency_s", 0.0), preset.get("tokens_per_s", 0.0), preset.get("new_tokens", 0)
+        cached = runtime_cache_get(key)
+        if cached:
+            return cached["answer"], cached["latency_s"], cached["tokens_per_s"], cached["new_tokens"]
+
+    processor, model = load_pipeline()
+    print("[INFO] Tokenizing ...")
+    messages = build_messages(cleaned_path, question)
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+        video_load_backend="decord",
+    ).to(device, dtype=dtype)
+
+    print(f"[INFO] Generating with {gen_kwargs}")
+    start = time.time()
+    with torch.inference_mode():
+        generated_ids = model.generate(**inputs, **gen_kwargs)
+    elapsed = time.time() - start
+    print(f"[INFO] Generation took {elapsed:.2f}s")
+
+    raw_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    out_text = raw_text.split("Assistant:", 1)[1].strip() if "Assistant:" in raw_text else raw_text.strip()
+
+    new_tokens = int(generated_ids.shape[-1] - inputs["input_ids"].shape[-1])
+    tps = (new_tokens / elapsed) if elapsed > 0 else 0.0
+    print(f"[INFO] Decoded {new_tokens} tokens @ {tps:.2f} tok/s")
+
+    if use_cache:
+        runtime_cache_put(key, {
+            "answer": out_text,
+            "latency_s": round(elapsed, 3),
+            "tokens_per_s": round(tps, 2),
+            "new_tokens": new_tokens,
+        })
+    return out_text, round(elapsed, 3), round(tps, 2), new_tokens
+
+# 1) For uploaded videos
+def infer_upload(video, question, max_new_tokens, do_sample, temperature, top_p, use_cache):
+    if video is None:
+        return "Please upload a video.", 0.0, 0.0, 0
+    cleaned_path = _maybe_clean_uploaded(video)
+    if cleaned_path is None:
+        return "Video cleaning failed.", 0.0, 0.0, 0
+    gen_kwargs = dict(do_sample=bool(do_sample), max_new_tokens=int(max_new_tokens))
+    if do_sample:
+        gen_kwargs.update(temperature=float(temperature), top_p=float(top_p))
+    return _infer_core(cleaned_path, question, gen_kwargs, use_cache=bool(use_cache))
+
+# 2) For preloaded cleaned videos (no cleaning)
+def infer_preloaded(preloaded_name, question, max_new_tokens, do_sample, temperature, top_p, use_cache):
+    if not preloaded_name:
+        return "Pick a preloaded video.", 0.0, 0.0, 0
+    cleaned_path = os.path.join(PRELOADED_CLEAN_DIR, preloaded_name)
+    if not os.path.exists(cleaned_path):
+        return f"Video not found: {cleaned_path}", 0.0, 0.0, 0
+    gen_kwargs = dict(do_sample=bool(do_sample), max_new_tokens=int(max_new_tokens))
+    if do_sample:
+        gen_kwargs.update(temperature=float(temperature), top_p=float(top_p))
+    return _infer_core(cleaned_path, question, gen_kwargs, use_cache=bool(use_cache))
+
+def _ui_clear_cache():
+    removed = cache_clear()
+    return gr.update(value=f"Cleared {removed} runtime cached items.")
+
+# ----------------------------------------------------------------------------------
+# UI helpers
+# ----------------------------------------------------------------------------------
+def _list_preloaded_clean():
+    if not os.path.isdir(PRELOADED_CLEAN_DIR):
+        return []
+    exts = (".avi", ".mp4", ".mov", ".mkv")
+    return [f for f in sorted(os.listdir(PRELOADED_CLEAN_DIR)) if f.lower().endswith(exts)]
+
+def _resolve_preloaded_path(name):
+    if not name: return None
+    path = os.path.join(PRELOADED_CLEAN_DIR, name)
+    return path if os.path.exists(path) else None
+
+# ----------------------------------------------------------------------------------
+# UI
+# ----------------------------------------------------------------------------------
+with gr.Blocks(fill_height=True, theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🧪 Moraqeb — Distilled VLM Demo")
+    gr.Markdown(
+        "Choose a **preloaded cleaned video** (fast, uses preset cache if available) or **upload your own**. "
+        "On a cache hit, the app returns instantly **without loading the model**."
+    )
+
+    with gr.Tabs():
+        # --- Preloaded cleaned videos tab ---
+        with gr.Tab("Preloaded (recommended)"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    preloaded = gr.Dropdown(
+                        choices=_list_preloaded_clean(),
+                        label="Pick a cleaned video (assets/videos_clean/)",
+                        interactive=True
+                    )
+                    # 👇 Video preview that updates when you pick a file
+                    preview = gr.Video(label="Preview", autoplay=False)
+
+                    # When dropdown changes, update the video preview source to the actual file path
+                    preloaded.change(
+                        fn=_resolve_preloaded_path,
+                        inputs=preloaded,
+                        outputs=preview,
+                    )
+                with gr.Column(scale=1):
+                    question_pre = gr.Textbox(
+                        label="Question",
+                        value="Can you describe the entire video in detail from start to finish?",
+                        lines=3,
+                    )
+                    max_new_tokens_pre = gr.Slider(8, 512, value=128, step=8, label="Max new tokens")
+                    do_sample_pre = gr.Checkbox(value=False, label="do_sample (enable sampling)")
+                    temperature_pre = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="temperature")
+                    top_p_pre = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
+                    use_cache_pre = gr.Checkbox(value=True, label="Use cache")
+                    run_btn_pre = gr.Button("Run on Preloaded Video", variant="primary")
+
+            with gr.Row():
+                answer_pre = gr.Textbox(label="Answer", lines=8)
+            with gr.Row():
+                latency_pre = gr.Number(label="Latency (s)")
+                tps_pre = gr.Number(label="Speed (tokens/sec)")
+                ntoks_pre = gr.Number(label="New tokens")
+
+        # --- Upload tab ---
+        with gr.Tab("Upload your video"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    video = gr.Video(label="Upload video", sources=["upload"], autoplay=False)
+                with gr.Column(scale=1):
+                    question = gr.Textbox(
+                        label="Question",
+                        value="Can you describe the entire video in detail from start to finish?",
+                        lines=3,
+                    )
+                    max_new_tokens = gr.Slider(8, 512, value=128, step=8, label="Max new tokens")
+                    do_sample = gr.Checkbox(value=False, label="do_sample (enable sampling)")
+                    temperature = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="temperature")
+                    top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
+                    use_cache = gr.Checkbox(value=True, label="Use cache")
+                    run_btn = gr.Button("Run Inference", variant="primary")
+
+            with gr.Row():
+                answer = gr.Textbox(label="Answer", lines=8)
+            with gr.Row():
+                latency = gr.Number(label="Latency (s)")
+                tps = gr.Number(label="Speed (tokens/sec)")
+                ntoks = gr.Number(label="New tokens")
+
+    # Cache controls
+    with gr.Row():
+        clear_cache_btn = gr.Button("Clear RUNTIME Cache")
+        clear_cache_status = gr.Textbox(label="Cache status", interactive=False)
+
+    # Wiring
+    run_btn_pre.click(
+        fn=infer_preloaded,
+        inputs=[preloaded, question_pre, max_new_tokens_pre, do_sample_pre, temperature_pre, top_p_pre, use_cache_pre],
+        outputs=[answer_pre, latency_pre, tps_pre, ntoks_pre],
+        api_name="infer_preloaded",
+        queue=True,
+    )
+
+    run_btn.click(
+        fn=infer_upload,
+        inputs=[video, question, max_new_tokens, do_sample, temperature, top_p, use_cache],
+        outputs=[answer, latency, tps, ntoks],
+        api_name="infer_upload",
+        queue=True,
+    )
+
+    clear_cache_btn.click(_ui_clear_cache, inputs=[], outputs=[clear_cache_status])
+    gr.Markdown(f"Hardware: **{'CUDA' if has_cuda else 'CPU'}**, dtype **{dtype}**")
+
+if __name__ == "__main__":
+    demo.launch(server_port=7860, share=True)
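
Since the two click handlers above register named endpoints (api_name="infer_preloaded" and api_name="infer_upload"), the Space can be driven programmatically. Below is a minimal client-side sketch, not part of this commit: the Space id "AnasKAN/moraqeb-demo" is a placeholder, and the positional arguments mirror the inputs list wired to run_btn_pre.

# Hypothetical client usage via gradio_client; "AnasKAN/moraqeb-demo" is a placeholder Space id.
from gradio_client import Client

client = Client("AnasKAN/moraqeb-demo")
answer, latency_s, tokens_per_s, new_tokens = client.predict(
    "D001_01_clean.avi",  # a file name from assets/videos_clean/
    "Can you describe the entire video in detail from start to finish?",
    128,    # max_new_tokens
    False,  # do_sample
    0.7,    # temperature (ignored while do_sample=False)
    0.9,    # top_p (ignored while do_sample=False)
    True,   # use_cache -> a preset/runtime cache hit answers without loading the model
    api_name="/infer_preloaded",
)
print(answer, latency_s, tokens_per_s, new_tokens)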
assets/videos_clean/.gitkeep
ADDED
File without changes
assets/videos_clean/D001_01_clean.avi
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:576701c174d369c5068846afec3206ac0400d56a01881663ee2f2427744c9200
+size 1290430
assets/videos_clean/D001_01_clean.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8404e646b70d5f7fecd0731a297aabbbe569c245ae830fe2a16bc7397e53ce7b
+size 391808
assets/videos_clean/D001_02_clean.avi
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ab1c1f9e5f35842bffb0e2be74a21090ab63a1c5a2b7d8dc963851333338c1f
+size 1568782
assets/videos_clean/D001_02_clean.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b6303b268ec2694c641e2ccd3582dd41844aeaba5c0cf90c12ef37731ab2749
+size 449352
assets/videos_clean/D001_03_clean.avi
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e899475ce01c39141dbe7677220061072c6d79ca4efa1afafff6b215586f5b2
+size 1835904
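
The five video entries above are committed as Git LFS pointers: a clone without `git lfs pull` leaves three-line text stubs like the one just shown in place of the actual videos, and the demo dropdown would then list unplayable files. A minimal sanity-check sketch, assuming it runs from the repo root (an illustration, not part of the commit):

# Detect whether the cleaned demo videos were actually smudged by Git LFS.
import os

CLEAN_DIR = os.path.join("assets", "videos_clean")
LFS_HEADER = b"version https://git-lfs.github.com/spec/v1"

for name in sorted(os.listdir(CLEAN_DIR)):
    if not name.lower().endswith((".avi", ".mp4", ".mov", ".mkv")):
        continue
    with open(os.path.join(CLEAN_DIR, name), "rb") as f:
        head = f.read(len(LFS_HEADER))
    status = "still an LFS pointer (run `git lfs pull`)" if head == LFS_HEADER else "real video"
    print(f"{name}: {status}")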
preset_cache/.gitkeep
ADDED
File without changes
preset_cache/13f046ecf2c58fc9a18344662b0a8cc0af902402.json
ADDED
@@ -0,0 +1 @@
+{"answer": "The video begins with a view of a gated area, likely a parking lot or a restricted zone, featuring a black metal gate with yellow and white striped pillars. In the foreground, there is a red and black motorbike parked near the gate. A person in a white shirt and dark pants is standing near the gate, facing away from the camera. The background shows a busy street with various vehicles, including a green bus and a white car, and a few pedestrians walking on the sidewalk. The scene remains static, with no significant changes in the environment or the actions of the individuals. The video then transitions to a similar view of", "latency_s": 5.581, "tokens_per_s": 22.93, "new_tokens": 128}
preset_cache/278b7615758af40ae3ba459bf15661daac74e5e4.json
ADDED
@@ -0,0 +1 @@
+{"answer": "The video begins with a view of a gated area, likely a parking lot or a restricted zone, featuring a black metal gate with decorative elements. In the foreground, a red and black motorbike is parked on the right side, and a person in a white shirt and dark pants is standing near the gate, possibly waiting or observing. The background shows a busy street with various vehicles, including a white car, a green bus, and a bus with a blue roof. A person in a white shirt and dark pants walks from the right side of the frame towards the center, passing by the motorbike. The scene then shifts", "latency_s": 5.837, "tokens_per_s": 21.93, "new_tokens": 128}
preset_cache/a06907c817bf41ccf1add0d9d72c245a63f64947.json
ADDED
@@ -0,0 +1 @@
+{"answer": "The video begins with a view of a gated area, likely a parking lot or a restricted zone, featuring a black metal gate with yellow and white stripes. In the foreground, there is a red and black motorbike parked on a concrete surface. To the left of the gate, there is a small building with a blue door and a window. Several vehicles are parked in the background, including a white car, a black SUV, and a green bus. A person in a red shirt and dark pants is seen walking towards the gate from the right side of the frame. Another individual, dressed in a dark outfit, approaches the", "latency_s": 5.746, "tokens_per_s": 22.28, "new_tokens": 128}
preset_cache/manifest.csv
ADDED
@@ -0,0 +1,4 @@
+cache_file
+13f046ecf2c58fc9a18344662b0a8cc0af902402.json
+278b7615758af40ae3ba459bf15661daac74e5e4.json
+a06907c817bf41ccf1add0d9d72c245a63f64947.json
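
Each JSON file listed in the manifest pairs a SHA-1 cache key with a precomputed answer, so first visits hit preset_cache_get instead of loading the model. A rough offline sketch of how such entries could be regenerated, reusing the helpers from app.py; the file list and workflow here are assumptions for illustration, not part of the commit:

# Assumed precompute script; it reuses app.py's own key derivation so the keys
# match what the Space computes at request time.
import csv, json, os
from app import PRESET_CACHE_DIR, _infer_core, _make_cache_key

QUESTION = "Can you describe the entire video in detail from start to finish?"
GEN_KWARGS = {"do_sample": False, "max_new_tokens": 128}
VIDEOS = [  # assumed to be the demo files shipped in this commit
    "assets/videos_clean/D001_01_clean.avi",
    "assets/videos_clean/D001_02_clean.avi",
    "assets/videos_clean/D001_03_clean.avi",
]

rows = []
for vid in VIDEOS:
    key = _make_cache_key(vid, QUESTION, GEN_KWARGS)
    answer, latency_s, tps, new_tokens = _infer_core(vid, QUESTION, GEN_KWARGS, use_cache=False)
    with open(os.path.join(PRESET_CACHE_DIR, f"{key}.json"), "w", encoding="utf-8") as f:
        json.dump({"answer": answer, "latency_s": latency_s,
                   "tokens_per_s": tps, "new_tokens": new_tokens}, f, ensure_ascii=False)
    rows.append(key + ".json")

with open(os.path.join(PRESET_CACHE_DIR, "manifest.csv"), "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["cache_file"])
    writer.writerows([r] for r in rows)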
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+gradio>=4.40.0
+transformers==4.54.0
+accelerate
+bitsandbytes
+decord
+av
+huggingface_hub
+torch
+num2words
+opencv-python
+numpy
runtime.txt
ADDED
@@ -0,0 +1 @@
+python-3.10