NikhilJoson committed · verified
Commit 3ce93a6 · 1 Parent(s): 2d9ab99

Update app.py

Files changed (1):
  1. app.py +219 -297
app.py CHANGED
@@ -1,366 +1,288 @@
  import gradio as gr
  import torch
  import os
  import tempfile
- import shutil
  from PIL import Image
  from tqdm import tqdm
  from torch.utils.data import DataLoader
  from moviepy.editor import VideoFileClip
- import numpy as np
- import gc
-
  from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor

- # Global variables to store model and processor
  model = None
  processor = None
  video_embeddings = []
  video_clips = []
- temp_dirs = []

- def cleanup_temp_files():
-     """Clean up temporary files and directories"""
-     global temp_dirs
-     for temp_dir in temp_dirs:
-         try:
-             if os.path.exists(temp_dir):
-                 shutil.rmtree(temp_dir)
-         except:
-             pass
-     temp_dirs = []
-     gc.collect()
-
- def load_model():
-     """Load the ColQwen2.5 Omni model and processor"""
      global model, processor

-     if model is None:
-         try:
-             print("Loading ColQwen2.5 Omni model...")
-             model = ColQwen2_5Omni.from_pretrained(
-                 "vidore/colqwen-omni-v0.1",
-                 torch_dtype=torch.bfloat16,
-                 device_map="auto",
-                 attn_implementation="eager",  # Use eager attention instead of flash-attn
-                 trust_remote_code=True
-             ).eval()
-             processor = ColQwen2_5OmniProcessor.from_pretrained(
-                 "manu/colqwen-omni-v0.1",
-                 trust_remote_code=True
-             )
-             print("Model loaded successfully!")
-             return True
-         except Exception as e:
-             print(f"Error loading model: {e}")
-             return False

-     return True

- def split_video_into_clips(video_path, clip_duration=10):
-     """Split video into clips of specified duration"""
      clips = []
-     temp_dir = tempfile.mkdtemp()
-     temp_dirs.append(temp_dir)

      try:
-         # Load the video
-         print(f"Loading video: {video_path}")
-         video = VideoFileClip(video_path)
-         duration = video.duration

-         print(f"Video duration: {duration:.2f} seconds")

-         # Calculate number of clips
-         num_clips = int(np.ceil(duration / clip_duration))
-         print(f"Creating {num_clips} clips of {clip_duration} seconds each")

-         for i in range(num_clips):
-             start_time = i * clip_duration
-             end_time = min((i + 1) * clip_duration, duration)

-             print(f"Processing clip {i+1}/{num_clips}: {start_time:.1f}s - {end_time:.1f}s")
-
-             # Extract clip
-             clip = video.subclip(start_time, end_time)
-
-             # Save clip to temporary file
-             clip_path = os.path.join(temp_dir, f"clip_{i:03d}.mp4")
-             clip.write_videofile(
-                 clip_path,
-                 verbose=False,
-                 logger=None,
-                 temp_audiofile_path=temp_dir
-             )
-
-             clips.append(clip_path)
-             clip.close()
-
-         video.close()
-         print(f"Successfully created {len(clips)} clips")
-         return clips, temp_dir
-
-     except Exception as e:
-         print(f"Error splitting video: {e}")
-         return [], temp_dir
-
- def embed_video_clips(clips):
-     """Embed video clips using ColQwen2.5 Omni"""
-     global model, processor
-
-     if not clips:
-         return []
-
-     embeddings = []
-
-     print("Generating embeddings for video clips...")
-
-     try:
-         # Process clips one by one to avoid memory issues
-         for i, clip_path in enumerate(tqdm(clips, desc="Embedding clips")):
              try:
-                 # Process single clip
-                 batch_doc = processor.process_videos([clip_path])

-                 with torch.no_grad():
-                     batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
-                     embeddings_doc = model(**batch_doc)
-                 embeddings.extend(list(torch.unbind(embeddings_doc.to("cpu"))))

-                 # Clear GPU memory after each clip
-                 if torch.cuda.is_available():
-                     torch.cuda.empty_cache()
-
              except Exception as e:
-                 print(f"Error processing clip {i}: {e}")
-                 # Add a dummy embedding to maintain clip indexing
-                 if embeddings:
-                     embeddings.append(torch.zeros_like(embeddings[0]))
-
-     except Exception as e:
-         print(f"Error in embedding process: {e}")
-         return []
-
-     return embeddings
-
- def search_clips(query, embeddings, clips, top_k=3):
-     """Search for relevant clips based on query"""
-     global model, processor
-
-     if not embeddings or not query.strip():
-         return []
-
-     try:
-         # Process the query
-         batch_queries = processor.process_queries([query])
-         batch_queries = {k: v.to(model.device) for k, v in batch_queries.items()}
-
-         # Get query embeddings
-         with torch.no_grad():
-             query_embeddings = model(**batch_queries)
-
-         # Calculate scores
-         scores = processor.score_multi_vector(query_embeddings, embeddings)
-
-         # Get top-k results
-         top_indices = torch.topk(scores[0], min(top_k, len(clips))).indices
-
-         results = []
-         for idx in top_indices:
-             if idx < len(clips):  # Safety check
-                 results.append({
-                     'clip_path': clips[idx],
-                     'score': scores[0][idx].item(),
-                     'clip_index': idx.item()
-                 })
-
-         return results

      except Exception as e:
-         print(f"Error searching clips: {e}")
-         return []

  def process_video(video_file):
-     """Main function to process uploaded video"""
-     global video_embeddings, video_clips

-     if video_file is None:
-         return "❌ Please upload a video file."

-     # Clean up previous session
-     cleanup_temp_files()

      try:
-         # Load model if not already loaded
-         yield "🔄 Loading AI model..."
-         if not load_model():
-             yield "❌ Failed to load AI model. Please try again."
-             return

-         # Split video into clips
-         yield "🎬 Splitting video into 10-second clips..."
-         clips, temp_dir = split_video_into_clips(video_file, clip_duration=10)

-         if not clips:
-             yield "❌ Error: Could not split video into clips."
-             return

-         # Embed clips
-         yield f"🧠 Analyzing {len(clips)} video clips (this may take a few minutes)..."
-         embeddings = embed_video_clips(clips)

-         if not embeddings:
-             yield "❌ Error: Could not generate embeddings for video clips."
-             return

-         # Store globally for querying
          video_embeddings = embeddings
-         video_clips = clips

-         yield f"✅ Successfully processed video into {len(clips)} clips! You can now search for specific content."

      except Exception as e:
-         yield f"❌ Error processing video: {str(e)}"

- def query_video(query_text, top_k=3):
-     """Query the processed video clips"""
-     global video_embeddings, video_clips

      if not video_embeddings:
-         return "⚠️ Please process a video first.", None

-     if not query_text.strip():
-         return "⚠️ Please enter a search query.", None

      try:
-         # Search for relevant clips
-         results = search_clips(query_text, video_embeddings, video_clips, top_k)

-         if not results:
-             return "❌ No results found for your query.", None

-         # Prepare results for display
-         result_text = f"🎯 Found {len(results)} relevant clips:\n\n"

-         for i, result in enumerate(results, 1):
-             clip_time_start = result['clip_index'] * 10  # Each clip is 10 seconds
-             clip_time_end = clip_time_start + 10
-             result_text += f"**Clip {i}:** {clip_time_start}s-{clip_time_end}s (Relevance: {result['score']:.3f})\n"

-         # Return the best matching clip for display
-         best_clip = results[0]['clip_path']

-         return result_text, best_clip

      except Exception as e:
-         return f"❌ Error querying video: {str(e)}", None
-
- # Custom CSS for better styling
- css = """
- .gradio-container {
-     max-width: 1200px !important;
- }
- .video-container {
-     max-height: 500px;
- }
- """

  # Create Gradio interface
- with gr.Blocks(css=css, title="CCTV Video Search") as demo:
-     gr.Markdown("""
-     # 🔍 AI-Powered CCTV Video Search
-
-     **For Security Professionals & Law Enforcement**
-
-     Upload surveillance footage and search for specific incidents, people, or activities using natural language.
-     The system automatically processes videos into searchable segments for rapid investigation.
-     """)
-
-     with gr.Row():
-         with gr.Column(scale=1):
-             gr.Markdown("### 📹 Video Upload & Processing")
-             video_input = gr.Video(
-                 label="Upload CCTV Video",
-                 height=300
-             )
-             process_btn = gr.Button(
-                 "🚀 Process Video",
-                 variant="primary",
-                 size="lg"
-             )
-             process_status = gr.Textbox(
-                 label="Processing Status",
-                 interactive=False,
-                 lines=2
-             )
-
-         with gr.Column(scale=1):
-             gr.Markdown("### 🔎 Search & Results")
-             query_input = gr.Textbox(
-                 label="Search Query",
-                 placeholder="Examples: 'person in red shirt', 'suspicious activity', 'vehicle entering', 'people fighting'",
-                 lines=2
-             )
-
-             with gr.Row():
-                 top_k_slider = gr.Slider(
-                     minimum=1,
-                     maximum=5,
-                     value=3,
-                     step=1,
-                     label="Number of results"
                  )
-                 search_btn = gr.Button("🔍 Search", variant="secondary")

-             search_results = gr.Textbox(
-                 label="Search Results",
-                 interactive=False,
-                 lines=6
              )
-
-     with gr.Row():
-         gr.Markdown("### 📺 Best Matching Clip")
-         result_video = gr.Video(
-             label="Most Relevant Clip",
-             height=400
          )

-     gr.Markdown("""
-     ### 💡 Usage Tips:
-     - **Upload**: Supported formats include MP4, AVI, MOV, etc.
-     - **Wait**: Processing may take several minutes depending on video length
-     - **Search**: Use descriptive queries like "person wearing blue jacket" or "car speeding"
-     - **Review**: Check multiple results to find the exact moment you're looking for
-
-     ### ⚖️ Legal Notice:
-     This tool is intended for authorized security personnel and law enforcement only.
-     Ensure proper legal authority before analyzing surveillance footage.
-     """)
-
-     # Event handlers
-     process_btn.click(
-         fn=process_video,
-         inputs=[video_input],
-         outputs=[process_status]
-     )
-
-     search_btn.click(
-         fn=query_video,
-         inputs=[query_input, top_k_slider],
-         outputs=[search_results, result_video]
-     )
-
-     # Allow enter key to trigger search
-     query_input.submit(
-         fn=query_video,
-         inputs=[query_input, top_k_slider],
-         outputs=[search_results, result_video]
-     )

- # Launch the app
  if __name__ == "__main__":
-     demo.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False
-     )
  import gradio as gr
  import torch
+ import cv2
  import os
  import tempfile
+ import numpy as np
  from PIL import Image
  from tqdm import tqdm
  from torch.utils.data import DataLoader
  from moviepy.editor import VideoFileClip
  from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
+ import warnings
+ warnings.filterwarnings("ignore")

+ # Global variables to store model, processor, and embeddings
  model = None
  processor = None
  video_embeddings = []
  video_clips = []

+ def initialize_model():
+     """Initialize the ColQwen2.5 Omni model and processor"""
      global model, processor

+     try:
+         # Load model with eager attention (no flash-attn)
+         model = ColQwen2_5Omni.from_pretrained(
+             "vidore/colqwen-omni-v0.1",
+             torch_dtype=torch.bfloat16,
+             device_map="cuda" if torch.cuda.is_available() else "cpu",
+             attn_implementation="eager",  # Use eager instead of flash-attn
+         ).eval()
+
+         processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
+         return "✅ Model loaded successfully!"

+     except Exception as e:
+         return f"❌ Error loading model: {str(e)}"

+ def cut_video_into_clips(video_path, clip_duration=10):
+     """Cut video into clips of specified duration (default 10 seconds)"""
      clips = []
+     clip_paths = []

      try:
+         # Use OpenCV to read video metadata (more reliable on HF Spaces)
+         cap = cv2.VideoCapture(video_path)
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+         duration = total_frames / fps

+         # Calculate frames per clip
+         frames_per_clip = int(fps * clip_duration)

+         clip_count = 0
+         current_frame = 0

+         while current_frame < total_frames:
+             # Create temporary file for this clip
+             temp_clip = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
+             temp_clip_path = temp_clip.name
+             temp_clip.close()

+             # Use moviepy for the actual cutting (more reliable for output)
              try:
+                 start_time = current_frame / fps
+                 end_time = min((current_frame + frames_per_clip) / fps, duration)

+                 video_clip = VideoFileClip(video_path).subclip(start_time, end_time)
+                 video_clip.write_videofile(temp_clip_path, verbose=False, logger=None)
+                 video_clip.close()
+
+                 clips.append(f"Clip {clip_count + 1} ({start_time:.1f}s - {end_time:.1f}s)")
+                 clip_paths.append(temp_clip_path)
+
+                 clip_count += 1
+                 current_frame += frames_per_clip

              except Exception as e:
+                 print(f"Error creating clip {clip_count}: {str(e)}")
+                 current_frame += frames_per_clip  # advance past the failed segment so the loop cannot spin forever
+                 continue

+         cap.release()
+         return clips, clip_paths
+
      except Exception as e:
+         return [], []

  def process_video(video_file):
+     """Process uploaded video: cut into clips and generate embeddings"""
+     global model, processor, video_embeddings, video_clips

+     if model is None:
+         return "❌ Model not loaded. Please wait for initialization to complete.", []

+     if video_file is None:
+         return "❌ Please upload a video file.", []

      try:
+         # Reset previous data
+         video_embeddings = []
+         video_clips = []

+         # Cut video into 10-second clips
+         status_msg = "🎬 Cutting video into 10-second clips..."
+         clips_info, clip_paths = cut_video_into_clips(video_file, clip_duration=10)  # gr.File(type="filepath") passes a str path, not an object with .name

+         if not clip_paths:
+             return "❌ Error cutting video into clips.", []

+         status_msg += f"\n✅ Created {len(clip_paths)} clips"

+         # Process each clip with the model
+         status_msg += "\n🔄 Generating embeddings for video clips..."
+
+         # Create dataloader for batch processing
+         dataloader = DataLoader(
+             dataset=clip_paths,
+             batch_size=1,
+             shuffle=False,
+             collate_fn=lambda x: processor.process_videos(x),
+         )
+
+         embeddings = []
+         for i, batch_doc in enumerate(tqdm(dataloader, desc="Processing clips")):
+             with torch.no_grad():
+                 # Move to device
+                 device = next(model.parameters()).device
+                 batch_doc = {k: v.to(device) for k, v in batch_doc.items()}
+
+                 # Generate embeddings
+                 embedding = model(**batch_doc)
+                 embeddings.extend(list(torch.unbind(embedding.to("cpu"))))

          video_embeddings = embeddings
+         video_clips = clip_paths
+
+         status_msg += f"\n✅ Generated embeddings for {len(embeddings)} clips"
+         status_msg += "\n🎯 Ready for queries!"

+         return status_msg, clips_info

      except Exception as e:
+         return f"❌ Error processing video: {str(e)}", []

+ def search_video_clips(query):
+     """Search through video clips using text query"""
+     global model, processor, video_embeddings, video_clips
+
+     if model is None:
+         return "❌ Model not loaded.", None

      if not video_embeddings:
+         return "❌ No video processed. Please upload and process a video first.", None

+     if not query.strip():
+         return "❌ Please enter a search query.", None

      try:
+         # Process query
+         batch_queries = processor.process_queries([query])
+         device = next(model.parameters()).device
+         batch_queries = {k: v.to(device) for k, v in batch_queries.items()}
+
+         # Generate query embedding
+         with torch.no_grad():
+             query_embedding = model(**batch_queries)
+
+         # Calculate scores
+         scores = processor.score_multi_vector(query_embedding, video_embeddings)

+         # Find best match
+         best_clip_idx = scores[0].argmax().item()
+         best_score = scores[0][best_clip_idx].item()

+         # Get the best matching clip
+         best_clip_path = video_clips[best_clip_idx]

+         result_text = f"🎯 Best match: Clip {best_clip_idx + 1}\n"
+         result_text += f"📊 Similarity score: {best_score:.4f}\n"
+         result_text += f"🔍 Query: '{query}'"

+         # Append the top 3 results
+         top_3_scores = torch.topk(scores[0], min(3, len(scores[0])))
+         rankings = "\n\n📋 Top 3 Results:\n"
+         for i, (score, idx) in enumerate(zip(top_3_scores.values, top_3_scores.indices)):
+             rankings += f"{i+1}. Clip {idx+1} (Score: {score:.4f})\n"

+         return result_text + rankings, best_clip_path  # two values: results text, best clip path

      except Exception as e:
+         return f"❌ Error during search: {str(e)}", None

  # Create Gradio interface
+ def create_interface():
+     with gr.Blocks(title="Video RAG with ColQwen2.5 Omni", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# 🎬 Video RAG with ColQwen2.5 Omni")
+         gr.Markdown("Upload a video, and it will be automatically cut into 10-second clips. Then search through the clips using natural language queries!")
+
+         # Initialize model on startup
+         with gr.Row():
+             init_btn = gr.Button("🚀 Initialize Model", variant="primary")
+             init_status = gr.Textbox(label="Initialization Status", value="Click 'Initialize Model' to start")
+
+         init_btn.click(initialize_model, outputs=[init_status])
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("## 📤 Upload Video")
+                 video_input = gr.File(
+                     label="Upload Video File",
+                     file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
+                     type="filepath"
+                 )
+                 process_btn = gr.Button("🎬 Process Video", variant="secondary")
+
+                 processing_status = gr.Textbox(
+                     label="Processing Status",
+                     lines=6,
+                     value="Upload a video and click 'Process Video' to start"
+                 )
+
+                 clips_list = gr.JSON(
+                     label="Generated Clips",
+                     value=[]
                  )

+             with gr.Column(scale=1):
+                 gr.Markdown("## 🔍 Search Clips")
+                 query_input = gr.Textbox(
+                     label="Search Query",
+                     placeholder="e.g., 'a dragon spitting fire', 'person running', 'car driving'",
+                     lines=2
+                 )
+                 search_btn = gr.Button("🎯 Search", variant="primary")
+
+                 search_results = gr.Textbox(
+                     label="Search Results",
+                     lines=8
+                 )
+
+         with gr.Row():
+             result_video = gr.Video(
+                 label="Best Matching Clip",
+                 visible=True
              )
+
+         # Event handlers
+         process_btn.click(
+             process_video,
+             inputs=[video_input],
+             outputs=[processing_status, clips_list]
          )
+
+         search_btn.click(
+             search_video_clips,
+             inputs=[query_input],
+             outputs=[search_results, result_video]  # matches the two return values; a component may not be listed twice
+         )
+
+         # Auto-search on Enter
+         query_input.submit(
+             search_video_clips,
+             inputs=[query_input],
+             outputs=[search_results, result_video]
+         )
+
+         gr.Markdown("""
+         ## 📝 Instructions:
+         1. **Initialize**: Click 'Initialize Model' and wait for completion
+         2. **Upload**: Choose a video file (MP4, AVI, MOV, MKV, WebM)
+         3. **Process**: Click 'Process Video' to cut it into 10-second clips
+         4. **Search**: Enter a query describing what you're looking for
+         5. **Results**: View the best matching clip and similarity scores
+
+         ## 🔧 Features:
+         - ✂️ Automatic video segmentation into 10-second clips
+         - 🧠 AI-powered semantic video search using ColQwen2.5 Omni
+         - 🎯 Real-time similarity scoring and ranking
+         - 📱 OpenCV-based video processing for HF Spaces compatibility
+         - ⚡ Eager attention implementation (no flash-attn dependency)
+         """)

+     return demo

  if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch()
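
For readers who want to try the retrieval core without the Gradio UI, here is a minimal sketch assembled from the calls the new app.py makes (`process_videos`, `process_queries`, `score_multi_vector`). The model and processor IDs are the ones used in the diff; the clip paths and the example query are hypothetical placeholders, and pre-cut clips are assumed to exist on disk.

```python
import torch
from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor

# Same checkpoints as in the commit; eager attention avoids the flash-attn dependency.
model = ColQwen2_5Omni.from_pretrained(
    "vidore/colqwen-omni-v0.1",
    torch_dtype=torch.bfloat16,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    attn_implementation="eager",
).eval()
processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")

clip_paths = ["clip_000.mp4", "clip_001.mp4"]  # hypothetical pre-cut 10-second clips

# Embed each clip as a multi-vector "document", one at a time to limit memory use.
embeddings = []
for path in clip_paths:
    batch = processor.process_videos([path])
    batch = {k: v.to(model.device) for k, v in batch.items()}
    with torch.no_grad():
        embeddings.extend(list(torch.unbind(model(**batch).to("cpu"))))

# Embed the query and rank clips; scores has shape [num_queries, num_clips].
queries = processor.process_queries(["a dragon spitting fire"])  # example query
queries = {k: v.to(model.device) for k, v in queries.items()}
with torch.no_grad():
    query_embedding = model(**queries)
scores = processor.score_multi_vector(query_embedding, embeddings)
print("Best clip:", clip_paths[scores[0].argmax().item()])
```

`score_multi_vector` performs the late-interaction (ColBERT-style MaxSim) comparison, which is why both the query and each clip are kept as multi-vector embeddings rather than single pooled vectors.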