NikhilJoson committed · verified
Commit 3ce93a6 · 1 Parent(s): 2d9ab99

Update app.py

Files changed (1):
  1. app.py +219 -297
app.py CHANGED
@@ -1,366 +1,288 @@
  import gradio as gr
  import torch
  import os
  import tempfile
- import shutil
  from PIL import Image
  from tqdm import tqdm
  from torch.utils.data import DataLoader
  from moviepy.editor import VideoFileClip
- import numpy as np
- import gc
-
  from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor

- # Global variables to store model and processor
  model = None
  processor = None
  video_embeddings = []
  video_clips = []
- temp_dirs = []

- def cleanup_temp_files():
-     """Clean up temporary files and directories"""
-     global temp_dirs
-     for temp_dir in temp_dirs:
-         try:
-             if os.path.exists(temp_dir):
-                 shutil.rmtree(temp_dir)
-         except:
-             pass
-     temp_dirs = []
-     gc.collect()
-
- def load_model():
-     """Load the ColQwen2.5 Omni model and processor"""
      global model, processor

-     if model is None:
-         try:
-             print("Loading ColQwen2.5 Omni model...")
-             model = ColQwen2_5Omni.from_pretrained(
-                 "vidore/colqwen-omni-v0.1",
-                 torch_dtype=torch.bfloat16,
-                 device_map="auto",
-                 attn_implementation="eager",  # Use eager attention instead of flash-attn
-                 trust_remote_code=True
-             ).eval()
-             processor = ColQwen2_5OmniProcessor.from_pretrained(
-                 "manu/colqwen-omni-v0.1",
-                 trust_remote_code=True
-             )
-             print("Model loaded successfully!")
-             return True
-         except Exception as e:
-             print(f"Error loading model: {e}")
-             return False

-     return True

- def split_video_into_clips(video_path, clip_duration=10):
-     """Split video into clips of specified duration"""
      clips = []
-     temp_dir = tempfile.mkdtemp()
-     temp_dirs.append(temp_dir)

      try:
-         # Load the video
-         print(f"Loading video: {video_path}")
-         video = VideoFileClip(video_path)
-         duration = video.duration

-         print(f"Video duration: {duration:.2f} seconds")

-         # Calculate number of clips
-         num_clips = int(np.ceil(duration / clip_duration))
-         print(f"Creating {num_clips} clips of {clip_duration} seconds each")

-         for i in range(num_clips):
-             start_time = i * clip_duration
-             end_time = min((i + 1) * clip_duration, duration)

-             print(f"Processing clip {i+1}/{num_clips}: {start_time:.1f}s - {end_time:.1f}s")
-
-             # Extract clip
-             clip = video.subclip(start_time, end_time)
-
-             # Save clip to temporary file
-             clip_path = os.path.join(temp_dir, f"clip_{i:03d}.mp4")
-             clip.write_videofile(
-                 clip_path,
-                 verbose=False,
-                 logger=None,
-                 temp_audiofile_path=temp_dir
-             )
-
-             clips.append(clip_path)
-             clip.close()
-
-         video.close()
-         print(f"Successfully created {len(clips)} clips")
-         return clips, temp_dir
-
-     except Exception as e:
-         print(f"Error splitting video: {e}")
-         return [], temp_dir
-
- def embed_video_clips(clips):
-     """Embed video clips using ColQwen2.5 Omni"""
-     global model, processor
-
-     if not clips:
-         return []
-
-     embeddings = []
-
-     print("Generating embeddings for video clips...")
-
-     try:
-         # Process clips one by one to avoid memory issues
-         for i, clip_path in enumerate(tqdm(clips, desc="Embedding clips")):
              try:
-                 # Process single clip
-                 batch_doc = processor.process_videos([clip_path])

-                 with torch.no_grad():
-                     batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
-                     embeddings_doc = model(**batch_doc)
-                 embeddings.extend(list(torch.unbind(embeddings_doc.to("cpu"))))

-                 # Clear GPU memory after each clip
-                 if torch.cuda.is_available():
-                     torch.cuda.empty_cache()
-
              except Exception as e:
-                 print(f"Error processing clip {i}: {e}")
-                 # Add a dummy embedding to maintain clip indexing
-                 if embeddings:
-                     embeddings.append(torch.zeros_like(embeddings[0]))
-
-     except Exception as e:
-         print(f"Error in embedding process: {e}")
-         return []
-
-     return embeddings
-
- def search_clips(query, embeddings, clips, top_k=3):
-     """Search for relevant clips based on query"""
-     global model, processor
-
-     if not embeddings or not query.strip():
-         return []
-
-     try:
-         # Process the query
-         batch_queries = processor.process_queries([query])
-         batch_queries = {k: v.to(model.device) for k, v in batch_queries.items()}
-
-         # Get query embeddings
-         with torch.no_grad():
-             query_embeddings = model(**batch_queries)
-
-         # Calculate scores
-         scores = processor.score_multi_vector(query_embeddings, embeddings)
-
-         # Get top-k results
-         top_indices = torch.topk(scores[0], min(top_k, len(clips))).indices
-
-         results = []
-         for idx in top_indices:
-             if idx < len(clips):  # Safety check
-                 results.append({
-                     'clip_path': clips[idx],
-                     'score': scores[0][idx].item(),
-                     'clip_index': idx.item()
-                 })
-
-         return results

      except Exception as e:
-         print(f"Error searching clips: {e}")
-         return []

  def process_video(video_file):
-     """Main function to process uploaded video"""
-     global video_embeddings, video_clips

-     if video_file is None:
-         return "❌ Please upload a video file."

-     # Clean up previous session
-     cleanup_temp_files()

      try:
-         # Load model if not already loaded
-         yield "🔄 Loading AI model..."
-         if not load_model():
-             yield "❌ Failed to load AI model. Please try again."
-             return

-         # Split video into clips
-         yield "🎬 Splitting video into 10-second clips..."
-         clips, temp_dir = split_video_into_clips(video_file, clip_duration=10)

-         if not clips:
-             yield "❌ Error: Could not split video into clips."
-             return

-         # Embed clips
-         yield f"🧠 Analyzing {len(clips)} video clips (this may take a few minutes)..."
-         embeddings = embed_video_clips(clips)

-         if not embeddings:
-             yield "❌ Error: Could not generate embeddings for video clips."
-             return

-         # Store globally for querying
          video_embeddings = embeddings
-         video_clips = clips

-         yield f"✅ Successfully processed video into {len(clips)} clips! You can now search for specific content."

      except Exception as e:
-         yield f"❌ Error processing video: {str(e)}"

- def query_video(query_text, top_k=3):
-     """Query the processed video clips"""
-     global video_embeddings, video_clips

      if not video_embeddings:
-         return "⚠️ Please process a video first.", None

-     if not query_text.strip():
-         return "⚠️ Please enter a search query.", None

      try:
-         # Search for relevant clips
-         results = search_clips(query_text, video_embeddings, video_clips, top_k)

-         if not results:
-             return "❌ No results found for your query.", None

-         # Prepare results for display
-         result_text = f"🎯 Found {len(results)} relevant clips:\n\n"

-         for i, result in enumerate(results, 1):
-             clip_time_start = result['clip_index'] * 10  # Each clip is 10 seconds
-             clip_time_end = clip_time_start + 10
-             result_text += f"**Clip {i}:** {clip_time_start}s-{clip_time_end}s (Relevance: {result['score']:.3f})\n"

-         # Return the best matching clip for display
-         best_clip = results[0]['clip_path']

-         return result_text, best_clip

      except Exception as e:
-         return f"❌ Error querying video: {str(e)}", None
-
- # Custom CSS for better styling
- css = """
- .gradio-container {
-     max-width: 1200px !important;
- }
- .video-container {
-     max-height: 500px;
- }
- """

  # Create Gradio interface
- with gr.Blocks(css=css, title="CCTV Video Search") as demo:
-     gr.Markdown("""
-     # 🔍 AI-Powered CCTV Video Search
-
-     **For Security Professionals & Law Enforcement**
-
-     Upload surveillance footage and search for specific incidents, people, or activities using natural language.
-     The system automatically processes videos into searchable segments for rapid investigation.
-     """)
-
-     with gr.Row():
-         with gr.Column(scale=1):
-             gr.Markdown("### 📹 Video Upload & Processing")
-             video_input = gr.Video(
-                 label="Upload CCTV Video",
-                 height=300
-             )
-             process_btn = gr.Button(
-                 "🚀 Process Video",
-                 variant="primary",
-                 size="lg"
-             )
-             process_status = gr.Textbox(
-                 label="Processing Status",
-                 interactive=False,
-                 lines=2
-             )
-
-         with gr.Column(scale=1):
-             gr.Markdown("### 🔎 Search & Results")
-             query_input = gr.Textbox(
-                 label="Search Query",
-                 placeholder="Examples: 'person in red shirt', 'suspicious activity', 'vehicle entering', 'people fighting'",
-                 lines=2
-             )
-
-             with gr.Row():
-                 top_k_slider = gr.Slider(
-                     minimum=1,
-                     maximum=5,
-                     value=3,
-                     step=1,
-                     label="Number of results"
                  )
-                 search_btn = gr.Button("🔍 Search", variant="secondary")

-             search_results = gr.Textbox(
-                 label="Search Results",
-                 interactive=False,
-                 lines=6
              )
-
-     with gr.Row():
-         gr.Markdown("### 📺 Best Matching Clip")
-         result_video = gr.Video(
-             label="Most Relevant Clip",
-             height=400
          )

-     gr.Markdown("""
-     ### 💡 Usage Tips:
-     - **Upload**: Supported formats include MP4, AVI, MOV, etc.
-     - **Wait**: Processing may take several minutes depending on video length
-     - **Search**: Use descriptive queries like "person wearing blue jacket" or "car speeding"
-     - **Review**: Check multiple results to find the exact moment you're looking for
-
-     ### ⚖️ Legal Notice:
-     This tool is intended for authorized security personnel and law enforcement only.
-     Ensure proper legal authority before analyzing surveillance footage.
-     """)
-
-     # Event handlers
-     process_btn.click(
-         fn=process_video,
-         inputs=[video_input],
-         outputs=[process_status]
-     )
-
-     search_btn.click(
-         fn=query_video,
-         inputs=[query_input, top_k_slider],
-         outputs=[search_results, result_video]
-     )
-
-     # Allow enter key to trigger search
-     query_input.submit(
-         fn=query_video,
-         inputs=[query_input, top_k_slider],
-         outputs=[search_results, result_video]
-     )

- # Launch the app
  if __name__ == "__main__":
-     demo.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False
-     )
  import gradio as gr
  import torch
+ import cv2
  import os
  import tempfile
+ import numpy as np
  from PIL import Image
  from tqdm import tqdm
  from torch.utils.data import DataLoader
  from moviepy.editor import VideoFileClip
  from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
+ import warnings
+ warnings.filterwarnings("ignore")

+ # Global variables to store model, processor, and embeddings
  model = None
  processor = None
  video_embeddings = []
  video_clips = []

+ def initialize_model():
+     """Initialize the ColQwen2.5 Omni model and processor"""
      global model, processor

+     try:
+         # Load model with eager attention (no flash-attn)
+         model = ColQwen2_5Omni.from_pretrained(
+             "vidore/colqwen-omni-v0.1",
+             torch_dtype=torch.bfloat16,
+             device_map="cuda" if torch.cuda.is_available() else "cpu",
+             attn_implementation="eager",  # Use eager instead of flash-attn
+         ).eval()
+
+         processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
+         return "✅ Model loaded successfully!"

+     except Exception as e:
+         return f"❌ Error loading model: {str(e)}"

+ def cut_video_into_clips(video_path, clip_duration=10):
+     """Cut video into clips of specified duration (default 10 seconds)"""
      clips = []
+     clip_paths = []

      try:
+         # Use OpenCV to read video metadata (more reliable on HF Spaces)
+         cap = cv2.VideoCapture(video_path)
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+         duration = total_frames / fps

+         # Calculate frames per clip
+         frames_per_clip = int(fps * clip_duration)

+         clip_count = 0
+         current_frame = 0

+         while current_frame < total_frames:
+             # Create temporary file for this clip
+             temp_clip = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
+             temp_clip_path = temp_clip.name
+             temp_clip.close()

+             # Use moviepy for the actual cutting (more reliable for output)
              try:
+                 start_time = current_frame / fps
+                 end_time = min((current_frame + frames_per_clip) / fps, duration)

+                 video_clip = VideoFileClip(video_path).subclip(start_time, end_time)
+                 video_clip.write_videofile(temp_clip_path, verbose=False, logger=None)
+                 video_clip.close()
+
+                 clips.append(f"Clip {clip_count + 1} ({start_time:.1f}s - {end_time:.1f}s)")
+                 clip_paths.append(temp_clip_path)
+
+                 clip_count += 1
+                 current_frame += frames_per_clip

              except Exception as e:
+                 print(f"Error creating clip {clip_count}: {str(e)}")
+                 current_frame += frames_per_clip  # advance past the failed segment so the loop cannot spin forever
+                 continue

+         cap.release()
+         return clips, clip_paths
+
      except Exception as e:
+         return [], []

  def process_video(video_file):
+     """Process uploaded video: cut into clips and generate embeddings"""
+     global model, processor, video_embeddings, video_clips

+     if model is None:
+         return "❌ Model not loaded. Please wait for initialization to complete.", []

+     if video_file is None:
+         return "❌ Please upload a video file.", []

      try:
+         # Reset previous data
+         video_embeddings = []
+         video_clips = []

+         # Cut video into 10-second clips
+         status_msg = "🎬 Cutting video into 10-second clips..."
+         clips_info, clip_paths = cut_video_into_clips(video_file, clip_duration=10)  # gr.File(type="filepath") passes a str path, not an object with .name

+         if not clip_paths:
+             return "❌ Error cutting video into clips.", []

+         status_msg += f"\n✅ Created {len(clip_paths)} clips"

+         # Process each clip with the model
+         status_msg += "\n🔄 Generating embeddings for video clips..."
+
+         # Create dataloader for batch processing
+         dataloader = DataLoader(
+             dataset=clip_paths,
+             batch_size=1,
+             shuffle=False,
+             collate_fn=lambda x: processor.process_videos(x),
+         )
+
+         embeddings = []
+         for i, batch_doc in enumerate(tqdm(dataloader, desc="Processing clips")):
+             with torch.no_grad():
+                 # Move to device
+                 device = next(model.parameters()).device
+                 batch_doc = {k: v.to(device) for k, v in batch_doc.items()}
+
+                 # Generate embeddings
+                 embedding = model(**batch_doc)
+                 embeddings.extend(list(torch.unbind(embedding.to("cpu"))))

          video_embeddings = embeddings
+         video_clips = clip_paths
+
+         status_msg += f"\n✅ Generated embeddings for {len(embeddings)} clips"
+         status_msg += "\n🎯 Ready for queries!"

+         return status_msg, clips_info

      except Exception as e:
+         return f"❌ Error processing video: {str(e)}", []

+ def search_video_clips(query):
+     """Search through video clips using text query"""
+     global model, processor, video_embeddings, video_clips
+
+     if model is None:
+         return "❌ Model not loaded.", None

      if not video_embeddings:
+         return "❌ No video processed. Please upload and process a video first.", None

+     if not query.strip():
+         return "❌ Please enter a search query.", None

      try:
+         # Process query
+         batch_queries = processor.process_queries([query])
+         device = next(model.parameters()).device
+         batch_queries = {k: v.to(device) for k, v in batch_queries.items()}
+
+         # Generate query embedding
+         with torch.no_grad():
+             query_embedding = model(**batch_queries)
+
+         # Calculate scores
+         scores = processor.score_multi_vector(query_embedding, video_embeddings)

+         # Find best match
+         best_clip_idx = scores[0].argmax().item()
+         best_score = scores[0][best_clip_idx].item()

+         # Get the best matching clip
+         best_clip_path = video_clips[best_clip_idx]

+         result_text = f"🎯 Best match: Clip {best_clip_idx + 1}\n"
+         result_text += f"📊 Similarity score: {best_score:.4f}\n"
+         result_text += f"🔍 Query: '{query}'"

+         # Append the top 3 results
+         top_3_scores = torch.topk(scores[0], min(3, len(scores[0])))
+         rankings = "\n\n📋 Top 3 Results:\n"
+         for i, (score, idx) in enumerate(zip(top_3_scores.values, top_3_scores.indices)):
+             rankings += f"{i+1}. Clip {idx+1} (Score: {score:.4f})\n"

+         return result_text + rankings, best_clip_path  # two values: results text, best clip path

      except Exception as e:
+         return f"❌ Error during search: {str(e)}", None

  # Create Gradio interface
+ def create_interface():
+     with gr.Blocks(title="Video RAG with ColQwen2.5 Omni", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# 🎬 Video RAG with ColQwen2.5 Omni")
+         gr.Markdown("Upload a video, and it will be automatically cut into 10-second clips. Then search through the clips using natural language queries!")
+
+         # Initialize model on startup
+         with gr.Row():
+             init_btn = gr.Button("🚀 Initialize Model", variant="primary")
+             init_status = gr.Textbox(label="Initialization Status", value="Click 'Initialize Model' to start")
+
+         init_btn.click(initialize_model, outputs=[init_status])
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("## 📤 Upload Video")
+                 video_input = gr.File(
+                     label="Upload Video File",
+                     file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
+                     type="filepath"
+                 )
+                 process_btn = gr.Button("🎬 Process Video", variant="secondary")
+
+                 processing_status = gr.Textbox(
+                     label="Processing Status",
+                     lines=6,
+                     value="Upload a video and click 'Process Video' to start"
+                 )
+
+                 clips_list = gr.JSON(
+                     label="Generated Clips",
+                     value=[]
                  )

+             with gr.Column(scale=1):
+                 gr.Markdown("## 🔍 Search Clips")
+                 query_input = gr.Textbox(
+                     label="Search Query",
+                     placeholder="e.g., 'a dragon spitting fire', 'person running', 'car driving'",
+                     lines=2
+                 )
+                 search_btn = gr.Button("🎯 Search", variant="primary")
+
+                 search_results = gr.Textbox(
+                     label="Search Results",
+                     lines=8
+                 )
+
+         with gr.Row():
+             result_video = gr.Video(
+                 label="Best Matching Clip",
+                 visible=True
              )
+
+         # Event handlers
+         process_btn.click(
+             process_video,
+             inputs=[video_input],
+             outputs=[processing_status, clips_list]
          )
+
+         search_btn.click(
+             search_video_clips,
+             inputs=[query_input],
+             outputs=[search_results, result_video]  # matches the two return values; a component may not be listed twice
+         )
+
+         # Auto-search on Enter
+         query_input.submit(
+             search_video_clips,
+             inputs=[query_input],
+             outputs=[search_results, result_video]
+         )
+
+         gr.Markdown("""
+         ## 📝 Instructions:
+         1. **Initialize**: Click 'Initialize Model' and wait for completion
+         2. **Upload**: Choose a video file (MP4, AVI, MOV, MKV, WebM)
+         3. **Process**: Click 'Process Video' to cut it into 10-second clips
+         4. **Search**: Enter a query describing what you're looking for
+         5. **Results**: View the best matching clip and similarity scores
+
+         ## 🔧 Features:
+         - ✂️ Automatic video segmentation into 10-second clips
+         - 🧠 AI-powered semantic video search using ColQwen2.5 Omni
+         - 🎯 Real-time similarity scoring and ranking
+         - 📱 OpenCV-based video processing for HF Spaces compatibility
+         - ⚡ Eager attention implementation (no flash-attn dependency)
+         """)

+     return demo

  if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch()
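
For readers who want to try the retrieval core without the Gradio UI, here is a minimal sketch assembled from the calls the new app.py makes (`process_videos`, `process_queries`, `score_multi_vector`). The model and processor IDs are the ones used in the diff; the clip paths and the example query are hypothetical placeholders, and pre-cut clips are assumed to exist on disk.

```python
import torch
from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor

# Same checkpoints as in the commit; eager attention avoids the flash-attn dependency.
model = ColQwen2_5Omni.from_pretrained(
    "vidore/colqwen-omni-v0.1",
    torch_dtype=torch.bfloat16,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    attn_implementation="eager",
).eval()
processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")

clip_paths = ["clip_000.mp4", "clip_001.mp4"]  # hypothetical pre-cut 10-second clips

# Embed each clip as a multi-vector "document", one at a time to limit memory use.
embeddings = []
for path in clip_paths:
    batch = processor.process_videos([path])
    batch = {k: v.to(model.device) for k, v in batch.items()}
    with torch.no_grad():
        embeddings.extend(list(torch.unbind(model(**batch).to("cpu"))))

# Embed the query and rank clips; scores has shape [num_queries, num_clips].
queries = processor.process_queries(["a dragon spitting fire"])  # example query
queries = {k: v.to(model.device) for k, v in queries.items()}
with torch.no_grad():
    query_embedding = model(**queries)
scores = processor.score_multi_vector(query_embedding, embeddings)
print("Best clip:", clip_paths[scores[0].argmax().item()])
```

`score_multi_vector` performs the late-interaction (ColBERT-style MaxSim) comparison, which is why both the query and each clip are kept as multi-vector embeddings rather than single pooled vectors.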