NikhilJoson committed
Commit 9c81743 · verified
1 Parent(s): 166f7f3

Update app.py

Files changed (1):
  1. app.py +358 -227
app.py CHANGED
@@ -4,248 +4,372 @@ import cv2
 import os
 import tempfile
 import numpy as np
 from PIL import Image
 from tqdm import tqdm
 from torch.utils.data import DataLoader
-from moviepy.editor import VideoFileClip
 from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
 import spaces
 import warnings
 warnings.filterwarnings("ignore")

-# Global variables to store embeddings and clips (NOT model - that's loaded per GPU call)
-video_embeddings = []
-video_clips = []

-def load_model():
-    """Load model and processor inside GPU function"""
-    try:
-        model = ColQwen2_5Omni.from_pretrained(
             "vidore/colqwen-omni-v0.1",
             torch_dtype=torch.bfloat16,
             device_map="cuda" if torch.cuda.is_available() else "cpu",
-            attn_implementation="eager",  # Use eager instead of flash-attn
         ).eval()

-        processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
-        return model, processor
-    except Exception as e:
-        raise Exception(f"Error loading model: {str(e)}")
-
-def initialize_model():
-    """Initialize model on CPU (for status check only)"""
-    try:
-        # Just return success message - actual loading happens in GPU functions
-        return "✅ Ready to process! Model will be loaded when you upload a video."
-    except Exception as e:
-        return f"❌ Error: {str(e)}"
-
-def cut_video_into_clips(video_path, clip_duration=10):
-    """Cut video into clips of specified duration (default 10 seconds)
-
-    Handles videos of any length - the last clip will be shorter if video
-    duration is not exactly divisible by clip_duration.
-    """
-    clips = []
-    clip_paths = []
-
-    try:
-        # Use OpenCV for more reliable video processing on HF Spaces
-        cap = cv2.VideoCapture(video_path)
-        fps = cap.get(cv2.CAP_PROP_FPS)
-        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        duration = total_frames / fps
-
-        print(f"Video info: {duration:.2f}s total, {fps:.2f} FPS, {total_frames} frames")
-
-        # Calculate frames per clip
-        frames_per_clip = int(fps * clip_duration)
-
-        clip_count = 0
-        current_frame = 0

-        while current_frame < total_frames:
-            # Create temporary file for this clip
-            temp_clip = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
-            temp_clip_path = temp_clip.name
-            temp_clip.close()

-            # Use moviepy for the actual cutting (more reliable for output)
-            try:
-                start_time = current_frame / fps
-                end_time = min((current_frame + frames_per_clip) / fps, duration)

-                # Skip clips that are too short (less than 1 second)
-                clip_duration_actual = end_time - start_time
-                if clip_duration_actual < 1.0:
-                    print(f"Skipping clip {clip_count + 1} - too short ({clip_duration_actual:.1f}s)")
                     current_frame += frames_per_clip
                     continue

-                video_clip = VideoFileClip(video_path).subclip(start_time, end_time)
-                video_clip.write_videofile(temp_clip_path, verbose=False, logger=None)
-                video_clip.close()
-
-                # More detailed clip info showing actual duration
-                if clip_duration_actual < clip_duration:
-                    clips.append(f"Clip {clip_count + 1} ({start_time:.1f}s - {end_time:.1f}s) [Final clip - {clip_duration_actual:.1f}s]")
                 else:
-                    clips.append(f"Clip {clip_count + 1} ({start_time:.1f}s - {end_time:.1f}s)")

-                clip_paths.append(temp_clip_path)

-                clip_count += 1
-                current_frame += frames_per_clip

-            except Exception as e:
-                print(f"Error creating clip {clip_count}: {str(e)}")
-                current_frame += frames_per_clip  # Still advance to avoid infinite loop
-                continue
-
-        cap.release()
-        print(f"Successfully created {len(clip_paths)} clips from {duration:.2f}s video")
-        return clips, clip_paths
-
-    except Exception as e:
-        print(f"Error in cut_video_into_clips: {str(e)}")
-        return [], []

 @spaces.GPU
-def process_video(video_file):
-    """Process uploaded video: cut into clips and generate embeddings"""
-    global video_embeddings, video_clips
-
-    if video_file is None:
-        return "❌ Please upload a video file.", []
-
-    try:
-        # Load model inside GPU function
-        status_msg = "🔄 Loading model..."
-        model, processor = load_model()
-        status_msg += "\n✅ Model loaded successfully!"
-
-        # Reset previous data
-        video_embeddings = []
-        video_clips = []
-
-        # Cut video into 10-second clips
-        status_msg += "\n🎬 Cutting video into 10-second clips..."
-        clips_info, clip_paths = cut_video_into_clips(video_file.name, clip_duration=10)
-
-        if not clip_paths:
-            return "❌ Error cutting video into clips.", []
-
-        status_msg += f"\n✅ Created {len(clip_paths)} clips"
-
-        # Process each clip with the model
-        status_msg += "\n🔄 Generating embeddings for video clips..."
-
-        # Create dataloader for batch processing
-        dataloader = DataLoader(
-            dataset=clip_paths,
-            batch_size=1,
-            shuffle=False,
-            collate_fn=lambda x: processor.process_videos(x),
-        )
-
-        embeddings = []
-        for i, batch_doc in enumerate(tqdm(dataloader, desc="Processing clips")):
-            with torch.no_grad():
-                # Move to device
-                device = next(model.parameters()).device
-                batch_doc = {k: v.to(device) for k, v in batch_doc.items()}
-
-                # Generate embeddings
-                embedding = model(**batch_doc)
-                embeddings.extend(list(torch.unbind(embedding.to("cpu"))))
-
-        video_embeddings = embeddings
-        video_clips = clip_paths
-
-        status_msg += f"\n✅ Generated embeddings for {len(embeddings)} clips"
-        status_msg += "\n🎯 Ready for queries!"
-
-        return status_msg, clips_info
-
-    except Exception as e:
-        return f"❌ Error processing video: {str(e)}", []

 @spaces.GPU
-def search_video_clips(query):
-    """Search through video clips using text query"""
-    global video_embeddings, video_clips
-
-    if not video_embeddings:
-        return "❌ No video processed. Please upload and process a video first.", None, ""
-
-    if not query.strip():
-        return "❌ Please enter a search query.", None, ""
-
     try:
-        # Load model inside GPU function
-        model, processor = load_model()
-
-        # Process query
-        batch_queries = processor.process_queries([query])
-        device = next(model.parameters()).device
-        batch_queries = {k: v.to(device) for k, v in batch_queries.items()}
-
-        # Generate query embedding
-        with torch.no_grad():
-            query_embedding = model(**batch_queries)
-
-        # Calculate scores
-        scores = processor.score_multi_vector(query_embedding, video_embeddings)

-        # Find best match
-        best_clip_idx = scores[0].argmax().item()
-        best_score = scores[0][best_clip_idx].item()

-        # Get the best matching clip
-        best_clip_path = video_clips[best_clip_idx]

-        result_text = f"🎯 Best match: Clip {best_clip_idx + 1}\n"
-        result_text += f"📊 Similarity score: {best_score:.4f}\n"
-        result_text += f"🔍 Query: '{query}'"

-        # Return top 3 results text
-        top_3_scores = torch.topk(scores[0], min(3, len(scores[0])))
-        rankings = "\n\n📋 Top 3 Results:\n"
-        for i, (score, idx) in enumerate(zip(top_3_scores.values, top_3_scores.indices)):
-            rankings += f"{i+1}. Clip {idx+1} (Score: {score:.4f})\n"
-
-        return result_text + rankings, best_clip_path, f"Best matching clip for: '{query}'"

     except Exception as e:
-        return f"❌ Error during search: {str(e)}", None, ""

 # Create Gradio interface
 def create_interface():
-    with gr.Blocks(title="Video RAG with ColQwen2.5 Omni", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("# 🎬 Video RAG with ColQwen2.5 Omni")
-        gr.Markdown("Upload a video, and it will be automatically cut into 10-second clips. Then search through the clips using natural language queries!")
-
-        # Initialize model on startup
-        with gr.Row():
-            init_btn = gr.Button("🚀 Initialize Model", variant="primary")
-            init_status = gr.Textbox(label="Initialization Status", value="Click 'Initialize Model' to start")
-
-        init_btn.click(initialize_model, outputs=[init_status])

         with gr.Row():
             with gr.Column(scale=1):
-                gr.Markdown("## 📤 Upload Video")
                 video_input = gr.File(
                     label="Upload Video File",
                     file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
                     type="filepath"
                 )
-                process_btn = gr.Button("🎬 Process Video", variant="secondary")

                 processing_status = gr.Textbox(
                     label="Processing Status",
-                    lines=6,
-                    value="Upload a video and click 'Process Video' to start"
                 )

                 clips_list = gr.JSON(
@@ -254,63 +378,70 @@ def create_interface():
                 )

             with gr.Column(scale=1):
-                gr.Markdown("## 🔍 Search Clips")
                 query_input = gr.Textbox(
-                    label="Search Query",
-                    placeholder="e.g., 'a dragon spitting fire', 'person running', 'car driving'",
-                    lines=2
                 )
-                search_btn = gr.Button("🎯 Search", variant="primary")

-                search_results = gr.Textbox(
-                    label="Search Results",
-                    lines=8
                 )

-        with gr.Row():
-            result_video = gr.Video(
-                label="Best Matching Clip",
-                visible=True
-            )
-
         # Event handlers
         process_btn.click(
-            process_video,
-            inputs=[video_input],
-            outputs=[processing_status, clips_list]
         )

-        search_btn.click(
-            search_video_clips,
             inputs=[query_input],
-            outputs=[search_results, result_video, result_video]
         )

-        # Auto-search on Enter
         query_input.submit(
-            search_video_clips,
             inputs=[query_input],
-            outputs=[search_results, result_video, result_video]
         )

         gr.Markdown("""
-        ## 📝 Instructions:
-        1. **Initialize**: Click 'Initialize Model' and wait for completion
-        2. **Upload**: Choose a video file (MP4, AVI, MOV, MKV, WebM)
-        3. **Process**: Click 'Process Video' to cut it into 10-second clips
-        4. **Search**: Enter a query describing what you're looking for
-        5. **Results**: View the best matching clip and similarity scores

         ## 🔧 Features:
-        - ✂️ Automatic video segmentation into 10-second clips
-        - 🧠 AI-powered semantic video search using ColQwen2.5 Omni
-        - 🎯 Real-time similarity scoring and ranking
-        - 📱 OpenCV-based video processing for HF Spaces compatibility
-        - ⚡ Eager attention implementation (no flash-attn dependency)
         """)

     return demo

 if __name__ == "__main__":
     demo = create_interface()
     demo.launch()

 import os
 import tempfile
 import numpy as np
+import pickle
 from PIL import Image
 from tqdm import tqdm
 from torch.utils.data import DataLoader
+from moviepy import VideoFileClip
 from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
 import spaces
 import warnings
 warnings.filterwarnings("ignore")

+class VideoRAGProcessor:
+    """Class to handle model initialization and video processing"""
+
+    def __init__(self):
+        """Initialize model and processor directly"""
+        print("Loading ColQwen2.5 Omni model... This may take a few minutes.")
+
+        self.model = ColQwen2_5Omni.from_pretrained(
             "vidore/colqwen-omni-v0.1",
             torch_dtype=torch.bfloat16,
             device_map="cuda" if torch.cuda.is_available() else "cpu",
+            attn_implementation="eager",
         ).eval()

+        self.processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")

+        print("Model loaded successfully!")
+
+    def cut_video_into_clips(self, video_path, clip_duration=5):
+        """Cut video into clips of specified duration (default 5 seconds)"""
+        clips = []
+        clip_paths = []
+        clip_timestamps = []
+
+        try:
+            clips_dir = "./video_clips"
+            os.makedirs(clips_dir, exist_ok=True)
+
+            cap = cv2.VideoCapture(video_path)
+            fps = cap.get(cv2.CAP_PROP_FPS)
+            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+            duration = total_frames / fps
+
+            print(f"Video info: {duration:.2f}s total, {fps:.2f} FPS, {total_frames} frames")
+
+            frames_per_clip = int(fps * clip_duration)
+            clip_count = 0
+            current_frame = 0

+            while current_frame < total_frames:
+                clip_filename = f"clip_{clip_count + 1}.mp4"
+                clip_path = os.path.join(clips_dir, clip_filename)

+                try:
+                    start_time = current_frame / fps
+                    end_time = min((current_frame + frames_per_clip) / fps, duration)
+
+                    clip_duration_actual = end_time - start_time
+                    if clip_duration_actual < 1.0:
+                        print(f"Skipping clip {clip_count + 1} - too short ({clip_duration_actual:.1f}s)")
+                        current_frame += frames_per_clip
+                        continue
+
+                    if end_time >= duration:
+                        end_time = duration - 0.1
+                        clip_duration_actual = end_time - start_time
+                        if clip_duration_actual < 1.0:
+                            print(f"Skipping final clip - too short after adjustment ({clip_duration_actual:.1f}s)")
+                            break
+
+                    try:
+                        video_clip = VideoFileClip(video_path)
+                        sub_clip = video_clip.subclip(start_time, end_time)
+                        sub_clip.write_videofile(clip_path, verbose=False, logger=None)
+                        video_clip.close()
+                    except AttributeError:
+                        try:
+                            video_clip = VideoFileClip(video_path)
+                            sub_clip = video_clip.subclipped(start_time, end_time)
+                            sub_clip.write_videofile(clip_path, verbose=False, logger=None)
+                            video_clip.close()
+                        except (AttributeError, Exception):
+                            import subprocess
+                            cmd = [
+                                'ffmpeg', '-i', video_path,
+                                '-ss', str(start_time),
+                                '-t', str(clip_duration_actual),
+                                '-c', 'copy',
+                                '-avoid_negative_ts', 'make_zero',
+                                '-y', clip_path
+                            ]
+                            subprocess.run(cmd, capture_output=True, check=True)
+
+                    clip_timestamps.append({
+                        'clip_id': clip_count + 1,
+                        'start_time': start_time,
+                        'end_time': end_time,
+                        'duration': clip_duration_actual
+                    })
+
+                    if clip_duration_actual < clip_duration:
+                        clips.append(f"Clip {clip_count + 1} ({start_time:.1f}s - {end_time:.1f}s) [Final clip - {clip_duration_actual:.1f}s]")
+                    else:
+                        clips.append(f"Clip {clip_count + 1} ({start_time:.1f}s - {end_time:.1f}s)")
+
+                    clip_paths.append(clip_path)
+                    clip_count += 1
+                    current_frame += frames_per_clip
+
+                except Exception as e:
+                    print(f"Error creating clip {clip_count}: {str(e)}")
                     current_frame += frames_per_clip
                     continue
+
+            cap.release()
+            print(f"Successfully created {len(clip_paths)} clips from {duration:.2f}s video")
+            return clips, clip_paths, clip_timestamps
+
+        except Exception as e:
+            print(f"Error in cut_video_into_clips: {str(e)}")
+            return [], [], []
+
+    def process_and_analyze_video(self, video_file, query=None):
+        """Process video and optionally analyze with query in single GPU call"""
+        if video_file is None:
+            return "❌ Please upload a video file.", [], ""
+
+        try:
+            status_msg = "🎬 Processing video..."
+
+            # Clean up old clips
+            clips_dir = "./video_clips"
+            if os.path.exists(clips_dir):
+                for file in os.listdir(clips_dir):
+                    try:
+                        os.remove(os.path.join(clips_dir, file))
+                    except:
+                        pass
+
+            # Cut video into clips
+            status_msg += "\n🎬 Cutting video into 5-second clips..."
+            clips_info, clip_paths, clip_timestamps = self.cut_video_into_clips(video_file.name, clip_duration=5)
+
+            if not clip_paths:
+                return "❌ Error cutting video into clips.", [], ""
+
+            status_msg += f"\n✅ Created {len(clip_paths)} clips"
+
+            # Generate embeddings
+            status_msg += "\n🔄 Generating embeddings for video clips..."
+            embeddings = []
+
+            for i, clip_path in enumerate(tqdm(clip_paths, desc="Processing clips")):
+                try:
+                    batch_doc = self.processor.process_videos([clip_path])
+
+                    with torch.no_grad():
+                        device = next(self.model.parameters()).device
+                        batch_doc = {k: v.to(device) for k, v in batch_doc.items()}
+                        embedding = self.model(**batch_doc)
+                        embeddings.extend(list(torch.unbind(embedding.to("cpu"))))
+
+                except Exception as e:
+                    print(f"Error processing clip {i+1}: {e}")
+                    continue
+
+            status_msg += f"\n✅ Generated embeddings for {len(embeddings)} clips"
+
+            # Save embeddings and metadata to disk for persistence
+            embeddings_data = {
+                'embeddings': embeddings,
+                'clip_timestamps': clip_timestamps,
+                'clips_info': clips_info
+            }
+
+            with open('./video_embeddings.pkl', 'wb') as f:
+                pickle.dump(embeddings_data, f)
+
+            status_msg += "\n💾 Embeddings saved to disk"
+
+            # If query provided, analyze immediately
+            analysis_result = ""
+            if query and query.strip():
+                status_msg += f"\n🔍 Analyzing query: '{query}'"
+                analysis_result = self._analyze_with_embeddings(query, embeddings, clip_timestamps)
+            else:
+                status_msg += "\n🎯 Ready for queries!"
+
+            return status_msg, clips_info, analysis_result
+
+        except Exception as e:
+            return f"❌ Error processing video: {str(e)}", [], ""
+
+    def _analyze_with_embeddings(self, query, embeddings, clip_timestamps):
+        """Internal method to analyze with provided embeddings"""
+        try:
+            # Process query
+            batch_queries = self.processor.process_queries([query])
+            device = next(self.model.parameters()).device
+            batch_queries = {k: v.to(device) for k, v in batch_queries.items()}
+
+            # Generate query embedding
+            with torch.no_grad():
+                query_embedding = self.model(**batch_queries)
+
+            # Calculate scores
+            scores = self.processor.score_multi_vector(query_embedding, embeddings)
+
+            relevance_threshold = 0.5
+            relevant_clips = []
+
+            for idx, score in enumerate(scores[0]):
+                if score.item() > relevance_threshold:
+                    timestamp_info = clip_timestamps[idx]
+                    relevant_clips.append({
+                        'clip_id': idx + 1,
+                        'score': score.item(),
+                        'start_time': timestamp_info['start_time'],
+                        'end_time': timestamp_info['end_time']
+                    })
+
+            relevant_clips.sort(key=lambda x: x['score'], reverse=True)
+
+            # Generate response
+            if relevant_clips:
+                question_words = ['does', 'do', 'is', 'are', 'was', 'were', 'can', 'could', 'will', 'would', 'has', 'have']
+                is_question = any(query.lower().strip().startswith(word) for word in question_words) or query.strip().endswith('?')

+                if is_question:
+                    response = f"✅ **Yes.** The following moments show activity matching '{query}':\n\n"
                 else:
+                    response = f"🔍 **Analysis Results** for '{query}':\n\n"

+                response += "**📍 Relevant Time Segments:**\n"

+                for i, clip in enumerate(relevant_clips[:5]):
+                    start_min = int(clip['start_time'] // 60)
+                    start_sec = int(clip['start_time'] % 60)
+                    end_min = int(clip['end_time'] // 60)
+                    end_sec = int(clip['end_time'] % 60)
+
+                    confidence = "High" if clip['score'] > 0.8 else "Medium" if clip['score'] > 0.65 else "Low"
+
+                    response += f"• **{start_min:02d}:{start_sec:02d} - {end_min:02d}:{end_sec:02d}** "
+                    response += f"(Confidence: {confidence}, Score: {clip['score']:.3f})\n"

+                total_duration = sum(clip['end_time'] - clip['start_time'] for clip in relevant_clips)
+                response += f"\n📊 **Summary:** {len(relevant_clips)} relevant segment(s) found, "
+                response += f"totaling {total_duration:.1f} seconds of relevant footage."
+
+            else:
+                is_question = any(query.lower().strip().startswith(word) for word in ['does', 'do', 'is', 'are', 'was', 'were', 'can', 'could', 'will', 'would', 'has', 'have']) or query.strip().endswith('?')
+
+                if is_question:
+                    response = f"❌ **No.** No clear evidence found for '{query}' in the analyzed footage."
+                else:
+                    response = f"🔍 **No Results** found for '{query}' in the analyzed footage."
+
+                response += f"\n\n💡 **Suggestion:** Try rephrasing your query or check if the activity occurs in a different time period."
+
+            best_score = max(scores[0]).item()
+            response += f"\n\n🔧 **Technical Details:**\n"
+            response += f"• Analyzed {len(embeddings)} video segments\n"
+            response += f"• Highest similarity score: {best_score:.3f}\n"
+            response += f"• Relevance threshold: {relevance_threshold}\n"
+
+            return response
+
+        except Exception as e:
+            return f"❌ Error during analysis: {str(e)}"
+
+
+# Initialize processor instance (this will be recreated in each GPU call)
+def get_video_rag():
+    return VideoRAGProcessor()
+

 @spaces.GPU
+def process_video_only(video_file):
+    """Process video without query"""
+    video_rag = get_video_rag()
+    status, clips, _ = video_rag.process_and_analyze_video(video_file)
+    return status, clips
+

 @spaces.GPU
+def process_video_with_query(video_file, query):
+    """Process video and analyze query in single GPU call"""
+    video_rag = get_video_rag()
+    status, clips, analysis = video_rag.process_and_analyze_video(video_file, query)
+    return status, clips, analysis
+
+
+@spaces.GPU
+def analyze_with_saved_embeddings(query):
+    """Analyze query using saved embeddings"""
     try:
+        # Load saved embeddings
+        if not os.path.exists('./video_embeddings.pkl'):
+            return "❌ No video processed. Please upload and process a video first."

+        with open('./video_embeddings.pkl', 'rb') as f:
+            data = pickle.load(f)

+        embeddings = data['embeddings']
+        clip_timestamps = data['clip_timestamps']

+        # Initialize processor for analysis
+        video_rag = get_video_rag()
+        result = video_rag._analyze_with_embeddings(query, embeddings, clip_timestamps)

+        return result

     except Exception as e:
+        return f"❌ Error loading embeddings or analyzing: {str(e)}"
+
+
+# Gradio interface functions
+def process_video_interface(video_file):
+    """Interface function for processing video only"""
+    return process_video_only(video_file)
+
+
+def analyze_query_interface(query):
+    """Interface function for analyzing query"""
+    return analyze_with_saved_embeddings(query)
+
+
+def process_and_analyze_interface(video_file, query):
+    """Interface function for processing video with immediate query"""
+    if query and query.strip():
+        return process_video_with_query(video_file, query)
+    else:
+        status, clips = process_video_only(video_file)
+        return status, clips, ""
+

 # Create Gradio interface
 def create_interface():
+    with gr.Blocks(title="Security Camera AI Assistant", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🎥 Security Camera AI Assistant")
+        gr.Markdown("Upload security footage and ask questions about what happened. Get detailed analysis with precise timestamps!")

         with gr.Row():
             with gr.Column(scale=1):
+                gr.Markdown("## 📤 Upload Security Footage")
                 video_input = gr.File(
                     label="Upload Video File",
                     file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
                     type="filepath"
                 )
+
+                # Option to process with immediate query
+                immediate_query = gr.Textbox(
+                    label="Optional: Query to analyze immediately after processing",
+                    placeholder="Leave empty to just process, or enter a query to analyze right away",
+                    lines=2
+                )
+
+                process_btn = gr.Button("🎬 Process Video", variant="primary", size="lg")

                 processing_status = gr.Textbox(
                     label="Processing Status",
+                    lines=8,
+                    value="Model is ready! Upload a video and click 'Process Video' to start"
                 )

                 clips_list = gr.JSON(
                 )

             with gr.Column(scale=1):
+                gr.Markdown("## 🔍 Ask Questions About the Footage")
                 query_input = gr.Textbox(
+                    label="Security Analysis Query",
+                    placeholder="e.g., 'Does a person in grey shirt approach the building?', 'Is there any suspicious activity?', 'Show me when cars are parked'",
+                    lines=3
                 )
+                analyze_btn = gr.Button("🔍 Analyze Footage", variant="secondary", size="lg")

+                analysis_results = gr.Textbox(
+                    label="Analysis Results",
+                    lines=15,
+                    value="Process a video first, then ask questions about what you want to find in the footage."
                 )

         # Event handlers
         process_btn.click(
+            process_and_analyze_interface,
+            inputs=[video_input, immediate_query],
+            outputs=[processing_status, clips_list, analysis_results]
         )

+        analyze_btn.click(
+            analyze_query_interface,
             inputs=[query_input],
+            outputs=[analysis_results]
         )

         query_input.submit(
+            analyze_query_interface,
             inputs=[query_input],
+            outputs=[analysis_results]
         )

         gr.Markdown("""
+        ## 📝 How to Use:
+        1. **Upload**: Choose your security camera video file (MP4, AVI, MOV, MKV, WebM)
+        2. **Process**: Click 'Process Video' to analyze the footage (this creates 5-second segments)
+        3. **Ask**: Type your question about what you want to find in the footage
+        4. **Analyze**: Click 'Analyze Footage' to get detailed results with timestamps
+
+        ## 💡 Pro Tip:
+        - You can enter a query in the "Optional" field to analyze immediately after processing
+        - This saves GPU time by doing both operations in a single call
+
+        ## 🔍 Example Questions:
+        - "Does a person in red clothing enter the building?"
+        - "Is there any suspicious activity near the entrance?"
+        - "Show me when vehicles are present"
+        - "Are there people walking by during daytime?"
+        - "Is there movement in the parking area?"

         ## 🔧 Features:
+        - ✂️ Automatic video segmentation into 5-second clips
+        - 🧠 AI-powered semantic video analysis using ColQwen2.5 Omni
+        - 📍 Precise timestamp reporting (MM:SS format)
+        - 📊 Confidence scoring for each detection
+        - 🎯 Yes/No question answering for security queries
+        - ⚡ Smart relevance filtering to show only significant matches
+        - 💾 Persistent embeddings storage for multiple queries
         """)

     return demo

+
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch()
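
For reference, a minimal sketch of how the two-step flow added in this commit could be exercised outside the Gradio UI: process the footage once, then query the pickled embeddings repeatedly. The `./sample.mp4` path and the `SimpleNamespace` wrapper are illustrative assumptions (the processing path reads the upload via its `.name` attribute), and running locally assumes the `spaces` decorator is a pass-through off Hugging Face Spaces.

```python
# Hypothetical local smoke test for the updated app.py (paths are assumptions).
from types import SimpleNamespace

import app  # the updated app.py from this commit; importing does not launch the demo

# Step 1: segment ./sample.mp4 into 5-second clips and persist the clip
# embeddings plus timestamps to ./video_embeddings.pkl.
status, clips = app.process_video_only(SimpleNamespace(name="./sample.mp4"))
print(status)

# Step 2: ask as many questions as needed against the saved embeddings,
# without re-cutting or re-embedding the video.
print(app.analyze_with_saved_embeddings("Is there movement in the parking area?"))
```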