Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -4,248 +4,372 @@ import cv2
  4   import os
  5   import tempfile
  6   import numpy as np
  7   from PIL import Image
  8   from tqdm import tqdm
  9   from torch.utils.data import DataLoader
 10 - from moviepy
 11   from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
 12   import spaces
 13   import warnings
 14   warnings.filterwarnings("ignore")
 16 - # Global variables to store embeddings and clips (NOT model - that's loaded per GPU call)
 17 - video_embeddings = []
 18 - video_clips = []
 21 -     """
 24           "vidore/colqwen-omni-v0.1",
 25           torch_dtype=torch.bfloat16,
 26           device_map="cuda" if torch.cuda.is_available() else "cpu",
 27 -         attn_implementation="eager",
 28       ).eval()
 30 -     processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
 31 -     return model, processor
 32 - except Exception as e:
 33 -     raise Exception(f"Error loading model: {str(e)}")
 35 - def initialize_model():
 36 -     """Initialize model on CPU (for status check only)"""
 37 -     try:
 38 -         # Just return success message - actual loading happens in GPU functions
 39 -         return "✅ Ready to process! Model will be loaded when you upload a video."
 40 -     except Exception as e:
 41 -         return f"❌ Error: {str(e)}"
 43 - def cut_video_into_clips(video_path, clip_duration=10):
 44 -     """Cut video into clips of specified duration (default 10 seconds)
 46 -     Handles videos of any length - the last clip will be shorter if video
 47 -     duration is not exactly divisible by clip_duration.
 48 -     """
 49 -     clips = []
 50 -     clip_paths = []
 52 -     try:
 53 -         # Use OpenCV for more reliable video processing on HF Spaces
 54 -         cap = cv2.VideoCapture(video_path)
 55 -         fps = cap.get(cv2.CAP_PROP_FPS)
 56 -         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
 57 -         duration = total_frames / fps
 59 -         print(f"Video info: {duration:.2f}s total, {fps:.2f} FPS, {total_frames} frames")
 61 -         # Calculate frames per clip
 62 -         frames_per_clip = int(fps * clip_duration)
 64 -         clip_count = 0
 65 -         current_frame = 0
 76 -             end_time = min((current_frame + frames_per_clip) / fps, duration)
 82               current_frame += frames_per_clip
 83               continue
 87 -             video_clip.close()
 89 -             # More detailed clip info showing actual duration
 90 -             if clip_duration_actual < clip_duration:
 91 -                 clips.append(f"Clip {clip_count + 1} ({start_time:.1f}s - {end_time:.1f}s) [Final clip - {clip_duration_actual:.1f}s]")
 92               else:
113   @spaces.GPU
114 - def
115 -     """Process
121 -     try:
122 -         # Load model inside GPU function
123 -         status_msg = "🔄 Loading model..."
124 -         model, processor = load_model()
125 -         status_msg += "\n✅ Model loaded successfully!"
127 -         # Reset previous data
128 -         video_embeddings = []
129 -         video_clips = []
131 -         # Cut video into 10-second clips
132 -         status_msg += "\n✂️ Cutting video into 10-second clips..."
133 -         clips_info, clip_paths = cut_video_into_clips(video_file.name, clip_duration=10)
135 -         if not clip_paths:
136 -             return "❌ Error cutting video into clips.", []
138 -         status_msg += f"\n✅ Created {len(clip_paths)} clips"
140 -         # Process each clip with the model
141 -         status_msg += "\n🔄 Generating embeddings for video clips..."
143 -         # Create dataloader for batch processing
144 -         dataloader = DataLoader(
145 -             dataset=clip_paths,
146 -             batch_size=1,
147 -             shuffle=False,
148 -             collate_fn=lambda x: processor.process_videos(x),
149 -         )
151 -         embeddings = []
152 -         for i, batch_doc in enumerate(tqdm(dataloader, desc="Processing clips")):
153 -             with torch.no_grad():
154 -                 # Move to device
155 -                 device = next(model.parameters()).device
156 -                 batch_doc = {k: v.to(device) for k, v in batch_doc.items()}
158 -                 # Generate embeddings
159 -                 embedding = model(**batch_doc)
160 -                 embeddings.extend(list(torch.unbind(embedding.to("cpu"))))
162 -         video_embeddings = embeddings
163 -         video_clips = clip_paths
165 -         status_msg += f"\n✅ Generated embeddings for {len(embeddings)} clips"
166 -         status_msg += "\n🎯 Ready for queries!"
168 -         return status_msg, clips_info
170 -     except Exception as e:
171 -         return f"❌ Error processing video: {str(e)}", []
173   @spaces.GPU
174 - def
175 -     """
184       try:
185 -         # Load
188 -         # Process query
189 -         batch_queries = processor.process_queries([query])
190 -         device = next(model.parameters()).device
191 -         batch_queries = {k: v.to(device) for k, v in batch_queries.items()}
193 -         # Generate query embedding
194 -         with torch.no_grad():
195 -             query_embedding = model(**batch_queries)
197 -         # Calculate scores
198 -         scores = processor.score_multi_vector(query_embedding, video_embeddings)
202 -         best_score = scores[0][best_clip_idx].item()
212 -         top_3_scores = torch.topk(scores[0], min(3, len(scores[0])))
213 -         rankings = "\n\n🏆 Top 3 Results:\n"
214 -         for i, (score, idx) in enumerate(zip(top_3_scores.values, top_3_scores.indices)):
215 -             rankings += f"{i+1}. Clip {idx+1} (Score: {score:.4f})\n"
217 -         return result_text + rankings, best_clip_path, f"Best matching clip for: '{query}'"
219       except Exception as e:
220 -         return f"❌ Error
222   # Create Gradio interface
223   def create_interface():
224 -     with gr.Blocks(title="
225 -         gr.Markdown("#
226 -         gr.Markdown("Upload
228 -         # Initialize model on startup
229 -         with gr.Row():
230 -             init_btn = gr.Button("🚀 Initialize Model", variant="primary")
231 -             init_status = gr.Textbox(label="Initialization Status", value="Click 'Initialize Model' to start")
233 -         init_btn.click(initialize_model, outputs=[init_status])
235           with gr.Row():
236               with gr.Column(scale=1):
237 -                 gr.Markdown("## 📤 Upload
238                   video_input = gr.File(
239                       label="Upload Video File",
240                       file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
241                       type="filepath"
242                   )
245                   processing_status = gr.Textbox(
246                       label="Processing Status",
247 -                     lines=
248 -                     value="Upload a video and click 'Process Video' to start"
249                   )
251                   clips_list = gr.JSON(
@@ -254,63 +378,70 @@ def create_interface():
254                   )
256               with gr.Column(scale=1):
257 -                 gr.Markdown("## 🔍
258                   query_input = gr.Textbox(
259 -                     label="
260 -                     placeholder="e.g., 'a
261 -                     lines=
262                   )
266 -                     label="
267 -                     lines=
268                   )
270 -         with gr.Row():
271 -             result_video = gr.Video(
272 -                 label="Best Matching Clip",
273 -                 visible=True
274 -             )
276           # Event handlers
277           process_btn.click(
279 -             inputs=[video_input],
280 -             outputs=[processing_status, clips_list]
281           )
285               inputs=[query_input],
286 -             outputs=[
287           )
289 -         # Auto-search on Enter
290           query_input.submit(
292               inputs=[query_input],
293 -             outputs=[
294           )
296           gr.Markdown("""
297 -         ## 📋
298 -         1. **
299 -         2. **
300 -         3. **
301 -         4. **
304           ## 🔧 Features:
305 -         - ✂️ Automatic video segmentation into
306 -         - 🧠 AI-powered semantic video
310           """)
312       return demo
314   if __name__ == "__main__":
315       demo = create_interface()
316       demo.launch()
  4   import os
  5   import tempfile
  6   import numpy as np
  7 + import pickle
  8   from PIL import Image
  9   from tqdm import tqdm
 10   from torch.utils.data import DataLoader
 11 + from moviepy import VideoFileClip
 12   from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
 13   import spaces
 14   import warnings
 15   warnings.filterwarnings("ignore")
 18 + class VideoRAGProcessor:
 19 +     """Class to handle model initialization and video processing"""
 21 +     def __init__(self):
 22 +         """Initialize model and processor directly"""
 23 +         print("Loading ColQwen2.5 Omni model... This may take a few minutes.")
 25 +         self.model = ColQwen2_5Omni.from_pretrained(
 26               "vidore/colqwen-omni-v0.1",
 27               torch_dtype=torch.bfloat16,
 28               device_map="cuda" if torch.cuda.is_available() else "cpu",
 29 +             attn_implementation="eager",
 30           ).eval()
 32 +         self.processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
 34 +         print("Model loaded successfully!")
 36 +     def cut_video_into_clips(self, video_path, clip_duration=5):
 37 +         """Cut video into clips of specified duration (default 5 seconds)"""
 38 +         clips = []
 39 +         clip_paths = []
 40 +         clip_timestamps = []
 42 +         try:
 43 +             clips_dir = "./video_clips"
 44 +             os.makedirs(clips_dir, exist_ok=True)
 46 +             cap = cv2.VideoCapture(video_path)
 47 +             fps = cap.get(cv2.CAP_PROP_FPS)
 48 +             total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
 49 +             duration = total_frames / fps
 51 +             print(f"Video info: {duration:.2f}s total, {fps:.2f} FPS, {total_frames} frames")
 53 +             frames_per_clip = int(fps * clip_duration)
 54 +             clip_count = 0
 55 +             current_frame = 0
 57 +             while current_frame < total_frames:
 58 +                 clip_filename = f"clip_{clip_count + 1}.mp4"
 59 +                 clip_path = os.path.join(clips_dir, clip_filename)
 61 +                 try:
 62 +                     start_time = current_frame / fps
 63 +                     end_time = min((current_frame + frames_per_clip) / fps, duration)
 65 +                     clip_duration_actual = end_time - start_time
 66 +                     if clip_duration_actual < 1.0:
 67 +                         print(f"Skipping clip {clip_count + 1} - too short ({clip_duration_actual:.1f}s)")
 68 +                         current_frame += frames_per_clip
 69 +                         continue
 71 +                     if end_time >= duration:
 72 +                         end_time = duration - 0.1
 73 +                         clip_duration_actual = end_time - start_time
 74 +                         if clip_duration_actual < 1.0:
 75 +                             print(f"Skipping final clip - too short after adjustment ({clip_duration_actual:.1f}s)")
 76 +                             break
 78 +                     try:
 79 +                         video_clip = VideoFileClip(video_path)
 80 +                         sub_clip = video_clip.subclip(start_time, end_time)
 81 +                         sub_clip.write_videofile(clip_path, verbose=False, logger=None)
 82 +                         video_clip.close()
 83 +                     except AttributeError:
 84 +                         try:
 85 +                             video_clip = VideoFileClip(video_path)
 86 +                             sub_clip = video_clip.subclipped(start_time, end_time)
 87 +                             sub_clip.write_videofile(clip_path, verbose=False, logger=None)
 88 +                             video_clip.close()
 89 +                         except (AttributeError, Exception):
 90 +                             import subprocess
 91 +                             cmd = [
 92 +                                 'ffmpeg', '-i', video_path,
 93 +                                 '-ss', str(start_time),
 94 +                                 '-t', str(clip_duration_actual),
 95 +                                 '-c', 'copy',
 96 +                                 '-avoid_negative_ts', 'make_zero',
 97 +                                 '-y', clip_path
 98 +                             ]
 99 +                             subprocess.run(cmd, capture_output=True, check=True)
101 +                     clip_timestamps.append({
102 +                         'clip_id': clip_count + 1,
103 +                         'start_time': start_time,
104 +                         'end_time': end_time,
105 +                         'duration': clip_duration_actual
106 +                     })
108 +                     if clip_duration_actual < clip_duration:
109 +                         clips.append(f"Clip {clip_count + 1} ({start_time:.1f}s - {end_time:.1f}s) [Final clip - {clip_duration_actual:.1f}s]")
110 +                     else:
111 +                         clips.append(f"Clip {clip_count + 1} ({start_time:.1f}s - {end_time:.1f}s)")
113 +                     clip_paths.append(clip_path)
114 +                     clip_count += 1
115 +                     current_frame += frames_per_clip
117 +                 except Exception as e:
118 +                     print(f"Error creating clip {clip_count}: {str(e)}")
119                       current_frame += frames_per_clip
120                       continue
122 +             cap.release()
123 +             print(f"Successfully created {len(clip_paths)} clips from {duration:.2f}s video")
124 +             return clips, clip_paths, clip_timestamps
126 +         except Exception as e:
127 +             print(f"Error in cut_video_into_clips: {str(e)}")
128 +             return [], [], []
130 +     def process_and_analyze_video(self, video_file, query=None):
131 +         """Process video and optionally analyze with query in single GPU call"""
132 +         if video_file is None:
133 +             return "❌ Please upload a video file.", [], ""
135 +         try:
136 +             status_msg = "🎬 Processing video..."
138 +             # Clean up old clips
139 +             clips_dir = "./video_clips"
140 +             if os.path.exists(clips_dir):
141 +                 for file in os.listdir(clips_dir):
142 +                     try:
143 +                         os.remove(os.path.join(clips_dir, file))
144 +                     except:
145 +                         pass
147 +             # Cut video into clips
148 +             status_msg += "\n✂️ Cutting video into 5-second clips..."
149 +             clips_info, clip_paths, clip_timestamps = self.cut_video_into_clips(video_file.name, clip_duration=5)
151 +             if not clip_paths:
152 +                 return "❌ Error cutting video into clips.", [], ""
154 +             status_msg += f"\n✅ Created {len(clip_paths)} clips"
156 +             # Generate embeddings
157 +             status_msg += "\n🔄 Generating embeddings for video clips..."
158 +             embeddings = []
160 +             for i, clip_path in enumerate(tqdm(clip_paths, desc="Processing clips")):
161 +                 try:
162 +                     batch_doc = self.processor.process_videos([clip_path])
164 +                     with torch.no_grad():
165 +                         device = next(self.model.parameters()).device
166 +                         batch_doc = {k: v.to(device) for k, v in batch_doc.items()}
167 +                         embedding = self.model(**batch_doc)
168 +                         embeddings.extend(list(torch.unbind(embedding.to("cpu"))))
170 +                 except Exception as e:
171 +                     print(f"Error processing clip {i+1}: {e}")
172 +                     continue
174 +             status_msg += f"\n✅ Generated embeddings for {len(embeddings)} clips"
176 +             # Save embeddings and metadata to disk for persistence
177 +             embeddings_data = {
178 +                 'embeddings': embeddings,
179 +                 'clip_timestamps': clip_timestamps,
180 +                 'clips_info': clips_info
181 +             }
183 +             with open('./video_embeddings.pkl', 'wb') as f:
184 +                 pickle.dump(embeddings_data, f)
186 +             status_msg += "\n💾 Embeddings saved to disk"
188 +             # If query provided, analyze immediately
189 +             analysis_result = ""
190 +             if query and query.strip():
191 +                 status_msg += f"\n🔍 Analyzing query: '{query}'"
192 +                 analysis_result = self._analyze_with_embeddings(query, embeddings, clip_timestamps)
193 +             else:
194 +                 status_msg += "\n🎯 Ready for queries!"
196 +             return status_msg, clips_info, analysis_result
198 +         except Exception as e:
199 +             return f"❌ Error processing video: {str(e)}", [], ""
201 +     def _analyze_with_embeddings(self, query, embeddings, clip_timestamps):
202 +         """Internal method to analyze with provided embeddings"""
203 +         try:
204 +             # Process query
205 +             batch_queries = self.processor.process_queries([query])
206 +             device = next(self.model.parameters()).device
207 +             batch_queries = {k: v.to(device) for k, v in batch_queries.items()}
209 +             # Generate query embedding
210 +             with torch.no_grad():
211 +                 query_embedding = self.model(**batch_queries)
213 +             # Calculate scores
214 +             scores = self.processor.score_multi_vector(query_embedding, embeddings)
216 +             relevance_threshold = 0.5
217 +             relevant_clips = []
219 +             for idx, score in enumerate(scores[0]):
220 +                 if score.item() > relevance_threshold:
221 +                     timestamp_info = clip_timestamps[idx]
222 +                     relevant_clips.append({
223 +                         'clip_id': idx + 1,
224 +                         'score': score.item(),
225 +                         'start_time': timestamp_info['start_time'],
226 +                         'end_time': timestamp_info['end_time']
227 +                     })
229 +             relevant_clips.sort(key=lambda x: x['score'], reverse=True)
231 +             # Generate response
232 +             if relevant_clips:
233 +                 question_words = ['does', 'do', 'is', 'are', 'was', 'were', 'can', 'could', 'will', 'would', 'has', 'have']
234 +                 is_question = any(query.lower().strip().startswith(word) for word in question_words) or query.strip().endswith('?')
236 +                 if is_question:
237 +                     response = f"✅ **Yes.** The following moments show activity matching '{query}':\n\n"
238                   else:
239 +                     response = f"📊 **Analysis Results** for '{query}':\n\n"
241 +                 response += "**🕐 Relevant Time Segments:**\n"
243 +                 for i, clip in enumerate(relevant_clips[:5]):
244 +                     start_min = int(clip['start_time'] // 60)
245 +                     start_sec = int(clip['start_time'] % 60)
246 +                     end_min = int(clip['end_time'] // 60)
247 +                     end_sec = int(clip['end_time'] % 60)
249 +                     confidence = "High" if clip['score'] > 0.8 else "Medium" if clip['score'] > 0.65 else "Low"
251 +                     response += f"• **{start_min:02d}:{start_sec:02d} - {end_min:02d}:{end_sec:02d}** "
252 +                     response += f"(Confidence: {confidence}, Score: {clip['score']:.3f})\n"
254 +                 total_duration = sum(clip['end_time'] - clip['start_time'] for clip in relevant_clips)
255 +                 response += f"\n📋 **Summary:** {len(relevant_clips)} relevant segment(s) found, "
256 +                 response += f"totaling {total_duration:.1f} seconds of relevant footage."
258 +             else:
259 +                 is_question = any(query.lower().strip().startswith(word) for word in ['does', 'do', 'is', 'are', 'was', 'were', 'can', 'could', 'will', 'would', 'has', 'have']) or query.strip().endswith('?')
261 +                 if is_question:
262 +                     response = f"❌ **No.** No clear evidence found for '{query}' in the analyzed footage."
263 +                 else:
264 +                     response = f"🔍 **No Results** found for '{query}' in the analyzed footage."
266 +                 response += f"\n\n💡 **Suggestion:** Try rephrasing your query or check if the activity occurs in a different time period."
268 +             best_score = max(scores[0]).item()
269 +             response += f"\n\n🔧 **Technical Details:**\n"
270 +             response += f"• Analyzed {len(embeddings)} video segments\n"
271 +             response += f"• Highest similarity score: {best_score:.3f}\n"
272 +             response += f"• Relevance threshold: {relevance_threshold}\n"
274 +             return response
276 +         except Exception as e:
277 +             return f"❌ Error during analysis: {str(e)}"
280 + # Initialize processor instance (this will be recreated in each GPU call)
281 + def get_video_rag():
282 +     return VideoRAGProcessor()
285   @spaces.GPU
286 + def process_video_only(video_file):
287 +     """Process video without query"""
288 +     video_rag = get_video_rag()
289 +     status, clips, _ = video_rag.process_and_analyze_video(video_file)
290 +     return status, clips
293   @spaces.GPU
294 + def process_video_with_query(video_file, query):
295 +     """Process video and analyze query in single GPU call"""
296 +     video_rag = get_video_rag()
297 +     status, clips, analysis = video_rag.process_and_analyze_video(video_file, query)
298 +     return status, clips, analysis
301 + @spaces.GPU
302 + def analyze_with_saved_embeddings(query):
303 +     """Analyze query using saved embeddings"""
304       try:
305 +         # Load saved embeddings
306 +         if not os.path.exists('./video_embeddings.pkl'):
307 +             return "❌ No video processed. Please upload and process a video first."
309 +         with open('./video_embeddings.pkl', 'rb') as f:
310 +             data = pickle.load(f)
312 +         embeddings = data['embeddings']
313 +         clip_timestamps = data['clip_timestamps']
315 +         # Initialize processor for analysis
316 +         video_rag = get_video_rag()
317 +         result = video_rag._analyze_with_embeddings(query, embeddings, clip_timestamps)
319 +         return result
321       except Exception as e:
322 +         return f"❌ Error loading embeddings or analyzing: {str(e)}"
325 + # Gradio interface functions
326 + def process_video_interface(video_file):
327 +     """Interface function for processing video only"""
328 +     return process_video_only(video_file)
331 + def analyze_query_interface(query):
332 +     """Interface function for analyzing query"""
333 +     return analyze_with_saved_embeddings(query)
336 + def process_and_analyze_interface(video_file, query):
337 +     """Interface function for processing video with immediate query"""
338 +     if query and query.strip():
339 +         return process_video_with_query(video_file, query)
340 +     else:
341 +         status, clips = process_video_only(video_file)
342 +         return status, clips, ""
345   # Create Gradio interface
346   def create_interface():
347 +     with gr.Blocks(title="Security Camera AI Assistant", theme=gr.themes.Soft()) as demo:
348 +         gr.Markdown("# 🎥 Security Camera AI Assistant")
349 +         gr.Markdown("Upload security footage and ask questions about what happened. Get detailed analysis with precise timestamps!")
351           with gr.Row():
352               with gr.Column(scale=1):
353 +                 gr.Markdown("## 📤 Upload Security Footage")
354                   video_input = gr.File(
355                       label="Upload Video File",
356                       file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
357                       type="filepath"
358                   )
360 +                 # Option to process with immediate query
361 +                 immediate_query = gr.Textbox(
362 +                     label="Optional: Query to analyze immediately after processing",
363 +                     placeholder="Leave empty to just process, or enter a query to analyze right away",
364 +                     lines=2
365 +                 )
367 +                 process_btn = gr.Button("🎬 Process Video", variant="primary", size="lg")
369                   processing_status = gr.Textbox(
370                       label="Processing Status",
371 +                     lines=8,
372 +                     value="Model is ready! Upload a video and click 'Process Video' to start"
373                   )
375                   clips_list = gr.JSON(
378                   )
380               with gr.Column(scale=1):
381 +                 gr.Markdown("## 🔍 Ask Questions About the Footage")
382                   query_input = gr.Textbox(
383 +                     label="Security Analysis Query",
384 +                     placeholder="e.g., 'Does a person in grey shirt approach the building?', 'Is there any suspicious activity?', 'Show me when cars are parked'",
385 +                     lines=3
386                   )
387 +                 analyze_btn = gr.Button("🔍 Analyze Footage", variant="secondary", size="lg")
389 +                 analysis_results = gr.Textbox(
390 +                     label="Analysis Results",
391 +                     lines=15,
392 +                     value="Process a video first, then ask questions about what you want to find in the footage."
393                   )
395           # Event handlers
396           process_btn.click(
397 +             process_and_analyze_interface,
398 +             inputs=[video_input, immediate_query],
399 +             outputs=[processing_status, clips_list, analysis_results]
400           )
402 +         analyze_btn.click(
403 +             analyze_query_interface,
404               inputs=[query_input],
405 +             outputs=[analysis_results]
406           )
408           query_input.submit(
409 +             analyze_query_interface,
410               inputs=[query_input],
411 +             outputs=[analysis_results]
412           )
414           gr.Markdown("""
415 +         ## 📋 How to Use:
416 +         1. **Upload**: Choose your security camera video file (MP4, AVI, MOV, MKV, WebM)
417 +         2. **Process**: Click 'Process Video' to analyze the footage (this creates 5-second segments)
418 +         3. **Ask**: Type your question about what you want to find in the footage
419 +         4. **Analyze**: Click 'Analyze Footage' to get detailed results with timestamps
421 +         ## 💡 Pro Tip:
422 +         - You can enter a query in the "Optional" field to analyze immediately after processing
423 +         - This saves GPU time by doing both operations in a single call
425 +         ## 🔍 Example Questions:
426 +         - "Does a person in red clothing enter the building?"
427 +         - "Is there any suspicious activity near the entrance?"
428 +         - "Show me when vehicles are present"
429 +         - "Are there people walking by during daytime?"
430 +         - "Is there movement in the parking area?"
432           ## 🔧 Features:
433 +         - ✂️ Automatic video segmentation into 5-second clips
434 +         - 🧠 AI-powered semantic video analysis using ColQwen2.5 Omni
435 +         - 🕐 Precise timestamp reporting (MM:SS format)
436 +         - 📊 Confidence scoring for each detection
437 +         - 🎯 Yes/No question answering for security queries
438 +         - ⚡ Smart relevance filtering to show only significant matches
439 +         - 💾 Persistent embeddings storage for multiple queries
440           """)
442       return demo
445   if __name__ == "__main__":
446       demo = create_interface()
447       demo.launch()
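A minimal sketch of how the saved-embeddings path added in this commit could be exercised outside the Gradio UI. It assumes the updated file is importable as "app", that a video has already been processed so ./video_embeddings.pkl exists, and that the @spaces.GPU decorator passes through when run outside a ZeroGPU Space; the query string is only an example.

# Sketch: query previously saved embeddings without launching the Gradio app.
import pickle
import app  # the updated app.py from this commit

# Inspect what process_and_analyze_video() persisted to disk.
with open("./video_embeddings.pkl", "rb") as f:
    data = pickle.load(f)
print(f"{len(data['embeddings'])} clip embeddings, {len(data['clip_timestamps'])} timestamp records")

# Run a query against the saved embeddings (this reloads the model, so the first call is slow).
print(app.analyze_with_saved_embeddings("Is there movement in the parking area?"))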