Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,366 +1,288 @@
--- app.py (previous version)
 import gradio as gr
 import torch
 import os
 import tempfile
-import shutil
 from PIL import Image
 from tqdm import tqdm
 from torch.utils.data import DataLoader
 from moviepy.editor import VideoFileClip
-import numpy as np
-import gc

 from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor

-# Global variables to store model and ...
 model = None
 processor = None
 video_embeddings = []
 video_clips = []
-temp_dirs = []

-def ...
-    """..."""
-    global temp_dirs
-    for temp_dir in temp_dirs:
-        try:
-            if os.path.exists(temp_dir):
-                shutil.rmtree(temp_dir)
-        except:
-            pass
-    temp_dirs = []
-    gc.collect()

-def load_model():
-    """Load the ColQwen2.5 Omni model and processor"""
     global model, processor

-    # ...
-            "manu/colqwen-omni-v0.1",
-            trust_remote_code=True
-        )
-        print("Model loaded successfully!")
-        return True
-    except Exception as e:
-        print(f"Error loading model: {e}")
-        return False

-def ...
-    """..."""
     clips = []
-    # ...
-    temp_dirs.append(temp_dir)

     try:
-        # ...
-        print(f"Creating {num_clips} clips of {clip_duration} seconds each")

-        # ...
-            # Extract clip
-            clip = video.subclip(start_time, end_time)

-            # Save clip to temporary file
-            clip_path = os.path.join(temp_dir, f"clip_{i:03d}.mp4")
-            clip.write_videofile(
-                clip_path,
-                verbose=False,
-                logger=None,
-                temp_audiofile_path=temp_dir
-            )

-            clips.append(clip_path)
-            clip.close()

-        video.close()
-        print(f"Successfully created {len(clips)} clips")
-        return clips, temp_dir

-    except Exception as e:
-        print(f"Error splitting video: {e}")
-        return [], temp_dir

-def embed_video_clips(clips):
-    """Embed video clips using ColQwen2.5 Omni"""
-    global model, processor

-    if not clips:
-        return []

-    embeddings = []

-    print("Generating embeddings for video clips...")

-    try:
-        # Process clips one by one to avoid memory issues
-        for i, clip_path in enumerate(tqdm(clips, desc="Embedding clips")):
             try:
-                # ...

-                # Clear GPU memory after each clip
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()

             except Exception as e:
-                print(f"Error ...
-                if embeddings:
-                    embeddings.append(torch.zeros_like(embeddings[0]))

-    except Exception as e:
-        print(f"Error in embedding process: {e}")
-        return []

-    return embeddings

-def search_clips(query, embeddings, clips, top_k=3):
-    """Search for relevant clips based on query"""
-    global model, processor

-    if not embeddings or not query.strip():
-        return []

-    try:
-        # Process the query
-        batch_queries = processor.process_queries([query])
-        batch_queries = {k: v.to(model.device) for k, v in batch_queries.items()}

-        # Get query embeddings
-        with torch.no_grad():
-            query_embeddings = model(**batch_queries)

-        # Calculate scores
-        scores = processor.score_multi_vector(query_embeddings, embeddings)

-        # Get top-k results
-        top_indices = torch.topk(scores[0], min(top_k, len(clips))).indices

-        results = []
-        for idx in top_indices:
-            if idx < len(clips):  # Safety check
-                results.append({
-                    'clip_path': clips[idx],
-                    'score': scores[0][idx].item(),
-                    'clip_index': idx.item()
-                })

-        return results

     except Exception as e:
-        # ...
-        return []

 def process_video(video_file):
-    """..."""
-    global video_embeddings, video_clips

-    if ...
-        return "Please ...

     try:
-        # ...
-            yield "Failed to load AI model. Please try again."
-            return

-        # ...

-        if not ...
-            return

-        yield f"Analyzing {len(clips)} video clips (this may take a few minutes)..."
-        embeddings = embed_video_clips(clips)

-        # Store globally for querying
         video_embeddings = embeddings
-        video_clips = ...

     except Exception as e:
-        # ...

-def ...
-    """..."""
-    global video_embeddings, video_clips

     if not video_embeddings:
-        return "...

-    if not ...
-        return "...

     try:
-        # ...

-        # ...
-            result_text += f"**Clip {i}:** {clip_time_start}s-{clip_time_end}s (Relevance: {result['score']:.3f})\n"

-        # Return ...
-        return result_text, ...

     except Exception as e:
-        return f"Error ...

-# Custom CSS for better styling
-css = """
-.gradio-container {
-    max-width: 1200px !important;
-}
-.video-container {
-    max-height: 500px;
-}
-"""

 # Create Gradio interface
-# ...
-                label="Search Query",
-                placeholder="Examples: 'person in red shirt', 'suspicious activity', 'vehicle entering', 'people fighting'",
-                lines=2
-            )

-            with gr.Row():
-                top_k_slider = gr.Slider(
-                    minimum=1,
-                    maximum=5,
-                    value=3,
-                    step=1,
-                    label="Number of results"
                 )
-            search_btn = gr.Button("Search", variant="secondary")

-            # ...
             )
-            # ...
             )

-    # ...
-    ### Usage Tips:
-    - **Upload**: Supported formats include MP4, AVI, MOV, etc.
-    - **Wait**: Processing may take several minutes depending on video length
-    - **Search**: Use descriptive queries like "person wearing blue jacket" or "car speeding"
-    - **Review**: Check multiple results to find the exact moment you're looking for

-    ### Legal Notice:
-    This tool is intended for authorized security personnel and law enforcement only.
-    Ensure proper legal authority before analyzing surveillance footage.
-    """)

-    # Event handlers
-    process_btn.click(
-        fn=process_video,
-        inputs=[video_input],
-        outputs=[process_status]
-    )

-    search_btn.click(
-        fn=query_video,
-        inputs=[query_input, top_k_slider],
-        outputs=[search_results, result_video]
-    )

-    # Allow enter key to trigger search
-    query_input.submit(
-        fn=query_video,
-        inputs=[query_input, top_k_slider],
-        outputs=[search_results, result_video]
-    )

-# Launch the app
 if __name__ == "__main__":
-    demo.launch(
-        server_port=7860,
-        share=False
-    )
+++ app.py (updated version)
 import gradio as gr
 import torch
+import cv2
 import os
 import tempfile
+import numpy as np
 from PIL import Image
 from tqdm import tqdm
 from torch.utils.data import DataLoader
 from moviepy.editor import VideoFileClip
 from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor
+import warnings
+warnings.filterwarnings("ignore")

+# Global variables to store model, processor, and embeddings
 model = None
 processor = None
 video_embeddings = []
 video_clips = []

+def initialize_model():
+    """Initialize the ColQwen2.5 Omni model and processor"""
     global model, processor

+    try:
+        # Load model with eager attention (no flash-attn)
+        model = ColQwen2_5Omni.from_pretrained(
+            "vidore/colqwen-omni-v0.1",
+            torch_dtype=torch.bfloat16,
+            device_map="cuda" if torch.cuda.is_available() else "cpu",
+            attn_implementation="eager",  # Use eager instead of flash-attn
+        ).eval()

+        processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
+        return "Model loaded successfully!"

+    except Exception as e:
+        return f"Error loading model: {str(e)}"

+def cut_video_into_clips(video_path, clip_duration=10):
+    """Cut video into clips of specified duration (default 10 seconds)"""
     clips = []
+    clip_paths = []

     try:
+        # Use OpenCV for more reliable video processing on HF Spaces
+        cap = cv2.VideoCapture(video_path)
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        duration = total_frames / fps

+        # Calculate frames per clip
+        frames_per_clip = int(fps * clip_duration)

+        clip_count = 0
+        current_frame = 0

+        while current_frame < total_frames:
+            # Create temporary file for this clip
+            temp_clip = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
+            temp_clip_path = temp_clip.name
+            temp_clip.close()

+            # Use moviepy for the actual cutting (more reliable for output)
             try:
+                start_time = current_frame / fps
+                end_time = min((current_frame + frames_per_clip) / fps, duration)

+                video_clip = VideoFileClip(video_path).subclip(start_time, end_time)
+                video_clip.write_videofile(temp_clip_path, verbose=False, logger=None)
+                video_clip.close()

+                clips.append(f"Clip {clip_count + 1} ({start_time:.1f}s - {end_time:.1f}s)")
+                clip_paths.append(temp_clip_path)

+                clip_count += 1
+                current_frame += frames_per_clip

             except Exception as e:
+                print(f"Error creating clip {clip_count}: {str(e)}")
+                continue

+        cap.release()
+        return clips, clip_paths

     except Exception as e:
+        return [], []

 def process_video(video_file):
+    """Process uploaded video: cut into clips and generate embeddings"""
+    global model, processor, video_embeddings, video_clips

+    if model is None:
+        return "Model not loaded. Please wait for initialization to complete.", []

+    if video_file is None:
+        return "Please upload a video file.", []

     try:
+        # Reset previous data
+        video_embeddings = []
+        video_clips = []

+        # Cut video into 10-second clips
+        status_msg = "Cutting video into 10-second clips..."
+        clips_info, clip_paths = cut_video_into_clips(video_file.name, clip_duration=10)

+        if not clip_paths:
+            return "Error cutting video into clips.", []

+        status_msg += f"\nCreated {len(clip_paths)} clips"

+        # Process each clip with the model
+        status_msg += "\nGenerating embeddings for video clips..."

+        # Create dataloader for batch processing
+        dataloader = DataLoader(
+            dataset=clip_paths,
+            batch_size=1,
+            shuffle=False,
+            collate_fn=lambda x: processor.process_videos(x),
+        )

+        embeddings = []
+        for i, batch_doc in enumerate(tqdm(dataloader, desc="Processing clips")):
+            with torch.no_grad():
+                # Move to device
+                device = next(model.parameters()).device
+                batch_doc = {k: v.to(device) for k, v in batch_doc.items()}

+                # Generate embeddings
+                embedding = model(**batch_doc)
+                embeddings.extend(list(torch.unbind(embedding.to("cpu"))))

         video_embeddings = embeddings
+        video_clips = clip_paths

+        status_msg += f"\nGenerated embeddings for {len(embeddings)} clips"
+        status_msg += "\nReady for queries!"

+        return status_msg, clips_info

     except Exception as e:
+        return f"Error processing video: {str(e)}", []

+def search_video_clips(query):
+    """Search through video clips using text query"""
+    global model, processor, video_embeddings, video_clips

+    if model is None:
+        return "Model not loaded.", None, ""

     if not video_embeddings:
+        return "No video processed. Please upload and process a video first.", None, ""

+    if not query.strip():
+        return "Please enter a search query.", None, ""

     try:
+        # Process query
+        batch_queries = processor.process_queries([query])
+        device = next(model.parameters()).device
+        batch_queries = {k: v.to(device) for k, v in batch_queries.items()}

+        # Generate query embedding
+        with torch.no_grad():
+            query_embedding = model(**batch_queries)

+        # Calculate scores
+        scores = processor.score_multi_vector(query_embedding, video_embeddings)

+        # Find best match
+        best_clip_idx = scores[0].argmax().item()
+        best_score = scores[0][best_clip_idx].item()

+        # Get the best matching clip
+        best_clip_path = video_clips[best_clip_idx]

+        result_text = f"Best match: Clip {best_clip_idx + 1}\n"
+        result_text += f"Similarity score: {best_score:.4f}\n"
+        result_text += f"Query: '{query}'"

+        # Return top 3 results text
+        top_3_scores = torch.topk(scores[0], min(3, len(scores[0])))
+        rankings = "\n\nTop 3 Results:\n"
+        for i, (score, idx) in enumerate(zip(top_3_scores.values, top_3_scores.indices)):
+            rankings += f"{i+1}. Clip {idx+1} (Score: {score:.4f})\n"

+        return result_text + rankings, best_clip_path, f"Best matching clip for: '{query}'"

     except Exception as e:
+        return f"Error during search: {str(e)}", None, ""

 # Create Gradio interface
+def create_interface():
+    with gr.Blocks(title="Video RAG with ColQwen2.5 Omni", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# Video RAG with ColQwen2.5 Omni")
+        gr.Markdown("Upload a video, and it will be automatically cut into 10-second clips. Then search through the clips using natural language queries!")

+        # Initialize model on startup
+        with gr.Row():
+            init_btn = gr.Button("Initialize Model", variant="primary")
+            init_status = gr.Textbox(label="Initialization Status", value="Click 'Initialize Model' to start")

+        init_btn.click(initialize_model, outputs=[init_status])

+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("## Upload Video")
+                video_input = gr.File(
+                    label="Upload Video File",
+                    file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
+                    type="filepath"
+                )
+                process_btn = gr.Button("Process Video", variant="secondary")

+                processing_status = gr.Textbox(
+                    label="Processing Status",
+                    lines=6,
+                    value="Upload a video and click 'Process Video' to start"
+                )

+                clips_list = gr.JSON(
+                    label="Generated Clips",
+                    value=[]
                 )

+            with gr.Column(scale=1):
+                gr.Markdown("## Search Clips")
+                query_input = gr.Textbox(
+                    label="Search Query",
+                    placeholder="e.g., 'a dragon spitting fire', 'person running', 'car driving'",
+                    lines=2
+                )
+                search_btn = gr.Button("Search", variant="primary")

+                search_results = gr.Textbox(
+                    label="Search Results",
+                    lines=8
+                )

+        with gr.Row():
+            result_video = gr.Video(
+                label="Best Matching Clip",
+                visible=True
             )

+        # Event handlers
+        process_btn.click(
+            process_video,
+            inputs=[video_input],
+            outputs=[processing_status, clips_list]
         )

+        search_btn.click(
+            search_video_clips,
+            inputs=[query_input],
+            outputs=[search_results, result_video, result_video]
+        )

+        # Auto-search on Enter
+        query_input.submit(
+            search_video_clips,
+            inputs=[query_input],
+            outputs=[search_results, result_video, result_video]
+        )

+        gr.Markdown("""
+        ## Instructions:
+        1. **Initialize**: Click 'Initialize Model' and wait for completion
+        2. **Upload**: Choose a video file (MP4, AVI, MOV, MKV, WebM)
+        3. **Process**: Click 'Process Video' to cut it into 10-second clips
+        4. **Search**: Enter a query describing what you're looking for
+        5. **Results**: View the best matching clip and similarity scores

+        ## Features:
+        - Automatic video segmentation into 10-second clips
+        - AI-powered semantic video search using ColQwen2.5 Omni
+        - Real-time similarity scoring and ranking
+        - OpenCV-based video processing for HF Spaces compatibility
+        - Eager attention implementation (no flash-attn dependency)
+        """)

+    return demo

 if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch()
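For reference, the retrieval flow added in this commit can be exercised outside the Gradio UI. The sketch below is a minimal, untested outline that condenses the calls appearing in the diff above (the from_pretrained loaders, process_videos, process_queries, and score_multi_vector); the clip file names and the example query are placeholders, not part of the commit.

# Standalone sketch of the clip-retrieval flow from the updated app.py.
# Clip paths and the query string are placeholders; API usage mirrors the diff.
import torch
from torch.utils.data import DataLoader
from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor

model = ColQwen2_5Omni.from_pretrained(
    "vidore/colqwen-omni-v0.1",
    torch_dtype=torch.bfloat16,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    attn_implementation="eager",  # same no-flash-attn setting as the Space
).eval()
processor = ColQwen2_5OmniProcessor.from_pretrained("manu/colqwen-omni-v0.1")
device = next(model.parameters()).device

clip_paths = ["clip_000.mp4", "clip_001.mp4"]  # placeholder 10-second clips
loader = DataLoader(clip_paths, batch_size=1, shuffle=False,
                    collate_fn=lambda paths: processor.process_videos(paths))

# Embed each clip as a multi-vector document, kept on CPU to limit GPU memory use.
clip_embeddings = []
with torch.no_grad():
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        clip_embeddings.extend(torch.unbind(model(**batch).to("cpu")))

    # Embed the text query and score it against every clip embedding.
    query = processor.process_queries(["a dragon spitting fire"])
    query = {k: v.to(device) for k, v in query.items()}
    query_embedding = model(**query)

scores = processor.score_multi_vector(query_embedding, clip_embeddings)
best = scores[0].argmax().item()
print(f"Best clip: {clip_paths[best]} (score {scores[0][best].item():.4f})")

The Gradio callbacks in the diff wrap these same four steps with UI plumbing (status messages, a JSON list of clips, and a video player for the top-scoring clip).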