Artyom Boyko committed on
Commit 076e4c4 · 1 Parent(s): 5d19fd1

Fix Docker problem.

Dockerfile CHANGED
@@ -1,4 +1,4 @@
-FROM ubuntu:latest
+FROM ubuntu:24.04
 
 # ENV PIP_ROOT_USER_ACTION=ignore
 ARG USERNAME=mcp_user
@@ -8,7 +8,9 @@ ARG APP_WORKDIR=app_srv
 # Update OS and install packages
 WORKDIR /tmp/
 COPY requirements.txt packages.txt ./
-RUN apt-get -y update && apt-get -y upgrade && xargs apt -y install < packages.txt
+RUN apt-get -y update && apt-get -y upgrade
+# RUN xargs apt -y install < packages.txt
+RUN apt-get -y install ffmpeg git git-lfs htop iotop libxml2 libopenblas-dev libssl-dev python3-pip python3-wheel python3-setuptools python-is-python3 wget zlib1g net-tools curl
 
 # Install CUDA 12.8 from Internet
 RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb \
@@ -44,7 +46,7 @@ RUN userdel -r ubuntu || true
 
 ## Set up a new user with user ID 1000
 ## https://huggingface.co/docs/hub/spaces-sdks-docker#permissions
-RUN useradd -m -u $USER_UID $USERNAME
+RUN useradd -s /bin/bash -m -u $USER_UID $USERNAME
 
 ## Switch to the "user" user
 USER $USERNAME
README.md CHANGED
@@ -1,12 +1,12 @@
(no content changes: before and after are identical, likely a line-ending normalization; file shown once)
---
title: Aura AI Scan
short_description: Flexible tool for in-depth analysis of YouTube videos.
emoji: 📊
sdk: docker
pinned: false
license: gpl-3.0
thumbnail: >-
  https://cdn-uploads.huggingface.co/production/uploads/630f0cfbe52a259b855d290e/MeV4xfCNESPfFCmtZM_rg.jpeg
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app_srv/app_srv.py CHANGED
@@ -1,481 +1,481 @@
(no content changes in the commit: before and after are identical, likely a line-ending normalization; file shown once)
import gradio as gr
import torch
import os
import shutil
import json
import base64
import tempfile
import time  # To simulate delays and show progress more smoothly
from pathlib import Path

from downloader import download_youtube_video
from video_processing import extract_frames_with_timestamps, generate_frame_descriptions
from audio_processing import transcribe_audio
from model_api import get_device_and_dtype

device, dtype = get_device_and_dtype()

DEFAULT_PROMPT = """You are an expert at analyzing video, so pay close attention. Your main goal is to analyze the frame and find information in it to answer the MAIN QUESTION. Pay attention to details.
Provide the analysis for each frame in the following format, focusing on the frame at timestamp {timestamp}:
FRAME: {timestamp}
OBJECTS: List of objects with their count, for example: Bengal tiger - 1, Volvo car - 1, Person - 2 (male, female). Mentioning an object in the text on the frame does not count as a separate object.
If there are no objects, the field is equal to NONE.
BACKGROUND: Description of background and surroundings, e.g.: Muddy brown water. A road in the distance. An abandoned building on the horizon.
ACTION: A detailed description of what is happening in the frame, for example: A Bengal tiger swimming in murky water, its head and part of its back visible above the surface.
The shot is taken from above, above the tiger. A blue Volvo car is driving along the road in the distance. A part of a tree is visible in the right part of the frame.
If there are no actions, the field is equal to NONE.
RECOGNIZED TEXT: Any text recognized in the frame, e.g.: "STOP", "VOLVO", "EXIT 25". Only the text that is present in the frame; if none is present, this field is NONE.

OBJECTS:
BACKGROUND:
ACTION:
RECOGNIZED TEXT:
"""

def analyze_video_data(
    youtube_url: str,
    prompt: str = DEFAULT_PROMPT,  # added to match the six inputs wired to submit_btn below
    quality: int = 720,
    time_step: float = 5.0,
    include_audio_data: bool = False,
    include_frame_data: bool = False
) -> str:
    """
    This tool returns a text description of the frames from a YouTube clip and a full transcription of the audio track of that clip.
    Analyzing clips can be time-consuming (depending on the specified quality). You should always wait for the process to complete.

    Args:
        youtube_url (str): The URL of the YouTube video to be analyzed.
        prompt (str, optional): The frame-analysis prompt template; may contain a {timestamp}
            placeholder. Defaults to DEFAULT_PROMPT.
        quality (int, optional): The desired video quality for download (e.g., 144, 240, 360, 480, 720, 1080, 1440, 2160).
            Defaults to 720.
        time_step (float, optional): The interval in seconds at which frames will be extracted
            from the video. Defaults to 5.0.
        include_audio_data (bool, optional): If True, the base64-encoded audio data (MP3) will be included in the JSON results. Defaults to False.
        include_frame_data (bool, optional): If True, base64-encoded image data (JPG) for each extracted frame will be included in the JSON results. Defaults to False.

    Returns:
        str: A JSON string containing the analysis results.
            On success, it includes 'status': 'success', 'frame_analysis' (a list of dictionaries
            with 'timestamp', 'description', and optional 'image_base64'),
            'audio_transcription', and optional 'audio_base64'.
            On error, it includes 'status': 'error' and a 'message' detailing the error.

    Raises:
        Exception: Catches and reports any exceptions that occur during the video
            downloading, processing, or analysis steps, returning an error
            message within the JSON string.
    """
    results = {
        "status": "success",
        "message": "",
        "frame_analysis": [],
        "audio_transcription": "",
        "audio_base64": ""
    }

    try:
        # For debugging purposes
        print(f'Starting processing task with {youtube_url}, quality: {quality}, time step: {time_step}, include audio: {include_audio_data}, include frames: {include_frame_data}.')

        video_data = download_youtube_video(
            url=youtube_url, video_quality=quality,  # youtube_cookies=cookies
        )

        frames_dict = extract_frames_with_timestamps(
            video_path=video_data["video_path"],
            output_dir=video_data["data_path"],
            time_step=time_step,
            hw_device="cuda",
        )

        descriptions = generate_frame_descriptions(
            frames_dict=frames_dict,
            custom_prompt=prompt,
            device=device,
            torch_dtype=dtype,
        )

        transcription_text = transcribe_audio(video_data["audio_path"])

        for timestamp, frame_path in frames_dict.items():
            description = descriptions.get(timestamp, "No description available")
            frame_entry = {"timestamp": timestamp, "description": description, "image_base64": ""}

            if include_frame_data and os.path.exists(frame_path):
                with open(frame_path, "rb") as f:
                    frame_entry["image_base64"] = base64.b64encode(f.read()).decode("utf-8")

            results["frame_analysis"].append(frame_entry)

        results["audio_transcription"] = transcription_text

        if include_audio_data and os.path.exists(video_data["audio_path"]):
            with open(video_data["audio_path"], "rb") as f:
                results["audio_base64"] = base64.b64encode(f.read()).decode("utf-8")

        return json.dumps(results, indent=2, ensure_ascii=False)

    except Exception as e:
        results["status"] = "error"
        results["message"] = f"Processing error: {str(e)}"
        results["frame_analysis"] = []
        results["audio_transcription"] = ""
        results["audio_base64"] = ""

        return json.dumps(results, indent=2, ensure_ascii=False)


def get_video_html_from_json(json_string: str) -> str:
    try:
        data = json.loads(json_string)
        if data["status"] == "error":
            return f"<p style='color:red;'>Error: {data['message']}</p>"

        html_content = ""
        if not data["frame_analysis"]:
            html_content += "<p>No frames analyzed or included.</p>"
        else:
            for frame in data["frame_analysis"]:
                timestamp = frame.get("timestamp", "N/A")
                description = frame.get("description", "No description available")
                image_base64 = frame.get("image_base64", "")

                html_content += "<div style='margin-bottom: 20px; border: 1px solid #eee; padding: 10px; border-radius: 8px;'>"
                html_content += f"<h3>FRAME: {timestamp}</h3>"
                if image_base64:
                    html_content += f"<img src='data:image/jpeg;base64,{image_base64}' style='max-width: 100%; height: auto; border-radius: 4px; margin-bottom: 10px;'><br>"
                else:
                    html_content += "<p>Image data not included for this frame (checkbox 'Include Frame Data' was not selected).</p>"
                html_content += f"<p><strong>Description:</strong> {description}</p>"
                html_content += "</div>"
        return html_content
    except json.JSONDecodeError:
        return "<p style='color:red;'>Invalid JSON response.</p>"
    except Exception as e:
        return f"<p style='color:red;'>Error processing video data for display: {str(e)}</p>"


def get_audio_data_from_json(json_string: str) -> tuple[str, str | None]:
    try:
        data = json.loads(json_string)
        if data["status"] == "error":
            return f"Error: {data['message']}", None

        transcription = data.get("audio_transcription", "No transcription available.")
        audio_base64 = data.get("audio_base64", "")

        if audio_base64:
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
                temp_audio_file.write(base64.b64decode(audio_base64))
                temp_audio_path = temp_audio_file.name

            return transcription, temp_audio_path
        else:
            transcription += "\n\nAudio data not included (checkbox 'Include Audio Data' was not selected)."
            return transcription, None

    except json.JSONDecodeError:
        return "Invalid JSON response for audio.", None
    except Exception as e:
        return f"Error processing audio data for display: {str(e)}", None


# Wrapper function for analysis with a progress bar
def analyze_video_data_with_progress_wrapper(
    youtube_url: str,
    prompt: str,
    quality: int,
    time_step: float,
    include_audio_data: bool,
    include_frame_data: bool,
    progress=gr.Progress()
):
    results = {
        "status": "pending",
        "message": "Starting analysis...",
        "frame_analysis": [],
        "audio_transcription": "",
        "audio_base64": ""
    }

    try:
        progress(0, desc="Downloading video...")

        # Step 1: Download the YouTube video
        video_data = download_youtube_video(
            url=youtube_url, video_quality=quality,  # youtube_cookies=cookies
        )
        progress(0.25, desc="Extracting frames...")

        # Step 2: Extract frames from the video
        frames_dict = extract_frames_with_timestamps(
            video_path=video_data["video_path"],
            output_dir=video_data["data_path"],
            time_step=time_step,
            hw_device="cuda",
        )
        progress(0.5, desc="Generating frame descriptions...")

        # Step 3: Generate frame descriptions
        descriptions = generate_frame_descriptions(
            frames_dict=frames_dict,
            custom_prompt=prompt,
            device=device,
            torch_dtype=dtype,
        )
        progress(0.75, desc="Transcribing audio...")

        # Step 4: Transcribe the audio
        transcription_text = transcribe_audio(video_data["audio_path"])
        progress(0.9, desc="Consolidating results...")

        # Build the final results dictionary
        for timestamp, frame_path in frames_dict.items():
            description = descriptions.get(timestamp, "No description available")
            frame_entry = {"timestamp": timestamp, "description": description, "image_base64": ""}

            if include_frame_data and os.path.exists(frame_path):
                with open(frame_path, "rb") as f:
                    frame_entry["image_base64"] = base64.b64encode(f.read()).decode("utf-8")

            results["frame_analysis"].append(frame_entry)

        results["audio_transcription"] = transcription_text

        if include_audio_data and os.path.exists(video_data["audio_path"]):
            with open(video_data["audio_path"], "rb") as f:
                results["audio_base64"] = base64.b64encode(f.read()).decode("utf-8")

        results["status"] = "success"
        results["message"] = "Analysis complete!"
        progress(1.0, desc="Analysis complete!")
        yield json.dumps(results, indent=2, ensure_ascii=False)

    except Exception as e:
        results["status"] = "error"
        results["message"] = f"Processing error: {str(e)}"
        results["frame_analysis"] = []
        results["audio_transcription"] = ""
        results["audio_base64"] = ""

        progress(1.0, desc="Analysis failed!")
        yield json.dumps(results, indent=2, ensure_ascii=False)


# The path where the cookies.txt file is saved
working_cookies_file_path = "/home/mcp_user/app_srv/cookies.txt"

# Global variable storing the path of the last temporary Gradio file
gradio_temp_cookies_file_path = None


def upload_cookies_file(file):
    """
    Copies the uploaded file from Gradio temporary storage to working_cookies_file_path.
    Saves the path of the temporary file to a global variable for later deletion.
    """
    global gradio_temp_cookies_file_path  # This global is reassigned below

    if file is None:
        return "Please select a cookie file to upload first."

    source_path = file.name
    gradio_temp_cookies_file_path = source_path

    try:
        shutil.copy(source_path, working_cookies_file_path)

        return (f"File successfully copied and saved as: {working_cookies_file_path}.\n"
                f"Path to the temporary Gradio file: {source_path}.")

    except Exception as e:
        return f"Error occurred while copying the file: {e}"


def clear_cookies_files():
    """
    Deletes working_cookies_file_path and also attempts to delete the last temporary Gradio file,
    whose path is stored globally.
    """
    global gradio_temp_cookies_file_path  # This global is reassigned below

    status_messages = []  # List to collect status messages

    if os.path.exists(working_cookies_file_path):
        try:
            os.remove(working_cookies_file_path)
            status_messages.append(f"File {working_cookies_file_path} successfully deleted.")
        except Exception as e:
            status_messages.append(f"Error deleting file {working_cookies_file_path}: {e}.")
    else:
        status_messages.append(f"File {working_cookies_file_path} not found.")

    if gradio_temp_cookies_file_path and os.path.exists(gradio_temp_cookies_file_path):
        try:
            os.remove(gradio_temp_cookies_file_path)
            status_messages.append(f"Temporary Gradio file ({os.path.basename(gradio_temp_cookies_file_path)}) was successfully deleted.")
        except Exception as e:
            status_messages.append(f"Error deleting temporary Gradio file ({os.path.basename(gradio_temp_cookies_file_path)}): {e}.")
        finally:
            gradio_temp_cookies_file_path = None
    elif gradio_temp_cookies_file_path is None:
        status_messages.append("Path to the temporary Gradio file is unknown.")
    else:
        status_messages.append(f"Temporary Gradio file ({os.path.basename(gradio_temp_cookies_file_path)}) no longer exists.")

    # Merge all status messages
    return "\n".join(status_messages)


with gr.Blocks(title="Video Analysis Tool") as demo:

    gr.Markdown(
        """
        # 🎥 YouTube Comprehensive Video Analysis Tool
        YouTube AI video analysis: get frame-by-frame descriptions with timestamps and an audio transcription.
        Since most MCP servers operate without a GUI, this version includes several tools to help you view the JSON results, both raw and visually.

        This Space can also be run locally, since it is a Docker container.
        """
    )

    with gr.Row():
        youtube_url = gr.Textbox(
            label="YouTube Video URL",
            value="https://www.youtube.com/watch?v=R3GfuzLMPkA",
            lines=1,
            scale=5
        )

    with gr.Row():
        prompt = gr.Textbox(
            label="Analysis Prompt",
            value=DEFAULT_PROMPT,
            lines=3,
            scale=4
        )

        with gr.Column(scale=2, min_width=200):

            file_input = gr.File(label="Select a cookie file to upload", file_count="single", height=263)

            output_message = gr.Textbox(label="Status of uploading the cookie file")

            upload_cookies_file_button = gr.Button("Save cookie file")

        with gr.Column(scale=2, min_width=200):
            quality = gr.Dropdown(
                label="Video Quality",
                choices=[144, 240, 360, 480, 720, 1080, 1440, 2160],
                value=720
            )
            time_step = gr.Slider(
                label="Frame Interval (seconds)",
                minimum=0.5,
                maximum=30,
                step=0.5,
                value=30
            )
            include_audio_data = gr.Checkbox(
                label="Include Audio Data (MP3) in Results", value=False
            )
            include_frame_data = gr.Checkbox(
                label="Include Frame Data (JPG) in Results", value=False
            )

    t1 = gr.Textbox(value="Waiting for task...", label="Task Progress", show_label=True, lines=3, interactive=False)

    # Hidden button that serves as the MCP server endpoint
    submit_btn = gr.Button("Start Video Analysis (No Progress Bar, for MCP Server use)", variant="primary", visible=False)

    # Analyze button with a progress bar
    submit_btn_with_progress = gr.Button("Analyze Video", variant="secondary")

    with gr.Tabs() as results_tabs:
        with gr.TabItem("Video"):
            video_output_html = gr.HTML(label="Video Frames Analysis", elem_id="video-output-html")
        with gr.TabItem("Audio"):
            audio_player_output = gr.Audio(label="Play Audio", type="filepath", render=True)
            audio_transcription_output = gr.Textbox(label="Audio Transcription", lines=10)
        with gr.TabItem("JSON"):
            results_json_viewer = gr.JSON(
                label="Raw Analysis Results (JSON)",
                elem_classes=["output-box", "results-output"],
            )

    raw_json_output = gr.State()

    # Logic for the plain button (no progress bar); this button is the MCP server endpoint.
    submit_btn.click(
        fn=analyze_video_data,
        inputs=[youtube_url, prompt, quality, time_step, include_audio_data, include_frame_data],
        outputs=[raw_json_output],
        api_name="analyze_video_data",
        show_api=True
    ).then(
        fn=get_video_html_from_json,
        inputs=[raw_json_output],
        outputs=[video_output_html],
        show_api=False
    ).then(
        fn=get_audio_data_from_json,
        inputs=[raw_json_output],
        outputs=[audio_transcription_output, audio_player_output],
        show_api=False
    ).then(
        fn=lambda x: json.loads(x),
        inputs=[raw_json_output],
        outputs=[results_json_viewer],
        show_api=False
    )

    # Logic for the button with a progress bar
    submit_btn_with_progress.click(
        fn=analyze_video_data_with_progress_wrapper,
        inputs=[youtube_url, prompt, quality, time_step, include_audio_data, include_frame_data],
        outputs=[raw_json_output],
        api_name="analyze_video_data_with_progress_button",
        show_progress_on=t1,
        show_api=False
    ).then(
        fn=get_video_html_from_json,
        inputs=[raw_json_output],
        outputs=[video_output_html],
        show_api=False
    ).then(
        fn=get_audio_data_from_json,
        inputs=[raw_json_output],
        outputs=[audio_transcription_output, audio_player_output],
        show_api=False
    ).then(
        fn=lambda x: json.loads(x),
        inputs=[raw_json_output],
        outputs=[results_json_viewer],
        show_api=False
    )

    # Cookie-handling logic
    upload_cookies_file_button.click(
        fn=upload_cookies_file,
        inputs=file_input,
        outputs=output_message,
        show_api=False
    )

    file_input.clear(
        fn=clear_cookies_files,
        inputs=None,
        outputs=output_message,
        show_api=False
    )


if __name__ == "__main__":
    demo.launch(mcp_server=True)
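Because the hidden submit_btn exposes api_name="analyze_video_data", the analysis can also be driven programmatically through Gradio's client API. A minimal sketch, assuming the Space is running locally on Gradio's default port (the URL and argument values are placeholders):

import json

from gradio_client import Client

# Connect to the locally running Space (placeholder URL, default Gradio port).
client = Client("http://127.0.0.1:7860/")

# Positional arguments mirror the inputs wired to submit_btn above.
raw = client.predict(
    "https://www.youtube.com/watch?v=R3GfuzLMPkA",   # youtube_url
    "Describe the frame at timestamp {timestamp}.",  # prompt
    480,                                             # quality
    10.0,                                            # time_step, seconds
    False,                                           # include_audio_data
    False,                                           # include_frame_data
    api_name="/analyze_video_data",
)

# The endpoint returns a JSON string; parse it to inspect the results.
result = json.loads(raw)
print(result["status"], len(result["frame_analysis"]), "frames analyzed")
print(result["audio_transcription"][:200])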
app_srv/audio_processing.py CHANGED
@@ -1,54 +1,54 @@
(no content changes in the commit: before and after are identical, likely a line-ending normalization; file shown once)
from transformers import pipeline
import torch

from model_api import clear_gpu_cache, get_device_and_dtype


def transcribe_audio(audio_path: str, device: str = "cuda", torch_dtype: torch.dtype = torch.float16) -> str:
    """
    Transcribes an MP3 audio file.

    Args:
        audio_path: Path to the audio file.
        device: The device to use for transcription (e.g., "cuda" for GPU, "cpu" for CPU).
        torch_dtype: The torch data type to use for model computations.

    Returns:
        The transcribed text.
    """
    pipe = None
    try:
        # Create a pipeline with an explicit device specification
        pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",
            device=device,
            chunk_length_s=25,
            stride_length_s=2,
            torch_dtype=torch_dtype,
        )

        # Perform transcription
        result = pipe(audio_path, return_timestamps=True)

        # Extract the text
        return result['text']

    except Exception as e:
        print(f"Error during transcription: {str(e)}")
        return ""

    finally:
        # Guard against pipeline creation itself failing, which would
        # otherwise leave `pipe` unbound here.
        if pipe is not None:
            del pipe
        clear_gpu_cache()


if __name__ == "__main__":

    selected_device, selected_dtype = get_device_and_dtype()

    result = transcribe_audio("/workspaces/Video_Analyser/app_srv/downloads/45677153-510d-4f47-95ee-c1b4b0843433/audio.mp3.mp3", selected_device, selected_dtype)
    print(result)
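transcribe_audio defaults to device="cuda" with float16; on a machine without a GPU, the CPU-safe pairing is needed instead. A minimal sketch, assuming the module is importable and an MP3 exists at the hypothetical path below (note the doubled extension produced by the downloader):

import torch

from audio_processing import transcribe_audio

# float16 inference is generally unsupported on CPU, so pair
# device="cpu" with torch.float32. The path is hypothetical; the
# downloader produces files named audio.mp3.mp3.
text = transcribe_audio(
    "downloads/example/audio.mp3.mp3",
    device="cpu",
    torch_dtype=torch.float32,
)
print(text)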
app_srv/downloader.py CHANGED
@@ -1,145 +1,145 @@
(no content changes in the commit: before and after are identical, likely a line-ending normalization; file shown once)
import yt_dlp
import os
import uuid
import json
from pathlib import Path
from typing import Dict
from datetime import datetime


def download_youtube_video(url: str,
                           base_dir: str = None,
                           video_quality: int = 720,
                           youtube_cookies: str = "./cookies.txt") -> Dict[str, str]:
    """
    Downloads video and audio from YouTube, saving them to a unique GUID folder.
    Metadata is saved in JSON format, including the download datetime and timezone.

    Args:
        url (str): YouTube video URL
        base_dir (str): Base download directory (default: a 'downloads' folder next to this script)
        video_quality (int): Preferred quality of the downloaded video; acceptable values are 144, 240, 360, 480, 720, 1080, 1440, 2160. Default: 720.
        youtube_cookies (str): Path to a cookies file passed to yt-dlp (default './cookies.txt')

    Returns:
        dict: Dictionary with file paths and information:
            {
                'data_path': str,      # Path to download directory
                'video_path': str,     # Full path to video.mp4
                'audio_path': str,     # Full path to the extracted MP3
                'metadata_path': str   # Full path to metadata.json
            }

    Raises:
        RuntimeError: If download fails
    """

    youtube_quality = [144, 240, 360, 480, 720, 1080, 1440, 2160]

    if video_quality not in youtube_quality:
        raise ValueError(
            f"Invalid video quality: '{video_quality}'. "
            f"Allowed qualities are: {', '.join(map(str, youtube_quality))}"
        )

    try:

        # Determine the base directory
        if base_dir is None:
            # Use the directory of the current script file
            script_dir = Path(__file__).parent
            base_dir = script_dir / "downloads"  # Create a 'downloads' folder next to the script
        else:
            base_dir = Path(base_dir)

        # Generate a GUID and create the folder
        guid = str(uuid.uuid4())
        download_dir = Path(base_dir) / guid
        os.makedirs(download_dir, exist_ok=True)

        # File paths
        video_path = download_dir / "video.mp4"
        audio_path = download_dir / "audio.mp3"
        metadata_path = download_dir / "metadata.json"

        # Record the exact download start time
        download_datetime = datetime.now()
        current_timezone = download_datetime.astimezone().tzinfo

        # 1. Download video (MP4): exact height match first, then the closest
        #    stream above the target, then the best stream at or below it
        video_opts = {
            'format': (
                f"bestvideo[height={video_quality}][ext=mp4]"
                f"/worstvideo[height>{video_quality}][ext=mp4]"
                f"/bestvideo[height<={video_quality}][ext=mp4]"
            ),
            'outtmpl': str(video_path),
            'quiet': True,
            'no_warnings': True,
            'restrict_filenames': True,
            'cookiefile': youtube_cookies,
        }

        with yt_dlp.YoutubeDL(video_opts) as ydl:
            video_info = ydl.extract_info(url, download=True)

        # 2. Download audio (MP3)
        audio_opts = {
            'format': 'bestaudio/best',
            'outtmpl': str(audio_path),
            'quiet': True,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '128',
            }],
            'cookiefile': youtube_cookies,
        }

        with yt_dlp.YoutubeDL(audio_opts) as ydl:
            audio_info = ydl.extract_info(url, download=True)

        # Format the date and time for storage
        formatted_date = download_datetime.strftime('%Y-%m-%d')
        formatted_time = download_datetime.strftime('%H:%M:%S')

        # 3. Save metadata to JSON
        metadata = {
            'original_url': url,
            'guid': guid,
            'download_info': {
                'date': formatted_date,
                'time': formatted_time,
                'timezone': str(current_timezone),
                'datetime_iso': download_datetime.isoformat(),
            },
            'video': {
                'path': str(video_path),
                'title': video_info.get('title'),
                'duration': video_info.get('duration'),
                'resolution': video_info.get('resolution'),
                'upload_date': video_info.get('upload_date'),
            },
            'audio': {
                'path': str(audio_path),
                'bitrate': audio_info.get('abr'),
                'codec': 'mp3',
            },
        }

        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

        return {
            'data_path': str(download_dir.absolute()),
            'video_path': str(video_path.absolute()),
            # The MP3 post-processor appends another extension,
            # so the file on disk ends up as audio.mp3.mp3
            'audio_path': str(audio_path.absolute()) + ".mp3",
            'metadata_path': str(metadata_path),
        }

    except Exception as e:
        raise RuntimeError(f"Media download error: {str(e)}")


if __name__ == "__main__":
    video_url = "https://www.youtube.com/watch?v=FK3dav4bA4s"
    downloaded_video = download_youtube_video(video_url, "./temp")
    print(downloaded_video)
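The 'format' option above is a fallback chain: an exact height match in MP4 first, then the smallest stream above the requested height, then the best stream at or below it. To sanity-check which formats yt-dlp actually sees for a given video, the URL can be probed without downloading; a small sketch using the same sample URL:

import yt_dlp

url = "https://www.youtube.com/watch?v=FK3dav4bA4s"

# Fetch metadata only; nothing is downloaded.
with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
    info = ydl.extract_info(url, download=False)

# Print the MP4 video streams the fallback chain would choose from.
for f in info.get("formats", []):
    if f.get("ext") == "mp4" and f.get("height"):
        print(f["format_id"], f"{f['height']}p", f.get("vcodec"))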
app_srv/model_api.py CHANGED
@@ -1,122 +1,122 @@
(no content changes in the commit: before and after are identical, likely a line-ending normalization; file shown once)
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import gc
import warnings


### Utility functions ###
def clear_gpu_cache():
    """Clears the GPU cache and performs garbage collection"""
    torch.cuda.empty_cache()
    gc.collect()


def get_device_and_dtype():
    if torch.cuda.is_available():
        device = 'cuda'
        torch_dtype = torch.float16
    else:
        device = 'cpu'
        torch_dtype = torch.float32
    return device, torch_dtype


### Model-related functions ###
def initialize_model(device: str, torch_dtype: torch.dtype):
    """Initialize and return the model with GPU optimization"""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return Qwen2_5_VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
            device_map=device,
            torch_dtype=torch_dtype,
        )


def initialize_processor():
    """Initialize and return the processor"""
    return AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct-AWQ")


def prepare_model_inputs(processor, messages, device: str):
    """Prepare all inputs for model inference"""
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)

    return processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    ).to(device)


def generate_description(model, inputs):
    """Generate a description using the model"""
    return model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )


def process_model_output(processor, inputs, generated_ids):
    """Process the model output to get clean text"""
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    return processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]


def create_prompt_message(image_path, timestamp, custom_prompt=None):
    """
    Create a standardized message prompt for the model, with an optional custom prompt

    Args:
        image_path (str): Path to the image file
        timestamp (str): Video timestamp in HH:MM:SS.SSS format
        custom_prompt (str, optional): Custom prompt text; may contain a {timestamp} placeholder. If None, uses the default prompt.

    Returns:
        list: Formatted message for model input
    """
    default_prompt = f"Video timestamp: {timestamp}. Describe in detail what is happening in this frame."
    prompt_text = custom_prompt.format(timestamp=timestamp) if custom_prompt else default_prompt

    return [{
        "role": "user",
        "content": [
            {"type": "image", "image": f"file://{image_path}"},
            {"type": "text", "text": prompt_text},
        ],
    }]


if __name__ == "__main__":

    selected_device, selected_dtype = get_device_and_dtype()

    image_url = "/workspaces/Video_Analyser/app_srv/temp/a28af289-377d-468d-b0eb-ed0f7dcd2ab3/frames/frame_00_01_50_110.jpg"
    timestamp = "00:01:16.076"
    custom_prompt = "Timestamp {timestamp}. Analyze this frame focusing on objects in the foreground."

    model = initialize_model(selected_device, selected_dtype)
    processor = initialize_processor()
    messages = create_prompt_message(image_url, timestamp, custom_prompt)
    inputs = prepare_model_inputs(processor, messages, selected_device)

    generated_ids = generate_description(model, inputs)
    image_description = process_model_output(processor, inputs, generated_ids)

    print(image_description)
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, logging
+ from qwen_vl_utils import process_vision_info
+ import torch
+ from PIL import Image
+ import gc
+ import warnings
+
+
+ ### Utility functions ###
+ def clear_gpu_cache():
+     """Clears GPU cache and performs garbage collection"""
+     torch.cuda.empty_cache()
+     gc.collect()
+
+
+ def get_device_and_dtype():
+     if torch.cuda.is_available():
+         device = 'cuda'
+         torch_dtype = torch.float16
+     else:
+         device = 'cpu'
+         torch_dtype = torch.float32
+     return device, torch_dtype
+
+
+ ### Model-related functions ###
+ def initialize_model(device: str, torch_dtype: torch.dtype):
+     """Initialize and return model with GPU optimization"""
+     with warnings.catch_warnings():
+         warnings.simplefilter("ignore")
+         return Qwen2_5_VLForConditionalGeneration.from_pretrained(
+             "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
+             device_map=device,
+             torch_dtype=torch_dtype,
+         )
+
+
+ def initialize_processor():
+     """Initialize and return processor"""
+     return AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct-AWQ")
+
+
+ def prepare_model_inputs(processor, messages, device: str):
+     """Prepare all inputs for model inference"""
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     image_inputs, video_inputs = process_vision_info(messages)
+
+     return processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt"
+     ).to(device)
+
+
+ def generate_description(model, inputs):
+     """Generate description using model"""
+     return model.generate(
+         **inputs,
+         max_new_tokens=512,
+         do_sample=True,
+         temperature=0.7,
+         top_p=0.9
+     )
+
+
+ def process_model_output(processor, inputs, generated_ids):
+     """Process model output to get clean text"""
+     generated_ids_trimmed = [
+         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+     ]
+     return processor.batch_decode(
+         generated_ids_trimmed,
+         skip_special_tokens=True,
+         clean_up_tokenization_spaces=False
+     )[0]
+
+
+ def create_prompt_message(image_path, timestamp, custom_prompt=None):
+     """
+     Create standardized message prompt for model with optional custom prompt
+
+     Args:
+         image_path (str): Path to the image file
+         timestamp (str): Video timestamp in HH:MM:SS.SSS format
+         custom_prompt (str, optional): Custom prompt text. If None, uses default prompt.
+
+     Returns:
+         list: Formatted message for model input
+     """
+     default_prompt = f"Video timestamp: {timestamp}. Describe in detail what is happening in this frame."
+     prompt_text = custom_prompt.format(timestamp=timestamp) if custom_prompt else default_prompt
+
+     return [{
+         "role": "user",
+         "content": [
+             {"type": "image", "image": f"file://{image_path}"},
+             {"type": "text", "text": prompt_text},
+         ],
+     }]
+
+
+ if __name__ == "__main__":
+
+     selected_device, selected_dtype = get_device_and_dtype()
+
+     image_url = "/workspaces/Video_Analyser/app_srv/temp/a28af289-377d-468d-b0eb-ed0f7dcd2ab3/frames/frame_00_01_50_110.jpg"
+     timestamp = "00:01:16.076"
+     custom_prompt = "Timestamp {timestamp}. Analyze this frame focusing on objects in the foreground."
+
+     model = initialize_model(selected_device, selected_dtype)
+     processor = initialize_processor()
+     messages = create_prompt_message(image_url, timestamp, custom_prompt)
+     inputs = prepare_model_inputs(processor, messages, selected_device)
+
+     generated_ids = generate_description(model, inputs)
+     image_description = process_model_output(processor, inputs, generated_ids)
+
+     print(image_description)
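
A note on the prompt contract in model_api.py: create_prompt_message expands a custom prompt with str.format, so the prompt may reference {timestamp}, but any other literal braces must be doubled. A minimal sketch under that assumption (the frame path and prompt text below are illustrative, not from the repo):

    # Sketch: how create_prompt_message expands a custom prompt.
    from model_api import create_prompt_message

    custom = "Timestamp {timestamp}. List any on-screen text in this frame."  # illustrative prompt
    messages = create_prompt_message("frames/frame_00_00_05_000.jpg", "00:00:05.000", custom)

    # The text part becomes "Timestamp 00:00:05.000. List any on-screen text in this frame."
    print(messages[0]["content"][1]["text"])

    # Because str.format() is used, literal braces in a custom prompt (e.g. a JSON
    # example) must be escaped as {{ and }}, or .format() raises KeyError/IndexError.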
app_srv/test.ipynb CHANGED
@@ -1,555 +1,555 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "7f265e58",
+    "metadata": {},
+    "source": [
+     "# Download video and audio from YouTube"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "69ee0ec3",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import yt_dlp\n",
+     "import os\n",
+     "import uuid\n",
+     "import json\n",
+     "from pathlib import Path\n",
+     "from typing import Dict, Any\n",
+     "from datetime import datetime\n",
+     "\n",
+     "\n",
+     "def download_youtube_media(url: str,\n",
+     "                           base_dir: str = \"./downloads\",\n",
+     "                           video_quality: int = 720) -> Dict[str, str]:\n",
+     "    \"\"\"\n",
+     "    Downloads video and audio from YouTube, saving them to a unique GUID folder.\n",
+     "    Metadata is saved in JSON format including download datetime and timezone.\n",
+     "\n",
+     "    Args:\n",
+     "        url (str): YouTube video URL\n",
+     "        base_dir (str): Base download directory (default './downloads')\n",
+     "        video_quality (int): preferred quality of the downloaded video; acceptable values are 144, 240, 360, 480, 720, 1080, 1440, 2160.\n",
+     "\n",
+     "    Returns:\n",
+     "        dict: Dictionary with file paths and information:\n",
+     "            {\n",
+     "                'data_path': str,  # Path to download directory\n",
+     "                'video': str,      # Full path to video.mp4\n",
+     "                'audio': str,      # Full path to the extracted MP3\n",
+     "                'metadata': str    # Full path to metadata.json\n",
+     "            }\n",
+     "\n",
+     "    Raises:\n",
+     "        RuntimeError: If download fails\n",
+     "    \"\"\"\n",
+     "\n",
+     "    youtube_quality = [144, 240, 360, 480, 720, 1080, 1440, 2160]\n",
+     "\n",
+     "\n",
+     "    if video_quality not in youtube_quality:\n",
+     "        raise ValueError(\n",
+     "            f\"Invalid video quality: '{video_quality}'. \"\n",
+     "            f\"Allowed qualities are: {', '.join(map(str, youtube_quality))}\"\n",
+     "        )\n",
+     "\n",
+     "    try:\n",
+     "        # Generate GUID and create folder\n",
+     "        guid = str(uuid.uuid4())\n",
+     "        download_dir = Path(base_dir) / guid\n",
+     "        os.makedirs(download_dir, exist_ok=True)\n",
+     "\n",
+     "        # File paths\n",
+     "        video_path = download_dir / \"video.mp4\"\n",
+     "        audio_path = download_dir / \"audio.mp3\"\n",
+     "        metadata_path = download_dir / \"metadata.json\"\n",
+     "\n",
+     "        # Record exact download start time\n",
+     "        download_datetime = datetime.now()\n",
+     "        current_timezone = download_datetime.astimezone().tzinfo\n",
+     "\n",
+     "        # 1. Download video (MP4)\n",
+     "        video_opts = {\n",
+     "            'format': (\n",
+     "                f\"bestvideo[height={video_quality}][ext=mp4]\"\n",
+     "                f\"/worstvideo[height>{video_quality}][ext=mp4]\"\n",
+     "                f\"/bestvideo[height<={video_quality}][ext=mp4]\"\n",
+     "            ),\n",
+     "            'outtmpl': str(video_path),\n",
+     "            'quiet': True,\n",
+     "            'no_warnings': True,\n",
+     "            'restrict_filenames': True,\n",
+     "        }\n",
+     "\n",
+     "        with yt_dlp.YoutubeDL(video_opts) as ydl:\n",
+     "            video_info = ydl.extract_info(url, download=True)\n",
+     "\n",
+     "        # 2. Download audio (MP3)\n",
+     "        audio_opts = {\n",
+     "            'format': 'bestaudio/best',\n",
+     "            'outtmpl': str(audio_path),\n",
+     "            'quiet': True,\n",
+     "            'postprocessors': [{\n",
+     "                'key': 'FFmpegExtractAudio',\n",
+     "                'preferredcodec': 'mp3',\n",
+     "                'preferredquality': '128',\n",
+     "            }],\n",
+     "        }\n",
+     "\n",
+     "        with yt_dlp.YoutubeDL(audio_opts) as ydl:\n",
+     "            audio_info = ydl.extract_info(url, download=True)\n",
+     "\n",
+     "        # Format date and time for storage\n",
+     "        formatted_date = download_datetime.strftime('%Y-%m-%d')\n",
+     "        formatted_time = download_datetime.strftime('%H:%M:%S')\n",
+     "\n",
+     "        # 3. Save metadata to JSON\n",
+     "        metadata = {\n",
+     "            'original_url': url,\n",
+     "            'guid': guid,\n",
+     "            'download_info': {\n",
+     "                'date': formatted_date,\n",
+     "                'time': formatted_time,\n",
+     "                'timezone': str(current_timezone),\n",
+     "                'datetime_iso': download_datetime.isoformat(),\n",
+     "            },\n",
+     "            'video': {\n",
+     "                'path': str(video_path),\n",
+     "                'title': video_info.get('title'),\n",
+     "                'duration': video_info.get('duration'),\n",
+     "                'resolution': video_info.get('resolution'),\n",
+     "                'upload_date': video_info.get('upload_date'),\n",
+     "            },\n",
+     "            'audio': {\n",
+     "                'path': str(audio_path),\n",
+     "                'bitrate': audio_info.get('abr'),\n",
+     "                'codec': 'mp3',\n",
+     "            },\n",
+     "        }\n",
+     "\n",
+     "        with open(metadata_path, 'w', encoding='utf-8') as f:\n",
+     "            json.dump(metadata, f, indent=2, ensure_ascii=False)\n",
+     "\n",
+     "        return {\n",
+     "            'data_path': str(download_dir.absolute()),\n",
+     "            'video': str(video_path.absolute()),\n",
+     "            'audio': str(audio_path.absolute()) + \".mp3\",\n",
+     "            'metadata': str(metadata_path),\n",
+     "        }\n",
+     "\n",
+     "    except Exception as e:\n",
+     "        raise RuntimeError(f\"Media download error: {str(e)}\")\n",
+     "\n",
+     "if __name__ == \"__main__\":\n",
+     "    video_url = \"https://www.youtube.com/watch?v=FK3dav4bA4s\"\n",
+     "    downloaded_video = download_youtube_media(video_url, \"./temp\")\n",
+     "    print(downloaded_video)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "e79c5071",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "downloaded_video"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "745320a1",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import copy\n",
+     "test = copy.deepcopy(downloaded_video)\n",
+     "\n",
+     "print(test)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "f62e8b83",
+    "metadata": {},
+    "source": [
+     "# Split video to frames in jpg"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "5461045d",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import os\n",
+     "from pathlib import Path\n",
+     "from typing import Dict\n",
+     "import av\n",
+     "\n",
+     "def extract_frames_with_timestamps(\n",
+     "    video_path: str,\n",
+     "    output_dir: str,\n",
+     "    time_step: float = 1.0,\n",
+     "    quality: int = 95,\n",
+     "    frame_prefix: str = \"frame\",\n",
+     "    use_hw_accel: bool = True,\n",
+     "    hw_device: str = \"cuda\"\n",
+     ") -> Dict[str, str]:\n",
+     "    \"\"\"\n",
+     "    Extracts frames from video with NVIDIA hardware acceleration (NVDEC/CUDA).\n",
+     "\n",
+     "    Args:\n",
+     "        video_path: Path to the video file\n",
+     "        output_dir: Directory to save frames\n",
+     "        time_step: Interval between frames (seconds)\n",
+     "        quality: JPEG quality (1-100)\n",
+     "        frame_prefix: Prefix for saved frames\n",
+     "        use_hw_accel: Enable NVIDIA hardware decoding\n",
+     "        hw_device: GPU device (e.g., 'cuda:0')\n",
+     "\n",
+     "    Returns:\n",
+     "        Dict of {timestamp: frame_path}\n",
+     "    \"\"\"\n",
+     "    result = {}\n",
+     "    try:\n",
+     "        video_path = Path(video_path).absolute()\n",
+     "        output_dir = Path(output_dir).absolute()\n",
+     "\n",
+     "        if not video_path.exists():\n",
+     "            raise ValueError(f\"Video file not found: {video_path}\")\n",
+     "\n",
+     "        frames_dir = output_dir / \"frames\"\n",
+     "        frames_dir.mkdir(parents=True, exist_ok=True)\n",
+     "\n",
+     "        # Configure hardware acceleration\n",
+     "        options = {}\n",
+     "        if use_hw_accel:\n",
+     "            options.update({\n",
+     "                'hwaccel': 'cuda',\n",
+     "                'hwaccel_device': hw_device,\n",
+     "                'hwaccel_output_format': 'cuda'  # Keep frames in GPU memory\n",
+     "            })\n",
+     "\n",
+     "        # Open video with hardware acceleration\n",
+     "        container = av.open(str(video_path), options=options)\n",
+     "        video_stream = next(s for s in container.streams if s.type == 'video')\n",
+     "\n",
+     "        fps = float(video_stream.average_rate)\n",
+     "        if fps <= 0:\n",
+     "            raise RuntimeError(\"Invalid frame rate\")\n",
+     "\n",
+     "        frame_interval = max(1, int(round(fps * time_step)))\n",
+     "        frame_count = 0\n",
+     "\n",
+     "        for frame in container.decode(video_stream):\n",
+     "            if frame_count % frame_interval == 0:\n",
+     "                current_time = float(frame.pts * video_stream.time_base)\n",
+     "                hh = int(current_time // 3600)\n",
+     "                mm = int((current_time % 3600) // 60)\n",
+     "                ss = current_time % 60\n",
+     "\n",
+     "                timestamp = f\"{hh:02d}:{mm:02d}:{ss:06.3f}\"\n",
+     "                safe_timestamp = timestamp.replace(':', '_').replace('.', '_')\n",
+     "                frame_path = frames_dir / f\"{frame_prefix}_{safe_timestamp}.jpg\"\n",
+     "\n",
+     "                # Convert GPU frame to CPU if needed\n",
+     "                if hasattr(frame, 'to_ndarray'):  # CUDA frame\n",
+     "                    img = frame.to_ndarray(format='rgb24')\n",
+     "                    img = av.VideoFrame.from_ndarray(img, format='rgb24')\n",
+     "                else:\n",
+     "                    img = frame\n",
+     "\n",
+     "                img.to_image().save(str(frame_path), quality=quality)\n",
+     "                result[timestamp] = str(frame_path)\n",
+     "\n",
+     "            frame_count += 1\n",
+     "\n",
+     "        return result\n",
+     "\n",
+     "    except Exception as e:\n",
+     "        for path in result.values():\n",
+     "            try: os.remove(path)\n",
+     "            except: pass\n",
+     "        raise RuntimeError(f\"Frame extraction failed: {str(e)}\")\n",
+     "\n",
+     "if __name__ == \"__main__\":\n",
+     "    frames = extract_frames_with_timestamps(downloaded_video['video'], downloaded_video['data_path'], time_step=2)\n",
+     "    print(frames)\n"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "ba7b44d6",
+    "metadata": {},
+    "source": [
+     "# Video Analyzer"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "418ae84e",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# pip install autoawq --upgrade"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "fe840ffa",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n",
+     "from qwen_vl_utils import process_vision_info\n",
+     "import torch\n",
+     "\n",
+     "# default: Load the model on the available device(s)\n",
+     "model = Qwen2_5_VLForConditionalGeneration.from_pretrained(\n",
+     "    \"Qwen/Qwen2.5-VL-7B-Instruct-AWQ\",\n",
+     "    torch_dtype=torch.float16,\n",
+     "    device_map=\"auto\",\n",
+     ")\n",
+     "\n",
+     "# default processor\n",
+     "processor = AutoProcessor.from_pretrained(\"Qwen/Qwen2.5-VL-7B-Instruct-AWQ\")\n",
+     "\n",
+     "messages = [\n",
+     "    {\n",
+     "        \"role\": \"user\",\n",
+     "        \"content\": [\n",
+     "            {\"type\": \"image\", \"image\": \"file:///workspaces/Video_Analyser/temp/fcaaa3e8-d99d-47c5-b464-617e4c9a1b1a/frames/frame_00_02_51_171.jpg\"},\n",
+     "            {\"type\": \"text\", \"text\": \"Describe this image in detail.\"},\n",
+     "        ],\n",
+     "    }\n",
+     "]\n",
+     "\n",
+     "# Preparation for inference\n",
+     "text = processor.apply_chat_template(\n",
+     "    messages, tokenize=False, add_generation_prompt=True\n",
+     ")\n",
+     "\n",
+     "image_inputs, video_inputs = process_vision_info(messages)\n",
+     "inputs = processor(\n",
+     "    text=[text],\n",
+     "    images=image_inputs,\n",
+     "    videos=video_inputs,\n",
+     "    padding=True,\n",
+     "    return_tensors=\"pt\",\n",
+     ")\n",
+     "inputs = inputs.to(\"cuda\")\n",
+     "\n",
+     "# Inference: Generation of the output\n",
+     "generated_ids = model.generate(**inputs, max_new_tokens=128)\n",
+     "generated_ids_trimmed = [\n",
+     "    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
+     "]\n",
+     "output_text = processor.batch_decode(\n",
+     "    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
+     ")\n",
+     "print(output_text)\n"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "3ba7ef97",
+    "metadata": {},
+    "source": [
+     "# Audio content"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "bf798dcf",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import torch\n",
+     "from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline\n",
+     "from datasets import load_dataset\n",
+     "\n",
+     "\n",
+     "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
+     "torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n",
+     "\n",
+     "model_id = \"openai/whisper-large-v3-turbo\"\n",
+     "\n",
+     "model = AutoModelForSpeechSeq2Seq.from_pretrained(\n",
+     "    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True\n",
+     ").to(device)\n",
+     "\n",
+     "pipe = pipeline(\n",
+     "    \"automatic-speech-recognition\",\n",
+     "    model=model,\n",
+     "    torch_dtype=torch_dtype,\n",
+     "    device=device,\n",
+     "    return_timestamps=True\n",
+     ")\n",
+     "\n",
+     "\n",
+     "result = pipe(\"/workspaces/Video_Analyser/app_srv/temp/a6fba6eb-038e-4f4e-bcb7-f41d87ee1422/audio.mp3.mp3\")\n",
+     "\n",
+     "result"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "980e9742",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "! pip install librosa"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "a66eabd3",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "! pip install -U openai-whisper"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "e6d0e5fd",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import torch\n",
+     "from transformers import pipeline\n",
+     "from typing import Dict, Union\n",
+     "\n",
+     "def transcribe_with_timestamps_optimized(\n",
+     "    audio_path: str,\n",
+     "    model_name: str = \"openai/whisper-small\",\n",
+     "    language: str = \"en\",\n",
+     "    chunk_length_s: int = 5,\n",
+     "    stride_length_s: Union[int, tuple] = (2, 2)\n",
+     ") -> Dict[float, str]:\n",
+     "    device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
+     "    print(f\"Device in use: {device}\")\n",
+     "\n",
+     "    try:\n",
+     "        transcriber = pipeline(\n",
+     "            \"automatic-speech-recognition\",\n",
+     "            model=model_name,\n",
+     "            chunk_length_s=chunk_length_s,\n",
+     "            stride_length_s=stride_length_s,\n",
+     "            device=device,\n",
+     "        )\n",
+     "    except Exception as e:\n",
+     "        print(f\"Pipeline initialization error: {e}\")\n",
+     "        print(\"Make sure the model is installed or available on the Hugging Face Hub.\")\n",
+     "        raise\n",
+     "\n",
+     "    print(f\"Starting transcription of file: {audio_path}\")\n",
+     "    try:\n",
+     "        result = transcriber(\n",
+     "            audio_path,\n",
+     "            return_timestamps=True,\n",
+     "            generate_kwargs={\"language\": language} if language else {}\n",
+     "        )\n",
+     "    except Exception as e:\n",
+     "        print(f\"Error while transcribing the audio file: {e}\")\n",
+     "        return {}\n",
+     "\n",
+     "    transcribed_segments = {}\n",
+     "    if \"chunks\" in result and result[\"chunks\"]:\n",
+     "        for chunk in result[\"chunks\"]:\n",
+     "            start_time = chunk[\"timestamp\"][0] if chunk[\"timestamp\"][0] is not None else 0.0\n",
+     "            text = chunk[\"text\"].strip()\n",
+     "            transcribed_segments[float(start_time)] = text\n",
+     "    else:\n",
+     "        if \"text\" in result:\n",
+     "            transcribed_segments[0.0] = result[\"text\"].strip()\n",
+     "            print(\"Warning: only a full transcription was returned, without per-segment timestamps.\")\n",
+     "            print(\"Make sure return_timestamps=True or return_timestamps='word' is used.\")\n",
+     "        else:\n",
+     "            print(\"Failed to obtain a transcription or timestamps.\")\n",
+     "\n",
+     "    print(\"Transcription finished.\")\n",
+     "    return transcribed_segments\n",
+     "\n",
+     "\n",
+     "# Usage example\n",
+     "if __name__ == \"__main__\":\n",
+     "\n",
+     "    result = transcribe_with_timestamps_optimized(\n",
+     "        audio_path=\"/workspaces/Video_Analyser/app_srv/temp/a6fba6eb-038e-4f4e-bcb7-f41d87ee1422/audio.mp3.mp3\",\n",
+     "    )\n",
+     "\n",
+     "    print(result)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "ca9a4832",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from transformers import pipeline\n",
+     "import librosa\n",
+     "\n",
+     "def transcribe_with_pipeline(audio_path):\n",
+     "    pipe = pipeline(\n",
+     "        \"automatic-speech-recognition\",\n",
+     "        model=\"openai/whisper-small\",\n",
+     "        chunk_length_s=30,  # split into 30-second chunks\n",
+     "        stride_length_s=2,  # overlap between chunks\n",
+     "    )\n",
+     "\n",
+     "    result = pipe(audio_path, return_timestamps=True)\n",
+     "    return result['text']\n",
+     "\n",
+     "result = transcribe_with_pipeline(\"/workspaces/Video_Analyser/app_srv/temp/a6fba6eb-038e-4f4e-bcb7-f41d87ee1422/audio.mp3.mp3\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "7cd4e28e",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "result"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.12.3"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
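
Two details in the notebook's audio cells deserve a note. The audio paths end in audio.mp3.mp3 because yt-dlp first writes the download to the literal output template ("audio.mp3") and the FFmpegExtractAudio post-processor then appends the target codec's extension; that is why download_youtube_media returns str(audio_path.absolute()) + ".mp3". Separately, the {start_time: text} dictionary built by transcribe_with_timestamps_optimized relies on the shape the transformers ASR pipeline returns when return_timestamps=True. A sketch of that shape and of the dictionary-building step (timestamps and text are illustrative, not real output):

    # Illustrative output of pipe(audio_path, return_timestamps=True):
    result = {
        "text": " Hello and welcome to the channel",
        "chunks": [
            {"timestamp": (0.0, 4.2), "text": " Hello and welcome"},
            {"timestamp": (4.2, 9.7), "text": " to the channel"},
        ],
    }

    # Each chunk's start time keys one transcription segment, as in the notebook:
    segments = {
        float(chunk["timestamp"][0] if chunk["timestamp"][0] is not None else 0.0): chunk["text"].strip()
        for chunk in result["chunks"]
    }
    print(segments)  # {0.0: 'Hello and welcome', 4.2: 'to the channel'}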
app_srv/video_processing.py CHANGED
@@ -1,157 +1,157 @@
+ import os
+ import av
+ import torch
+ from pathlib import Path
+ from typing import Dict
+ from tqdm import tqdm
+
+ from downloader import download_youtube_video
+ from model_api import initialize_model, initialize_processor, prepare_model_inputs, generate_description, process_model_output, clear_gpu_cache, create_prompt_message, get_device_and_dtype
+
+ def extract_frames_with_timestamps(
+     video_path: str,
+     output_dir: str,
+     time_step: float = 1.0,
+     quality: int = 95,
+     frame_prefix: str = "frame",
+     use_hw_accel: bool = True,
+     hw_device: str = "cuda"
+ ) -> Dict[str, str]:
+     """
+     Extracts frames from video with NVIDIA hardware acceleration (NVDEC/CUDA).
+
+     Args:
+         video_path: Path to the video file
+         output_dir: Directory to save frames
+         time_step: Interval between frames (seconds)
+         quality: JPEG quality (1-100)
+         frame_prefix: Prefix for saved frames
+         use_hw_accel: Enable NVIDIA hardware decoding
+         hw_device: GPU device (e.g., 'cuda:0')
+
+     Returns:
+         Dict of {timestamp: frame_path}
+     """
+     result = {}
+     try:
+         video_path = Path(video_path).absolute()
+         output_dir = Path(output_dir).absolute()
+
+         if not video_path.exists():
+             raise ValueError(f"Video file not found: {video_path}")
+
+         frames_dir = output_dir / "frames"
+         frames_dir.mkdir(parents=True, exist_ok=True)
+
+         # Configure hardware acceleration
+         options = {}
+         if use_hw_accel:
+             options.update({
+                 'hwaccel': 'cuda',
+                 'hwaccel_device': hw_device,
+                 'hwaccel_output_format': 'cuda'  # Keep frames in GPU memory
+             })
+
+         # Open video with hardware acceleration
+         container = av.open(str(video_path), options=options)
+         video_stream = next(s for s in container.streams if s.type == 'video')
+
+         fps = float(video_stream.average_rate)
+         if fps <= 0:
+             raise RuntimeError("Invalid frame rate")
+
+         frame_interval = max(1, int(round(fps * time_step)))
+         frame_count = 0
+
+         for frame in container.decode(video_stream):
+             if frame_count % frame_interval == 0:
+                 current_time = float(frame.pts * video_stream.time_base)
+                 hh = int(current_time // 3600)
+                 mm = int((current_time % 3600) // 60)
+                 ss = current_time % 60
+
+                 timestamp = f"{hh:02d}:{mm:02d}:{ss:06.3f}"
+                 safe_timestamp = timestamp.replace(':', '_').replace('.', '_')
+                 frame_path = frames_dir / f"{frame_prefix}_{safe_timestamp}.jpg"
+
+                 # Convert GPU frame to CPU if needed
+                 if hasattr(frame, 'to_ndarray'):  # CUDA frame
+                     img = frame.to_ndarray(format='rgb24')
+                     img = av.VideoFrame.from_ndarray(img, format='rgb24')
+                 else:
+                     img = frame
+
+                 img.to_image().save(str(frame_path), quality=quality)
+                 result[timestamp] = str(frame_path)
+
+             frame_count += 1
+
+         return result
+
+     except Exception as e:
+         for path in result.values():
+             try: os.remove(path)
+             except: pass
+         raise RuntimeError(f"Frame extraction failed: {str(e)}")
+
+
+ def generate_frame_descriptions(frames_dict: Dict, custom_prompt: str = None, device: str = "cuda", torch_dtype: torch.dtype = torch.float16):
+     """
+     Generate descriptions for video frames with progress tracking
+
+     Args:
+         frames_dict (dict): Dictionary of {timestamp: image_path} pairs
+         custom_prompt (str, optional): Custom prompt to use for all frames.
+             Can include {timestamp} placeholder.
+
+     Returns:
+         dict: Dictionary of {timestamp: description} pairs
+     """
+
+     # Instantiating model components
+     model = initialize_model(device, torch_dtype)
+     processor = initialize_processor()
+     descriptions = {}
+
+     with tqdm(
+         frames_dict.items(),
+         total=len(frames_dict),
+         desc="Processing frames",
+         unit="frame"
+     ) as progress_bar:
+
+         for timestamp, image_path in progress_bar:
+             try:
+                 progress_bar.set_postfix({"current": timestamp})
+
+                 # Prepare model input with custom prompt
+                 messages = create_prompt_message(image_path, timestamp, custom_prompt)
+                 inputs = prepare_model_inputs(processor, messages, device)
+
+                 # Generate and process output
+                 generated_ids = generate_description(model, inputs)
+                 output_text = process_model_output(processor, inputs, generated_ids)
+                 descriptions[timestamp] = output_text
+
+                 # Memory cleanup
+                 del inputs, generated_ids
+                 clear_gpu_cache()
+
+             except Exception as e:
+                 print(f"\nError processing frame {timestamp}: {str(e)}")
+                 descriptions[timestamp] = f"Description generation error: {str(e)}"
+                 clear_gpu_cache()
+
+     # Final cleanup
+     del model, processor
+     clear_gpu_cache()
+
+     return descriptions
+
+
+
+ if __name__ == "__main__":
+     video_url = "https://www.youtube.com/watch?v=L1vXCYZAYYM"
+     video_data = download_youtube_video(video_url)
+     frames = extract_frames_with_timestamps(video_path=video_data['video_path'], output_dir=video_data['data_path'], time_step=10)
      print(frames)
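
For orientation, the two functions above chain together as follows: a minimal end-to-end sketch, assuming a CUDA-capable host and the repo's downloader module (the URL mirrors the __main__ block above; the prompt text is illustrative):

    # End-to-end sketch: download, sample a frame every 5 s, describe each frame.
    from downloader import download_youtube_video
    from video_processing import extract_frames_with_timestamps, generate_frame_descriptions

    video_data = download_youtube_video("https://www.youtube.com/watch?v=L1vXCYZAYYM")
    frames = extract_frames_with_timestamps(
        video_path=video_data['video_path'],
        output_dir=video_data['data_path'],
        time_step=5.0,
    )
    descriptions = generate_frame_descriptions(
        frames,
        custom_prompt="Timestamp {timestamp}. Summarize the visible action in one sentence.",
    )
    for ts in sorted(descriptions):
        print(ts, descriptions[ts][:100])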
packages.txt CHANGED
@@ -1,16 +1,16 @@
+ ffmpeg
+ git
+ git-lfs
+ htop
+ iotop
+ libxml2
+ libopenblas-dev
+ libssl-dev
+ python3-pip
+ python3-wheel
+ python3-setuptools
+ python-is-python3
+ wget
+ zlib1g
+ net-tools
  curl