Artyom Boyko committed on
Commit
95b1e18
·
1 Parent(s): c867d05

Solving the YouTube problem.

Browse files
app_srv/app_srv.py CHANGED
@@ -2,8 +2,6 @@ import gradio as gr
2
  import torch
3
  import os
4
  import json
5
- import requests # Added for making HTTP requests
6
- import socket # Added for getting hostname
7
 
8
  # Import your modules
9
  from downloader import download_youtube_video
@@ -15,35 +13,14 @@ from model_api import get_device_and_dtype
15
  device, dtype = get_device_and_dtype()
16
 
17
  # Default prompt
18
- DEFAULT_PROMPT = "Analyze the frame, describe what objects are in the frame, how many there are, the background and the action taking place."
19
-
20
- # --- FUNCTION TO GET PUBLIC IP AND HOSTNAME (NOT FOR MCP) ---
21
- def get_public_ip_and_hostname() -> str:
22
- """
23
- Retrieves the public IP address and the hostname of the machine.
24
- This function is intended for display purposes within the Gradio UI
25
- and should NOT be exposed via MCP API.
26
- """
27
- public_ip = "N/A"
28
- hostname = "N/A"
29
-
30
- try:
31
- # Get public IP address
32
- response = requests.get("https://api.ipify.org?format=json", timeout=5)
33
- response.raise_for_status() # Raise an exception for HTTP errors
34
- public_ip = response.json().get("ip", "N/A")
35
- except requests.exceptions.RequestException as e:
36
- print(f"Error getting public IP: {e}")
37
- public_ip = f"Error: {e}"
38
-
39
- try:
40
- # Get hostname
41
- hostname = socket.gethostname()
42
- except Exception as e:
43
- print(f"Error getting hostname: {e}")
44
- hostname = f"Error: {e}"
45
-
46
- return f"Public IP: {public_ip} | Hostname: {hostname}"
47
 
48
  # --- OPTIMIZED FUNCTION, RETURNING JSON STRING ---
49
  def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: float) -> str:
@@ -62,8 +39,8 @@ def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: f
62
  str: A JSON formatted string containing the analysis results.
63
  The JSON structure includes:
64
  - "status": "success" if the analysis was successful, "error" otherwise.
65
- - "message": A brief description of the outcome (empty string for success,
66
- or an error message for error).
67
  - "frame_analysis": A list of dictionaries, where each dictionary represents a frame
68
  and contains "timestamp" and "description".
69
  - "audio_transcription": The transcribed text of the video's audio.
@@ -74,21 +51,25 @@ def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: f
74
  """
75
 
76
  results = {
77
- "status": "success", # Default to success
78
- "message": "", # Default message is empty for success
79
  "frame_analysis": [],
80
  "audio_transcription": ""
81
  }
82
 
83
  try:
 
 
 
 
84
  # 1. Download video
85
  video_data = download_youtube_video(
86
  url=youtube_url,
87
- video_quality=quality
 
88
  )
89
 
90
  # 2. Extract frames
91
- # frames_dict: {timestamp: path_to_frame_image}
92
  frames_dict = extract_frames_with_timestamps(
93
  video_path=video_data['video_path'],
94
  output_dir=video_data['data_path'],
@@ -99,7 +80,7 @@ def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: f
99
  # 3. Generate descriptions for frames
100
  descriptions = generate_frame_descriptions(
101
  frames_dict=frames_dict,
102
- custom_prompt=prompt,
103
  device=device,
104
  torch_dtype=dtype
105
  )
@@ -116,6 +97,8 @@ def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: f
116
  })
117
 
118
  results["audio_transcription"] = transcription_text
 
 
119
 
120
  # Return formatted JSON string
121
  return json.dumps(results, indent=2, ensure_ascii=False)
@@ -124,12 +107,12 @@ def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: f
124
  error_message = f"Processing error: {str(e)}"
125
  print(f"An error occurred during video analysis: {e}") # For debugging
126
 
127
- results["status"] = "error" # Set status to error
128
- results["message"] = error_message # Set error message
129
- results["frame_analysis"] = [] # Clear frame results on error
130
- results["audio_transcription"] = "" # Clear transcription on error
131
 
132
- # In case of error, return JSON string with error details
133
  return json.dumps(results, indent=2, ensure_ascii=False)
134
 
135
  # Create Gradio interface
@@ -138,12 +121,6 @@ with gr.Blocks(title="Video Analysis Tool", css="""
138
  .output-box {border-radius: 8px !important; margin-top:15px;}
139
  .results-output {background:#f8f8f8 !important; padding:15px !important;}
140
  h1 {color: #1a73e8 !important;}
141
- .ip-info {
142
- font-size: 0.9em;
143
- color: #666;
144
- margin-top: -15px; /* Adjust as needed to pull it closer to the title */
145
- margin-bottom: 10px;
146
- }
147
  """) as demo:
148
 
149
  gr.Markdown("""
@@ -151,21 +128,7 @@ with gr.Blocks(title="Video Analysis Tool", css="""
151
  Analyze YouTube videos - get frame-by-frame descriptions with timestamps and audio transcription.
152
  """)
153
 
154
- # NEW: Display Public IP and Hostname
155
- # We use a gr.Markdown component to display the text.
156
- # The key here is that get_public_ip_and_hostname is NOT directly an input/output
157
- # of a button. It's called once when the app loads, or its output is static.
158
- # To prevent it from being in MCP API, we typically don't expose it via gr.Interface
159
- # or explicitly set show_api=False for the component if it were interactive.
160
- # Here, it's a simple call rendered in Markdown, so it won't be exposed.
161
- gr.Markdown(
162
- f"<div class='ip-info'>{get_public_ip_and_hostname()}</div>",
163
- # This component itself does not expose an API endpoint if it's just static Markdown
164
- # or updated via a gr.State and not directly via a `fn` in `click` with `show_api=True`.
165
- # The key is that the function `get_public_ip_and_hostname` is called
166
- # during the UI definition, not as an API endpoint.
167
- )
168
-
169
  with gr.Row():
170
  youtube_url = gr.Textbox(
171
  label="YouTube Video URL",
 
2
  import torch
3
  import os
4
  import json
 
 
5
 
6
  # Import your modules
7
  from downloader import download_youtube_video
 
13
  device, dtype = get_device_and_dtype()
14
 
15
  # Default prompt
16
+ DEFAULT_PROMPT = """
17
+ Present the frame analysis in the following format, focusing on the details in the frame:
18
+ FRAME: {timestamp} \n
19
+ OBJECTS: List of objects with their count, for example: Bengal tiger - 1, Volvo car - 1, Person - 2 (male, female). Don't count the mention of an object in the text with the video as a separate object. \n
20
+ TEXT: This is where you place the text that is present in the frame. Just bring the text present in the frame from left to right, top to bottom. \n
21
+ BACKGROUND: Description of background and surroundings, e.g.: Muddy brown water. A road in the distance. An abandoned building on the horizon. Describe only what is visible. \n
22
+ ACTION: A detailed description of what is happening in the frame, for example: A Bengal tiger is swimming in murky water, its head and part of its back are visible above the surface. A blue Volvo car is driving along the road in the distance. A part of a tree is visible in the right part of the frame.
23
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # --- OPTIMIZED FUNCTION, RETURNING JSON STRING ---
26
  def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: float) -> str:
 
39
  str: A JSON formatted string containing the analysis results.
40
  The JSON structure includes:
41
  - "status": "success" if the analysis was successful, "error" otherwise.
42
+ - "message": A brief description of the outcome (e.g., "Analysis completed successfully."
43
+ or an error message).
44
  - "frame_analysis": A list of dictionaries, where each dictionary represents a frame
45
  and contains "timestamp" and "description".
46
  - "audio_transcription": The transcribed text of the video's audio.
 
51
  """
52
 
53
  results = {
54
+ "status": "success",
55
+ "message": "Analysis completed successfully.",
56
  "frame_analysis": [],
57
  "audio_transcription": ""
58
  }
59
 
60
  try:
61
+
62
+ # YouTube
63
+ cookies = os.getenv("YOUTUBE_COOKIES")
64
+
65
  # 1. Download video
66
  video_data = download_youtube_video(
67
  url=youtube_url,
68
+ video_quality=quality,
69
+ youtube_cookies=cookies
70
  )
71
 
72
  # 2. Extract frames
 
73
  frames_dict = extract_frames_with_timestamps(
74
  video_path=video_data['video_path'],
75
  output_dir=video_data['data_path'],
 
80
  # 3. Generate descriptions for frames
81
  descriptions = generate_frame_descriptions(
82
  frames_dict=frames_dict,
83
+ custom_prompt=prompt, # Now `prompt` can contain the {timestamp} placeholder
84
  device=device,
85
  torch_dtype=dtype
86
  )
 
97
  })
98
 
99
  results["audio_transcription"] = transcription_text
100
+
101
+ print("Video processing complete")
102
 
103
  # Return formatted JSON string
104
  return json.dumps(results, indent=2, ensure_ascii=False)
 
107
  error_message = f"Processing error: {str(e)}"
108
  print(f"An error occurred during video analysis: {e}") # For debugging
109
 
110
+ results["status"] = "error"
111
+ results["message"] = error_message
112
+ results["frame_analysis"] = []
113
+ results["audio_transcription"] = ""
114
 
115
+ # In case of error, return JSON string with error details
116
  return json.dumps(results, indent=2, ensure_ascii=False)
117
 
118
  # Create Gradio interface
 
121
  .output-box {border-radius: 8px !important; margin-top:15px;}
122
  .results-output {background:#f8f8f8 !important; padding:15px !important;}
123
  h1 {color: #1a73e8 !important;}
 
 
 
 
 
 
124
  """) as demo:
125
 
126
  gr.Markdown("""
 
128
  Analyze YouTube videos - get frame-by-frame descriptions with timestamps and audio transcription.
129
  """)
130
 
131
+ # Top row: Video URL, prompt, analysis parameters, and analyze button
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  with gr.Row():
133
  youtube_url = gr.Textbox(
134
  label="YouTube Video URL",
app_srv/audio_processing.py CHANGED
@@ -49,5 +49,5 @@ if __name__ == "__main__":
49
 
50
  selected_device, selected_dtype = get_device_and_dtype()
51
 
52
- result = transcribe_audio("/workspaces/Video_Analyser/app_srv/temp/a28af289-377d-468d-b0eb-ed0f7dcd2ab3/audio.mp3.mp3", selected_device, selected_dtype)
53
  print(result)
 
49
 
50
  selected_device, selected_dtype = get_device_and_dtype()
51
 
52
+ result = transcribe_audio("/workspaces/Video_Analyser/app_srv/downloads/45677153-510d-4f47-95ee-c1b4b0843433/audio.mp3.mp3", selected_device, selected_dtype)
53
  print(result)
app_srv/downloader.py CHANGED
@@ -9,7 +9,8 @@ from datetime import datetime
9
 
10
  def download_youtube_video(url: str,
11
  base_dir: str = None,
12
- video_quality: int = 720) -> Dict[str, str]:
 
13
  """
14
  Downloads video and audio from YouTube, saving them to a unique GUID folder.
15
  Metadata is saved in JSON format including download datetime and timezone.
@@ -75,6 +76,7 @@ def download_youtube_video(url: str,
75
  'quiet': True,
76
  'no_warnings': True,
77
  'restrict_filenames': True,
 
78
  }
79
 
80
  with yt_dlp.YoutubeDL(video_opts) as ydl:
@@ -90,6 +92,7 @@ def download_youtube_video(url: str,
90
  'preferredcodec': 'mp3',
91
  'preferredquality': '128',
92
  }],
 
93
  }
94
 
95
  with yt_dlp.YoutubeDL(audio_opts) as ydl:
 
9
 
10
  def download_youtube_video(url: str,
11
  base_dir: str = None,
12
+ video_quality: int = 720,
13
+ youtube_cookies: str = "") -> Dict[str, str]:
14
  """
15
  Downloads video and audio from YouTube, saving them to a unique GUID folder.
16
  Metadata is saved in JSON format including download datetime and timezone.
 
76
  'quiet': True,
77
  'no_warnings': True,
78
  'restrict_filenames': True,
79
+ 'cookiefile': youtube_cookies,
80
  }
81
 
82
  with yt_dlp.YoutubeDL(video_opts) as ydl:
 
92
  'preferredcodec': 'mp3',
93
  'preferredquality': '128',
94
  }],
95
+ 'cookiefile': youtube_cookies,
96
  }
97
 
98
  with yt_dlp.YoutubeDL(audio_opts) as ydl:
app_srv/test.ipynb CHANGED
@@ -425,49 +425,10 @@
425
  },
426
  {
427
  "cell_type": "code",
428
- "execution_count": 6,
429
  "id": "e6d0e5fd",
430
  "metadata": {},
431
- "outputs": [
432
- {
433
- "name": "stdout",
434
- "output_type": "stream",
435
- "text": [
436
- "Используемое устройство: cuda:0\n"
437
- ]
438
- },
439
- {
440
- "name": "stderr",
441
- "output_type": "stream",
442
- "text": [
443
- "Device set to use cuda:0\n"
444
- ]
445
- },
446
- {
447
- "name": "stdout",
448
- "output_type": "stream",
449
- "text": [
450
- "Начало транскрипции файла: /workspaces/Video_Analyser/app_srv/temp/a6fba6eb-038e-4f4e-bcb7-f41d87ee1422/audio.mp3.mp3\n"
451
- ]
452
- },
453
- {
454
- "name": "stderr",
455
- "output_type": "stream",
456
- "text": [
457
- "/usr/local/lib/python3.12/dist-packages/transformers/models/whisper/generation_whisper.py:573: FutureWarning: The input name `inputs` is deprecated. Please make sure to use `input_features` instead.\n",
458
- " warnings.warn(\n",
459
- "Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.\n"
460
- ]
461
- },
462
- {
463
- "name": "stdout",
464
- "output_type": "stream",
465
- "text": [
466
- "Транскрипция завершена.\n",
467
- "{0.0: \"with their signature orange fur and black stripes, Tigers have become icons of beauty, power, and the importance of conservation. Tigers have evolved into six subspecies. to six subspecies. The tiger's tail of evolution can be traced back to about two million years ago when the earliest known tiger ancestor left Africa and ventured into Asia. Over time, the big cat split into nine subspecies with six still alive today. The most numerous subspecies is the Bengal tiger, accounting for approximately 50% of the tiger population worldwide. Tigers are the world's largest cats. On average, the big cats weigh about 450 pounds, but the largest is the Siberian tiger subspecies, measuring up to 13 feet long and weighing up to 660 pounds. This extra weight is primarily because of large powerful muscles. Unlike lions, the second largest of the big cats, tigers have more muscle mass and are therefore heavier. Tigers have webbed toes, often living near bodies of water, tigers have adapted for a semi-aquatic lifestyle. They appear to enjoy being in the water, unlike most cats, and are excellent swimmers using their powerful muscles to propel their bodies in the water. They've also evolved to have webbing between their toes. The webbing allows the big cats to push around a greater volume of water with each stroke of their paws. White tigers are a type of Bengal tiger. The classic black and orange coloration of most tigers is caused by the pigments U melanin, which turns for black, and pheomelanin, which turns for black, and pheomelanin, which turns fur orange. The production of pheomelanin is triggered by the gene SLC45A2. White tigers carry a mutated version of this gene which prevents them from producing orange pigmentation. Fewer than 4,000 tigers remain in the wild. 
At the turn of the 20th century, approximately 100,000 tigers roamed the wild, living as far west as Turkey and as far north as Russia, but due to deforestation, human development, and poaching, their range severely decreased and within 100 years, the world's tiger population declined by about 96%. by about 96%. Over the past few decades, programs have been put in place to protect tigers and their habitats with the help of the global community, tiger populations may slowly rebound. slowly rebound. Music. you Thank you. you you you you you you Music playing you Music playing you you you Music playing you you\"}\n"
468
- ]
469
- }
470
- ],
471
  "source": [
472
  "import torch\n",
473
  "from transformers import pipeline\n",
@@ -537,20 +498,10 @@
537
  },
538
  {
539
  "cell_type": "code",
540
- "execution_count": 22,
541
  "id": "ca9a4832",
542
  "metadata": {},
543
- "outputs": [
544
- {
545
- "name": "stderr",
546
- "output_type": "stream",
547
- "text": [
548
- "Device set to use cuda:0\n",
549
- "/usr/local/lib/python3.12/dist-packages/transformers/models/whisper/generation_whisper.py:573: FutureWarning: The input name `inputs` is deprecated. Please make sure to use `input_features` instead.\n",
550
- " warnings.warn(\n"
551
- ]
552
- }
553
- ],
554
  "source": [
555
  "from transformers import pipeline\n",
556
  "import librosa\n",
@@ -571,21 +522,10 @@
571
  },
572
  {
573
  "cell_type": "code",
574
- "execution_count": 23,
575
  "id": "7cd4e28e",
576
  "metadata": {},
577
- "outputs": [
578
- {
579
- "data": {
580
- "text/plain": [
581
- "\" With their signature orange fur and black stripes, tigers have become icons of beauty, power, and the importance of conservation. Tigers have evolved into six subspecies. The tiger's tale of evolution can be traced back to about two million years ago when the earliest known tiger ancestor left Africa and ventured into Asia. Over time, the big cats split into nine subspecies, with six still alive today. The most numerous subspecies is the Bengal tiger, accounting for approximately 50% of the tiger population worldwide. Tigers are the world's largest cats. On average, the big cats weigh about 450 pounds, but the largest is the Siberian tiger subspecies, measuring up to 13 feet long and weighing up to 660 pounds. This extra weight is primarily because of large powerful muscles. Unlike lions, the second largest of the big cats, tigers have more muscle mass and are therefore heavier. Tigers have webbed toes. Even living near bodies of water, tigers have adapted for a semi-aquatic lifestyle. They appear to enjoy being in the water, unlike most cats, and are excellent swimmers, using their powerful muscles to propel their bodies in the water. They've also evolved to have webbing between their toes. The webbing allows the big cats to push around a greater volume of water with each stroke of their paws. White tigers are a type of Bengal tiger. The classic black and orange coloration of most tigers is caused by the pigments U-melanin which turns for black and pheomelanin which turns for orange. The production of pheomelanin is triggered by the gene SLC45A2. White tigers carry a mutated version of this gene which prevents them from producing orange pigmentation. Fewer than 4,000 tigers remain in the wild. At the turn of the 20th century, approximately 100,000 tigers roamed the wild, living as far west as Turkey and as far north as Russia. 
But due to deforestation, human development, and poaching, their range severely decreased, within 100 years, the world's tiger population declined by about 96%. Over the past few decades, programs have been put in place to protect tigers and their habitats. With the help of the global community, tiger populations may slowly rebound. Thank you very much for watching this video, and I'll see you in the next one.\""
582
- ]
583
- },
584
- "execution_count": 23,
585
- "metadata": {},
586
- "output_type": "execute_result"
587
- }
588
- ],
589
  "source": [
590
  "result"
591
  ]
 
425
  },
426
  {
427
  "cell_type": "code",
428
+ "execution_count": null,
429
  "id": "e6d0e5fd",
430
  "metadata": {},
431
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  "source": [
433
  "import torch\n",
434
  "from transformers import pipeline\n",
 
498
  },
499
  {
500
  "cell_type": "code",
501
+ "execution_count": null,
502
  "id": "ca9a4832",
503
  "metadata": {},
504
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
505
  "source": [
506
  "from transformers import pipeline\n",
507
  "import librosa\n",
 
522
  },
523
  {
524
  "cell_type": "code",
525
+ "execution_count": null,
526
  "id": "7cd4e28e",
527
  "metadata": {},
528
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
529
  "source": [
530
  "result"
531
  ]
packages.txt CHANGED
@@ -12,4 +12,5 @@ python3-setuptools
12
  python-is-python3
13
  wget
14
  zlib1g
15
- net-tools
 
 
12
  python-is-python3
13
  wget
14
  zlib1g
15
+ net-tools
16
+ curl
requirements.txt CHANGED
@@ -14,4 +14,5 @@ ipykernel==6.29.5
14
  ipywidgets==8.1.7
15
  yt-dlp==2025.5.22
16
  qwen-vl-utils==0.0.11
17
- librosa==0.11.0
 
 
14
  ipywidgets==8.1.7
15
  yt-dlp==2025.5.22
16
  qwen-vl-utils==0.0.11
17
+ librosa==0.11.0
18
+ gradio_client