Artyom Boyko committed on
Commit
c867d05
·
1 Parent(s): 56f0d7c

Testing a new variant of Gradio MCP server.

Browse files
Files changed (2) hide show
  1. app_srv/app_srv.py +157 -95
  2. requirements.txt +1 -1
app_srv/app_srv.py CHANGED
@@ -1,151 +1,213 @@
1
  import gradio as gr
2
  import torch
3
  import os
4
- from PIL import Image
5
- import base64
6
- from io import BytesIO
7
- from pathlib import Path
 
8
  from downloader import download_youtube_video
9
  from video_processing import extract_frames_with_timestamps, generate_frame_descriptions
10
  from audio_processing import transcribe_audio
11
  from model_api import get_device_and_dtype
12
 
13
- # Инициализация устройства и типа данных
14
  device, dtype = get_device_and_dtype()
15
 
16
- # Промпт по умолчанию
17
  DEFAULT_PROMPT = "Analyze the frame, describe what objects are in the frame, how many there are, the background and the action taking place."
18
 
19
- def process_video(youtube_url: str, prompt: str, quality: str, time_step: float):
20
- """Основная функция обработки видео"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  try:
 
 
 
 
 
22
 
23
- # 1. Скачивание видео
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  video_data = download_youtube_video(
25
  url=youtube_url,
26
  video_quality=quality
27
  )
28
 
29
- # 2. Извлечение кадров с CUDA
30
- frames = extract_frames_with_timestamps(
 
31
  video_path=video_data['video_path'],
32
  output_dir=video_data['data_path'],
33
  time_step=time_step,
34
  hw_device="cuda"
35
  )
36
 
37
- # 3. Генерация описаний
38
  descriptions = generate_frame_descriptions(
39
- frames_dict=frames,
40
  custom_prompt=prompt,
41
  device=device,
42
  torch_dtype=dtype
43
  )
44
 
45
- # 4. Транскрипция аудио
46
- transcription = transcribe_audio(video_data['audio_path'])
47
 
48
- # 5. Форматирование результатов
49
- results_html = []
50
- for timestamp, frame_path in frames.items():
51
- # Получаем описание для текущего кадра
52
- frame_desc = descriptions.get(timestamp, "No description available")
53
-
54
- # Обработка изображения
55
- if os.path.exists(frame_path):
56
- with Image.open(frame_path) as img:
57
- img.thumbnail((400, 400))
58
- buffered = BytesIO()
59
- img.save(buffered, format="JPEG", quality=85)
60
- img_base64 = base64.b64encode(buffered.getvalue()).decode()
61
- img_html = f'<img src="data:image/jpeg;base64,{img_base64}" style="max-height:300px; border-radius:5px; border:1px solid #ddd;">'
62
- else:
63
- img_html = f'<div style="color:red; padding:10px;">Image not found</div>'
64
-
65
- # Форматирование HTML блока
66
- frame_html = f"""
67
- <div style="border:1px solid #e0e0e0; border-radius:8px; padding:15px; margin-bottom:20px; background:#f8f8f8;">
68
- <div style="display:flex; gap:20px; align-items:flex-start;">
69
- <div style="flex:1; min-width:300px; display:flex; justify-content:center; align-items:center;">
70
- {img_html}
71
- </div>
72
- <div style="flex:2;">
73
- <h3 style="margin-top:0; color:#222; font-size:16px; font-weight:600;">Timestamp: {timestamp}</h3>
74
- <div style="background:#fff; padding:15px; border-radius:6px; border-left:4px solid #4285f4;
75
- color:#333; font-size:14px; line-height:1.5; box-shadow:0 1px 3px rgba(0,0,0,0.1);">
76
- {frame_desc}
77
- </div>
78
- </div>
79
- </div>
80
- </div>
81
- """
82
- results_html.append(frame_html)
83
 
84
- return "\n".join(results_html), transcription
 
 
 
85
 
86
  except Exception as e:
87
- return f"Processing error: {str(e)}", ""
 
 
 
 
 
 
 
 
 
88
 
89
- # Создание Gradio интерфейса
90
  with gr.Blocks(title="Video Analysis Tool", css="""
91
  .gradio-container {max-width: 1200px !important}
92
- .frame-results {max-height: 70vh; overflow-y: auto; padding-right:10px;}
93
  .output-box {border-radius: 8px !important; margin-top:15px;}
94
- .audio-output {background:#f8f8f8 !important; padding:15px !important;}
95
  h1 {color: #1a73e8 !important;}
 
 
 
 
 
 
96
  """) as demo:
97
 
98
  gr.Markdown("""
99
  # 🎥 Video Analysis Tool
100
- Analyze YouTube videos - get frame-by-frame descriptions with timestamps
101
  """)
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  with gr.Row():
104
- with gr.Column(scale=1, min_width=400):
105
- youtube_url = gr.Textbox(
106
- label="YouTube Video URL",
107
- value="https://www.youtube.com/watch?v=FK3dav4bA4s&t=1s",
108
- lines=1
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  )
110
- prompt = gr.Textbox(
111
- label="Analysis Prompt",
112
- value=DEFAULT_PROMPT,
113
- lines=5,
114
- max_lines=10
 
115
  )
116
- with gr.Row():
117
- quality = gr.Dropdown(
118
- label="Video Quality",
119
- choices=[144, 240, 360, 480, 720, 1080, 1440, 2160],
120
- value=720
121
- )
122
- time_step = gr.Slider(
123
- label="Frame Interval (seconds)",
124
- minimum=0.5,
125
- maximum=30,
126
- step=0.5,
127
- value=2
128
- )
129
- submit_btn = gr.Button("Analyze Video", variant="primary")
130
 
131
- with gr.Column(scale=2):
132
- video_output = gr.HTML(
133
- label="Frame Analysis Results",
134
- elem_classes=["frame-results", "output-box"]
135
- )
136
- audio_output = gr.Textbox(
137
- label="Audio Transcription",
138
- interactive=False,
139
- lines=10,
140
- max_lines=15,
141
- elem_classes=["output-box", "audio-output"]
142
- )
143
 
 
144
  submit_btn.click(
145
- fn=process_video,
146
  inputs=[youtube_url, prompt, quality, time_step],
147
- outputs=[video_output, audio_output]
148
  )
149
 
150
  if __name__ == "__main__":
151
- demo.launch(mcp_server=True)
 
1
  import gradio as gr
2
  import torch
3
  import os
4
+ import json
5
+ import requests # Added for making HTTP requests
6
+ import socket # Added for getting hostname
7
+
8
+ # Import your modules
9
  from downloader import download_youtube_video
10
  from video_processing import extract_frames_with_timestamps, generate_frame_descriptions
11
  from audio_processing import transcribe_audio
12
  from model_api import get_device_and_dtype
13
 
14
+ # Initialize device and data type
15
  device, dtype = get_device_and_dtype()
16
 
17
+ # Default prompt
18
  DEFAULT_PROMPT = "Analyze the frame, describe what objects are in the frame, how many there are, the background and the action taking place."
19
 
20
+ # --- FUNCTION TO GET PUBLIC IP AND HOSTNAME (NOT FOR MCP) ---
21
+ def get_public_ip_and_hostname() -> str:
22
+ """
23
+ Retrieves the public IP address and the hostname of the machine.
24
+ This function is intended for display purposes within the Gradio UI
25
+ and should NOT be exposed via MCP API.
26
+ """
27
+ public_ip = "N/A"
28
+ hostname = "N/A"
29
+
30
+ try:
31
+ # Get public IP address
32
+ response = requests.get("https://api.ipify.org?format=json", timeout=5)
33
+ response.raise_for_status() # Raise an exception for HTTP errors
34
+ public_ip = response.json().get("ip", "N/A")
35
+ except requests.exceptions.RequestException as e:
36
+ print(f"Error getting public IP: {e}")
37
+ public_ip = f"Error: {e}"
38
+
39
  try:
40
+ # Get hostname
41
+ hostname = socket.gethostname()
42
+ except Exception as e:
43
+ print(f"Error getting hostname: {e}")
44
+ hostname = f"Error: {e}"
45
 
46
+ return f"Public IP: {public_ip} | Hostname: {hostname}"
47
+
48
+ # --- OPTIMIZED FUNCTION, RETURNING JSON STRING ---
49
+ def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: float) -> str:
50
+ """
51
+ Analyzes a YouTube video by downloading it, extracting frames, generating descriptions
52
+ for each frame, and transcribing the audio.
53
+
54
+ Args:
55
+ youtube_url (str): The URL of the YouTube video to analyze.
56
+ prompt (str): A custom prompt to guide the frame description generation.
57
+ quality (int): The desired video quality in pixels (e.g., 144, 240, 360, 480, 720, 1080, 1440, 2160).
58
+ Note: The actual quality might vary based on available streams.
59
+ time_step (float): The interval in seconds at which to extract frames. The lower the value, the better the quality of the analysis result.
60
+
61
+ Returns:
62
+ str: A JSON formatted string containing the analysis results.
63
+ The JSON structure includes:
64
+ - "status": "success" if the analysis was successful, "error" otherwise.
65
+ - "message": A brief description of the outcome (empty string for success,
66
+ or an error message for error).
67
+ - "frame_analysis": A list of dictionaries, where each dictionary represents a frame
68
+ and contains "timestamp" and "description".
69
+ - "audio_transcription": The transcribed text of the video's audio.
70
+
71
+ Raises:
72
+ Exception: Catches any exceptions during the process and returns them
73
+ within the JSON output for user feedback.
74
+ """
75
+
76
+ results = {
77
+ "status": "success", # Default to success
78
+ "message": "", # Default message is empty for success
79
+ "frame_analysis": [],
80
+ "audio_transcription": ""
81
+ }
82
+
83
+ try:
84
+ # 1. Download video
85
  video_data = download_youtube_video(
86
  url=youtube_url,
87
  video_quality=quality
88
  )
89
 
90
+ # 2. Extract frames
91
+ # frames_dict: {timestamp: path_to_frame_image}
92
+ frames_dict = extract_frames_with_timestamps(
93
  video_path=video_data['video_path'],
94
  output_dir=video_data['data_path'],
95
  time_step=time_step,
96
  hw_device="cuda"
97
  )
98
 
99
+ # 3. Generate descriptions for frames
100
  descriptions = generate_frame_descriptions(
101
+ frames_dict=frames_dict,
102
  custom_prompt=prompt,
103
  device=device,
104
  torch_dtype=dtype
105
  )
106
 
107
+ # 4. Transcribe audio
108
+ transcription_text = transcribe_audio(video_data['audio_path'])
109
 
110
+ # 5. Formulate results structure
111
+ for timestamp, frame_path in frames_dict.items():
112
+ description = descriptions.get(timestamp, "No description available")
113
+ results["frame_analysis"].append({
114
+ "timestamp": timestamp,
115
+ "description": description,
116
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
+ results["audio_transcription"] = transcription_text
119
+
120
+ # Return formatted JSON string
121
+ return json.dumps(results, indent=2, ensure_ascii=False)
122
 
123
  except Exception as e:
124
+ error_message = f"Processing error: {str(e)}"
125
+ print(f"An error occurred during video analysis: {e}") # For debugging
126
+
127
+ results["status"] = "error" # Set status to error
128
+ results["message"] = error_message # Set error message
129
+ results["frame_analysis"] = [] # Clear frame results on error
130
+ results["audio_transcription"] = "" # Clear transcription on error
131
+
132
+ # In case of error, return JSON string with error details
133
+ return json.dumps(results, indent=2, ensure_ascii=False)
134
 
135
+ # Create Gradio interface
136
  with gr.Blocks(title="Video Analysis Tool", css="""
137
  .gradio-container {max-width: 1200px !important}
 
138
  .output-box {border-radius: 8px !important; margin-top:15px;}
139
+ .results-output {background:#f8f8f8 !important; padding:15px !important;}
140
  h1 {color: #1a73e8 !important;}
141
+ .ip-info {
142
+ font-size: 0.9em;
143
+ color: #666;
144
+ margin-top: -15px; /* Adjust as needed to pull it closer to the title */
145
+ margin-bottom: 10px;
146
+ }
147
  """) as demo:
148
 
149
  gr.Markdown("""
150
  # 🎥 Video Analysis Tool
151
+ Analyze YouTube videos - get frame-by-frame descriptions with timestamps and audio transcription.
152
  """)
153
 
154
+ # NEW: Display Public IP and Hostname
155
+ # We use a gr.Markdown component to display the text.
156
+ # The key here is that get_public_ip_and_hostname is NOT directly an input/output
157
+ # of a button. It's called once when the app loads, or its output is static.
158
+ # To prevent it from being in MCP API, we typically don't expose it via gr.Interface
159
+ # or explicitly set show_api=False for the component if it were interactive.
160
+ # Here, it's a simple call rendered in Markdown, so it won't be exposed.
161
+ gr.Markdown(
162
+ f"<div class='ip-info'>{get_public_ip_and_hostname()}</div>",
163
+ # This component itself does not expose an API endpoint if it's just static Markdown
164
+ # or updated via a gr.State and not directly via a `fn` in `click` with `show_api=True`.
165
+ # The key is that the function `get_public_ip_and_hostname` is called
166
+ # during the UI definition, not as an API endpoint.
167
+ )
168
+
169
  with gr.Row():
170
+ youtube_url = gr.Textbox(
171
+ label="YouTube Video URL",
172
+ value="https://www.youtube.com/watch?v=FK3dav4bA4s",
173
+ lines=1,
174
+ scale=3
175
+ )
176
+ prompt = gr.Textbox(
177
+ label="Analysis Prompt",
178
+ value=DEFAULT_PROMPT,
179
+ lines=3,
180
+ max_lines=5,
181
+ scale=4
182
+ )
183
+ with gr.Column(scale=2, min_width=200):
184
+ quality = gr.Dropdown(
185
+ label="Video Quality",
186
+ choices=[144, 240, 360, 480, 720, 1080, 1440, 2160],
187
+ value=480
188
  )
189
+ time_step = gr.Slider(
190
+ label="Frame Interval (seconds)",
191
+ minimum=0.5,
192
+ maximum=30,
193
+ step=0.5,
194
+ value=30
195
  )
196
+ submit_btn = gr.Button("Start Video Analysis", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
+ # Next row: Analysis results (gr.JSON)
199
+ with gr.Row():
200
+ results_json_viewer = gr.JSON(
201
+ label="Raw Analysis Results (JSON)",
202
+ elem_classes=["output-box", "results-output"],
203
+ )
 
 
 
 
 
 
204
 
205
+ # Direct binding of the button to the single processing function
206
  submit_btn.click(
207
+ fn=analyze_video_data,
208
  inputs=[youtube_url, prompt, quality, time_step],
209
+ outputs=[results_json_viewer]
210
  )
211
 
212
  if __name__ == "__main__":
213
+ demo.launch(share=False, mcp_server=True)
requirements.txt CHANGED
@@ -5,7 +5,7 @@ tqdm==4.67.1
5
  datasets==3.6.0
6
  evaluate==0.4.3
7
  accelerate==1.7.0
8
- gradio==5.32.1
9
  gradio[mcp]
10
  ipython==9.3.0
11
  ipywidgets==8.1.7
 
5
  datasets==3.6.0
6
  evaluate==0.4.3
7
  accelerate==1.7.0
8
+ gradio==5.33.0
9
  gradio[mcp]
10
  ipython==9.3.0
11
  ipywidgets==8.1.7