# Source: Hugging Face Space "Aura_AI_Scan", file app_srv/app_srv.py
# Last commit: "Update app_srv/app_srv.py" (9e8bc02, verified), author: artyomboyko
import base64
import html
import json
import logging
import os
import shutil
import tempfile
import time  # To simulate delays and show progress more smoothly
from pathlib import Path

import gradio as gr
import torch

from downloader import download_youtube_video
from video_processing import extract_frames_with_timestamps, generate_frame_descriptions
from audio_processing import transcribe_audio
from model_api import get_device_and_dtype
# Resolve the compute device and dtype once, when the module is imported; both values
# are forwarded unchanged to generate_frame_descriptions() by the analysis functions.
device, dtype = get_device_and_dtype()
# Markdown shown at the top of the Gradio page (rendered via gr.Markdown).
# The rocket emoji in the title was stored as mis-decoded UTF-8 bytes and is restored here.
gui_header_element = """# 🚀 Aura AI Scan: Deep Analysis of YouTube Videos
([alpha version](https://en.wikipedia.org/wiki/Software_release_life_cycle))
Aura AI Scan is an MCP tool designed to deeply analyse YouTube videos, providing a frame-by-frame description with timestamps and full transcription of the audio track.
The following technologies were used in the implementation:
- [VSCode devcontainer](https://code.visualstudio.com/docs/devcontainers/containers), to simplify the development process
- [Docker](https://huggingface.co/docs/hub/spaces-sdks-docker), to simplify local deployment. This space is also implemented as [HuggingFace Docker Space](https://huggingface.co/docs/hub/spaces-sdks-docker).
- You can view the final results on the server's complete GUI in three formats: visual, audio, and JSON.
- Flexible settings that can be changed (prompt used during analysis, quality of the analysed video, time interval between frames).
- ability to include audio and frames used in the analysis process in the resulting JSON.
***IMPORTANT NOTE:***
0. Links to plugins for exporting cookies: [Chrome](https://chromewebstore.google.com/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc)
[FireFox](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/)
1. If you get an error message like this:
```text
Error: Processing error: Media download error: ERROR: [youtube] FK3dav4bA4s: Sign in to confirm you're not a bot. Use --cookies-from-browser or --cookies for the authentication. See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies. Also see https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies for tips on effectively exporting YouTube cookies
```
Export cookies from your browser. ***DO NOT CLOSE THE YOUTUBE TAB AFTER EXPORTING COOKIES!*** Import cookies into GUI. And perform video analysis.
2. Do not forget to delete cookies after use. Self-deletion is not provided. Remember, this is a public space. If possible, clone it to private or use it locally as a Docker container.
3. ***This space will not work correctly without GPU.*** Analyzing video clips is a rather resource-intensive process. The minimum tested is Nvidia T4 small.
4. If you have any problems, feel free to open a discussion on the "Community" tab.
[Quick video tutorial](https://drive.google.com/file/d/1LiZ9v5KsT3C_pJ8xLYmYkZDiM54vftjm/view?usp=drive_link)
"""
# Default instruction text for the frame-description model. It is passed as `custom_prompt`
# by analyze_video_data() and is the initial value of the "Analysis Prompt" textbox.
# NOTE(review): the `{timestamp}` placeholders are not filled in anywhere in this file;
# presumably generate_frame_descriptions() substitutes them per frame - confirm there.
DEFAULT_PROMPT = """You are an expert at analyzing video, so pay close attention. Your main goal is to analyze the frame and find information in it to answer the MAIN QUESTION. Pay attention to details.
Provide the analysis for each frame in the following format, focusing on the frame at timestamp {timestamp}:
FRAME: {timestamp}
OBJECTS: List of objects with their count, for example: Bengal tiger - 1, Volvo car - 1, Person - 2 (male, female). Mentioning an object in the text on the frame does not count as a separate object.
If there are no objects, the field is equal to NONE.
BACKGROUND: Description of background and surroundings, e.g.: Muddy brown water. A road in the distance. An abandoned building on the horizon.
ACTION: A detailed description of what is happening in the frame, for example: A Bengal tiger swimming in murky water, its head and part of its back visible above the surface.
The shot is taken from above, above the tiger. A blue Volvo car is driving along the road in the distance. A part of a tree is visible in the right part of the frame.
If there are no actions, the field is equal to NONE.
RECOGNIZED TEXT: Any text recognized in the frame, e.g.: "STOP", "VOLVO", "EXIT 25". Only the text that is present in the frame, if it is not present, this field is NONE.
OBJECTS:
BACKGROUND:
ACTION:
RECOGNIZED TEXT:
"""
# Video heights accepted by analyze_video_data (same values as the "Video Quality" dropdown).
_ALLOWED_QUALITIES = (144, 240, 360, 480, 720, 1080, 1440, 2160)


def _normalize_quality(quality):
    """Return ``quality`` as a member of ``_ALLOWED_QUALITIES`` or raise ValueError."""
    choices = ", ".join(map(str, _ALLOWED_QUALITIES))
    rejection = f"Value: {quality} is not in the list of choices: [{choices}]"
    if isinstance(quality, str):
        # The value may arrive as text; convert it and keep the converted value
        # (the previous code validated the int but still passed the string on).
        try:
            quality = int(quality)
        except ValueError as ve:
            raise ValueError(rejection) from ve
    elif isinstance(quality, float) and quality.is_integer():
        quality = int(quality)
    if quality not in _ALLOWED_QUALITIES:
        raise ValueError(rejection)
    return quality


def _file_to_base64(path):
    """Return the content of ``path`` as base64 text, or "" when the file does not exist."""
    if not os.path.exists(path):
        return ""
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def analyze_video_data(
    youtube_url: str,
    quality: int = 720,
    time_step: float = 5.0,
    include_audio_data: bool = False,
    include_frame_data: bool = False
) -> str:
    """
    This tool returns a text description of the frames from a YouTube clip and a full transcription of the audio track of that clip.
    Analyzing clips can be time-consuming (depending on the specified quality). You should always wait for the process to complete.
    Args:
        youtube_url (str): The URL of the YouTube video to be analyzed.
        quality (int, optional): The desired video quality for download. Allowed values: 144, 240, 360, 480, 720, 1080, 1440, 2160. Default value: 720.
        time_step (float, optional): The interval in seconds at which frames will be extracted from the video. Default value: 5.0.
        include_audio_data (bool, optional): If True, the base64 encoded audio data (MP3) will be included in the JSON results. Default value: False.
        include_frame_data (bool, optional): If True, base64 encoded image data (JPG) for each extracted frame will be included in the JSON results. Default value: False.
    Returns:
        str: A JSON string containing the analysis results.
        On success, it includes 'status': 'success', 'frame_analysis' (list of dictionaries
        with 'timestamp', 'description', and optional 'image_base64'),
        'audio_transcription', and optional 'audio_base64'.
        On error, it includes 'status': 'error' and a 'message' detailing the error.
    """
    # NOTE(review): this function is registered with api_name="analyze_video_data" and the app
    # is launched with mcp_server=True; the docstring above is presumably what MCP clients see
    # as the tool description, so its wording is intentionally left unchanged.
    results = {
        "status": "success",
        "message": "",
        "frame_analysis": [],
        "audio_transcription": "",
        "audio_base64": "",
    }
    try:
        logging.debug("analyze_video_data received quality=%r (%s)", quality, type(quality).__name__)
        quality = _normalize_quality(quality)

        logging.info("Start downloading the video %s.", youtube_url)
        # Cookies are not forwarded to the downloader here (the youtube_cookies
        # argument was already commented out in the original call).
        video_data = download_youtube_video(
            url=youtube_url, video_quality=quality,  # youtube_cookies=cookies
        )

        logging.info("Start extracting frames from the video %s.", youtube_url)
        frames_dict = extract_frames_with_timestamps(
            video_path=video_data["video_path"],
            output_dir=video_data["data_path"],
            time_step=time_step,
            hw_device="cuda",
        )

        logging.info("Starting describing frames from video %s.", youtube_url)
        descriptions = generate_frame_descriptions(
            frames_dict=frames_dict,
            custom_prompt=DEFAULT_PROMPT,
            device=device,
            torch_dtype=dtype,
        )

        logging.info("Starting audio transcription %s.", youtube_url)
        transcription_text = transcribe_audio(video_data["audio_path"])

        for timestamp, frame_path in frames_dict.items():
            results["frame_analysis"].append({
                "timestamp": timestamp,
                "description": descriptions.get(timestamp, "No description available"),
                "image_base64": _file_to_base64(frame_path) if include_frame_data else "",
            })

        results["audio_transcription"] = transcription_text
        if include_audio_data:
            results["audio_base64"] = _file_to_base64(video_data["audio_path"])
        return json.dumps(results, indent=2, ensure_ascii=False)
    except Exception as e:
        # Top-level boundary of the tool: every failure is reported to the caller
        # as an error payload instead of propagating.
        error_message = f"Processing error in function analyze_video_data: {str(e)}"
        logging.exception(error_message)
        results["status"] = "error"
        results["message"] = error_message
        results["frame_analysis"] = []
        results["audio_transcription"] = ""
        results["audio_base64"] = ""
        return json.dumps(results, indent=2, ensure_ascii=False)
def get_video_html_from_json(json_string: str) -> str:
    """Render the analysis JSON as an HTML fragment for the "Video" tab.

    Args:
        json_string: JSON text with a 'status' key and, on success, a
            'frame_analysis' list of dicts ('timestamp', 'description',
            optional 'image_base64').

    Returns:
        One bordered <div> per frame; a red error paragraph when the payload
        reports an error, is not valid JSON, or cannot be processed; or a short
        notice when no frames are present.

    All values taken from the payload are HTML-escaped, because frame
    descriptions and error messages are free text that may contain '<' or '&'.
    """
    try:
        data = json.loads(json_string)
        if data["status"] == "error":
            return f"<p style='color:red;'>Error: {html.escape(str(data['message']))}</p>"

        frames = data["frame_analysis"]
        if not frames:
            return "<p>No frames analyzed or included.</p>"

        parts = []
        for frame in frames:
            timestamp = html.escape(str(frame.get("timestamp", "N/A")))
            description = html.escape(str(frame.get("description", "No description available")))
            image_base64 = frame.get("image_base64", "")
            parts.append("<div style='margin-bottom: 20px; border: 1px solid #eee; padding: 10px; border-radius: 8px;'>")
            parts.append(f"<h3>FRAME: {timestamp}</h3>")
            if image_base64:
                # Escaping leaves the base64 alphabet untouched but guards the quoted attribute.
                parts.append(
                    f"<img src='data:image/jpeg;base64,{html.escape(str(image_base64))}' "
                    "style='max-width: 100%; height: auto; border-radius: 4px; margin-bottom: 10px;'><br>"
                )
            else:
                parts.append("<p>Image data not included for this frame (checkbox 'Include Frame Data' was not selected).</p>")
            parts.append(f"<p><strong>Description:</strong> {description}</p>")
            parts.append("</div>")
        return "".join(parts)
    except json.JSONDecodeError:
        return "<p style='color:red;'>Invalid JSON response.</p>"
    except Exception as e:
        return f"<p style='color:red;'>Error processing video data for display: {html.escape(str(e))}</p>"
def get_audio_data_from_json(json_string: str) -> tuple[str, str | None]:
try:
data = json.loads(json_string)
if data["status"] == "error":
return f"Error: {data['message']}", None
transcription = data.get("audio_transcription", "No transcription available.")
audio_base64 = data.get("audio_base64", "")
if audio_base64:
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
temp_audio_file.write(base64.b64decode(audio_base64))
temp_audio_path = temp_audio_file.name
return transcription, temp_audio_path
else:
transcription += "\n\nAudio data not included (checkbox 'Include Audio Data' was not selected)."
return transcription, None
except json.JSONDecodeError:
return "Invalid JSON response for audio.", None
except Exception as e:
return f"Error processing audio data for display: {str(e)}", None
# Wrapper function for analysis with progress bar
def analyze_video_data_with_progress_wrapper(
    youtube_url: str,
    prompt: str,
    quality: int,
    time_step: float,
    include_audio_data: bool,
    include_frame_data: bool,
    progress=gr.Progress()
):
    """Run the full analysis pipeline while reporting progress, then yield the result JSON.

    Stages (with the progress fraction reported before each one): download (0),
    frame extraction (0.25), frame description with the user-supplied ``prompt`` (0.5),
    audio transcription (0.75), result assembly (0.9).

    Yields:
        A single JSON string. On success 'status' is 'success' and 'message' is
        'Analysis complete!'; on any exception 'status' is 'error', 'message'
        carries the error text and all data fields are empty.
    """
    report = {
        "status": "pending",
        "message": "Starting analysis...",
        "frame_analysis": [],
        "audio_transcription": "",
        "audio_base64": "",
    }
    try:
        # Stage 1: fetch the video and its audio track
        progress(0, desc="Downloading video...")
        media = download_youtube_video(
            url=youtube_url, video_quality=quality,  # youtube_cookies=cookies
        )

        # Stage 2: cut the video into timestamped frames
        progress(0.25, desc="Extracting frames...")
        frames = extract_frames_with_timestamps(
            video_path=media["video_path"],
            output_dir=media["data_path"],
            time_step=time_step,
            hw_device="cuda",
        )

        # Stage 3: describe every frame with the user-supplied prompt
        progress(0.5, desc="Generating frame descriptions...")
        captions = generate_frame_descriptions(
            frames_dict=frames,
            custom_prompt=prompt,
            device=device,
            torch_dtype=dtype,
        )

        # Stage 4: speech-to-text on the audio track
        progress(0.75, desc="Transcribing audio...")
        transcript = transcribe_audio(media["audio_path"])

        # Stage 5: assemble the result payload
        progress(0.9, desc="Consolidating results...")
        for stamp, image_path in frames.items():
            entry = {
                "timestamp": stamp,
                "description": captions.get(stamp, "No description available"),
                "image_base64": "",
            }
            if include_frame_data and os.path.exists(image_path):
                with open(image_path, "rb") as image_file:
                    entry["image_base64"] = base64.b64encode(image_file.read()).decode("utf-8")
            report["frame_analysis"].append(entry)

        report["audio_transcription"] = transcript
        audio_path = media["audio_path"]
        if include_audio_data and os.path.exists(audio_path):
            with open(audio_path, "rb") as audio_file:
                report["audio_base64"] = base64.b64encode(audio_file.read()).decode("utf-8")

        report["status"] = "success"
        report["message"] = "Analysis complete!"
        progress(1.0, desc="Analysis complete!")
        yield json.dumps(report, indent=2, ensure_ascii=False)
    except Exception as exc:
        # Any failure is turned into an error payload with all data fields emptied.
        report.update(
            status="error",
            message=f"Processing error: {str(exc)}",
            frame_analysis=[],
            audio_transcription="",
            audio_base64="",
        )
        progress(1.0, desc="Analysis failed!")
        yield json.dumps(report, indent=2, ensure_ascii=False)
# The path where the cookie.txt file is saved
# NOTE(review): this name is not referenced anywhere else in the visible part of the file.
working_cookies_file_path = "/home/mcp_user/app_srv/cookies.txt"
# Global variable to store the path to the last temporary Gradio file
# (set by upload_cookies_file, reset to None by clear_cookies_file after a delete attempt).
gradio_temp_cookies_file_path = None
def upload_cookies_file(file):
    """Remember the path of the uploaded cookies file and return a status message.

    ``file`` is the uploaded-file object (only its ``name`` attribute is read) or
    ``None`` when nothing was selected. The path is stored in the module-level
    ``gradio_temp_cookies_file_path``; the file itself is neither copied nor read.
    Any exception is converted into an error message instead of being raised.
    """
    global gradio_temp_cookies_file_path

    if file is None:
        return "Please first select a cookie file to upload."

    try:
        # Only the path of the Gradio temporary file is kept
        temp_path = file.name
        gradio_temp_cookies_file_path = temp_path
        return "".join((
            "File uploaded successfully.\n",
            f"Path to the Gradio temporary file (used as cookies): {temp_path}.",
        ))
    except Exception as exc:
        kind = type(exc).__name__
        return f"Error occurred during file upload processing: {kind}: {exc}"
def clear_cookies_file():
    """Delete the remembered Gradio temporary cookies file and return a status message.

    Reads the module-level ``gradio_temp_cookies_file_path``:
    - ``None``: nothing was uploaded (or it was already cleared);
    - path set and the file exists: the file is removed; the stored path is reset
      to ``None`` whether or not the removal succeeded;
    - path set but the file is gone: only a notice is returned and the stored
      path is left as it is.
    """
    global gradio_temp_cookies_file_path
    tracked = gradio_temp_cookies_file_path

    if tracked is None:
        return "Path to the Gradio temporary file was unknown (no file uploaded yet or already cleared)."

    if not (tracked and os.path.exists(tracked)):
        # Path is known but the file has disappeared (possibly auto-deleted by Gradio)
        return (
            f"Temporary Gradio file ({os.path.basename(tracked)}) no longer exists "
            "(might have been auto-deleted by Gradio)."
        )

    # The temporary file is known and present: try to delete it
    try:
        os.remove(tracked)
        outcome = f"Temporary Gradio file ({os.path.basename(tracked)}) was successfully deleted."
    except Exception as exc:
        outcome = (
            f"Error deleting a temporary Gradio file ({os.path.basename(tracked)}): "
            f"{type(exc).__name__}: {exc}."
        )
    finally:
        # Forget the path even if deletion failed, to avoid repeated attempts
        gradio_temp_cookies_file_path = None
    return outcome
# Gradio UI definition: builds the `demo` Blocks app with the inputs (URL, prompt, cookies
# upload, quality, frame interval, include-data checkboxes), a progress textbox, two trigger
# buttons and three result tabs (Video / Audio / JSON), then wires the event handlers.
# NOTE(review): leading indentation is missing in this copy of the file, so the exact nesting
# of the `with` containers below cannot be confirmed from the text alone.
with gr.Blocks(title="Video Analysis Tool",) as demo:
gr.Markdown(gui_header_element)
with gr.Row():
youtube_url = gr.Textbox(
label="YouTube Video URL",
value="https://www.youtube.com/watch?v=FK3dav4bA4s&t=36s",
lines=1,
scale=5
)
with gr.Row():
prompt = gr.Textbox(
label="Analysis Prompt",
value=DEFAULT_PROMPT,
lines=3,
scale=4
)
with gr.Column(scale=2, min_width=200):
file_input = gr.File(label="Select a cookie file to upload", file_count="single", height=263)
output_message = gr.Textbox(label="Status of uploading file with cookies")
upload_cookies_file_button = gr.Button("Save file with cookies")
with gr.Column(scale=2, min_width=200):
quality = gr.Dropdown(
label="Video Quality",
choices=[144, 240, 360, 480, 720, 1080, 1440, 2160],
value=720
)
time_step = gr.Slider(
label="Frame Interval (seconds)",
minimum=0.5,
maximum=30,
step=0.5,
value=30
)
include_audio_data = gr.Checkbox(
label="Include Audio Data (MP3) in Results", value=False
)
include_frame_data = gr.Checkbox(
label="Include Frame Data (JPG) in Results", value=False
)
# Read-only textbox used as the progress display target (see show_progress_on=t1 below).
t1 = gr.Textbox(value="Waiting for task...",label="Task Progress", show_label=True, lines=3, interactive=False)
# Button to create an MCP server point
# It is created with visible=False; its click handler is registered under api_name="analyze_video_data".
submit_btn = gr.Button("Start Video Analysis (No Progress Bar, for MCP Server use)", variant="primary", visible=False)
# Analyze button with progress bar
submit_btn_with_progress = gr.Button("Analyze Video", variant="secondary")
with gr.Tabs() as results_tabs:
with gr.TabItem("Video"):
video_output_html = gr.HTML(label="Video Frames Analysis", elem_id="video-output-html")
with gr.TabItem("Audio"):
audio_player_output = gr.Audio(label="Play Audio", type="filepath", render=True)
audio_transcription_output = gr.Textbox(label="Audio Transcription", lines=10)
with gr.TabItem("JSON"):
results_json_viewer = gr.JSON(
label="Raw Analysis Results (JSON)",
elem_classes=["output-box", "results-output"],
)
# Holds the JSON string produced by the analysis step so the chained .then() callbacks can read it.
raw_json_output = gr.State()
# Logic for normal button (without progress bar), this button becomes the MCP server point.
# NOTE: `prompt` is not among the inputs here; analyze_video_data takes no prompt parameter.
# After the analysis, three chained steps render the same JSON as HTML, audio/transcription and parsed JSON.
submit_btn.click(
fn=analyze_video_data,
inputs=[youtube_url, quality, time_step, include_audio_data, include_frame_data],
outputs=[raw_json_output],
api_name="analyze_video_data",
show_api=True
).then(
fn=get_video_html_from_json,
inputs=[raw_json_output],
outputs=[video_output_html],
show_api=False
).then(
fn=get_audio_data_from_json,
inputs=[raw_json_output],
outputs=[audio_transcription_output, audio_player_output],
show_api=False
).then(
fn=lambda x: json.loads(x),
inputs=[raw_json_output],
outputs=[results_json_viewer],
show_api=False
)
# Logic for button with progress bar
# Same rendering chain as above, but the analysis step also receives the user-edited prompt.
submit_btn_with_progress.click(
fn=analyze_video_data_with_progress_wrapper,
inputs=[youtube_url, prompt, quality, time_step, include_audio_data, include_frame_data],
outputs=[raw_json_output],
api_name="analyze_video_data_with_progress_button",
show_progress_on=t1,
show_api=False
).then(
fn=get_video_html_from_json,
inputs=[raw_json_output],
outputs=[video_output_html],
show_api=False
).then(
fn=get_audio_data_from_json,
inputs=[raw_json_output],
outputs=[audio_transcription_output, audio_player_output],
show_api=False
).then(
fn=lambda x: json.loads(x),
inputs=[raw_json_output],
outputs=[results_json_viewer],
show_api=False
)
# Logic of processing cookies
# NOTE(review): neither analysis callback above lists the cookie file among its inputs.
upload_cookies_file_button.click(
fn=upload_cookies_file,
inputs=file_input,
outputs=output_message,
show_api=False
)
# Clearing the file widget triggers deletion of the remembered temporary cookies file.
file_input.clear(
fn=clear_cookies_file,
inputs=None,
outputs=output_message,
show_api=False
)
# Script entry point: launch the app, passing mcp_server=True to Gradio's launch().
if __name__ == "__main__":
demo.launch(mcp_server=True)