Artyom Boyko committed on
Commit
95b1e18
·
1 Parent(s): c867d05

Solving the YouTube problem.

Browse files
app_srv/app_srv.py CHANGED
@@ -2,8 +2,6 @@ import gradio as gr
2
  import torch
3
  import os
4
  import json
5
- import requests # Added for making HTTP requests
6
- import socket # Added for getting hostname
7
 
8
  # Import your modules
9
  from downloader import download_youtube_video
@@ -15,35 +13,14 @@ from model_api import get_device_and_dtype
15
  device, dtype = get_device_and_dtype()
16
 
17
  # Default prompt
18
- DEFAULT_PROMPT = "Analyze the frame, describe what objects are in the frame, how many there are, the background and the action taking place."
19
-
20
- # --- FUNCTION TO GET PUBLIC IP AND HOSTNAME (NOT FOR MCP) ---
21
- def get_public_ip_and_hostname() -> str:
22
- """
23
- Retrieves the public IP address and the hostname of the machine.
24
- This function is intended for display purposes within the Gradio UI
25
- and should NOT be exposed via MCP API.
26
- """
27
- public_ip = "N/A"
28
- hostname = "N/A"
29
-
30
- try:
31
- # Get public IP address
32
- response = requests.get("https://api.ipify.org?format=json", timeout=5)
33
- response.raise_for_status() # Raise an exception for HTTP errors
34
- public_ip = response.json().get("ip", "N/A")
35
- except requests.exceptions.RequestException as e:
36
- print(f"Error getting public IP: {e}")
37
- public_ip = f"Error: {e}"
38
-
39
- try:
40
- # Get hostname
41
- hostname = socket.gethostname()
42
- except Exception as e:
43
- print(f"Error getting hostname: {e}")
44
- hostname = f"Error: {e}"
45
-
46
- return f"Public IP: {public_ip} | Hostname: {hostname}"
47
 
48
  # --- OPTIMIZED FUNCTION, RETURNING JSON STRING ---
49
  def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: float) -> str:
@@ -62,8 +39,8 @@ def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: f
62
  str: A JSON formatted string containing the analysis results.
63
  The JSON structure includes:
64
  - "status": "success" if the analysis was successful, "error" otherwise.
65
- - "message": A brief description of the outcome (empty string for success,
66
- or an error message for error).
67
  - "frame_analysis": A list of dictionaries, where each dictionary represents a frame
68
  and contains "timestamp" and "description".
69
  - "audio_transcription": The transcribed text of the video's audio.
@@ -74,21 +51,25 @@ def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: f
74
  """
75
 
76
  results = {
77
- "status": "success", # Default to success
78
- "message": "", # Default message is empty for success
79
  "frame_analysis": [],
80
  "audio_transcription": ""
81
  }
82
 
83
  try:
 
 
 
 
84
  # 1. Download video
85
  video_data = download_youtube_video(
86
  url=youtube_url,
87
- video_quality=quality
 
88
  )
89
 
90
  # 2. Extract frames
91
- # frames_dict: {timestamp: path_to_frame_image}
92
  frames_dict = extract_frames_with_timestamps(
93
  video_path=video_data['video_path'],
94
  output_dir=video_data['data_path'],
@@ -99,7 +80,7 @@ def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: f
99
  # 3. Generate descriptions for frames
100
  descriptions = generate_frame_descriptions(
101
  frames_dict=frames_dict,
102
- custom_prompt=prompt,
103
  device=device,
104
  torch_dtype=dtype
105
  )
@@ -116,6 +97,8 @@ def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: f
116
  })
117
 
118
  results["audio_transcription"] = transcription_text
 
 
119
 
120
  # Return formatted JSON string
121
  return json.dumps(results, indent=2, ensure_ascii=False)
@@ -124,12 +107,12 @@ def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: f
124
  error_message = f"Processing error: {str(e)}"
125
  print(f"An error occurred during video analysis: {e}") # For debugging
126
 
127
- results["status"] = "error" # Set status to error
128
- results["message"] = error_message # Set error message
129
- results["frame_analysis"] = [] # Clear frame results on error
130
- results["audio_transcription"] = "" # Clear transcription on error
131
 
132
- # In case of error, return JSON string with error details
133
  return json.dumps(results, indent=2, ensure_ascii=False)
134
 
135
  # Create Gradio interface
@@ -138,12 +121,6 @@ with gr.Blocks(title="Video Analysis Tool", css="""
138
  .output-box {border-radius: 8px !important; margin-top:15px;}
139
  .results-output {background:#f8f8f8 !important; padding:15px !important;}
140
  h1 {color: #1a73e8 !important;}
141
- .ip-info {
142
- font-size: 0.9em;
143
- color: #666;
144
- margin-top: -15px; /* Adjust as needed to pull it closer to the title */
145
- margin-bottom: 10px;
146
- }
147
  """) as demo:
148
 
149
  gr.Markdown("""
@@ -151,21 +128,7 @@ with gr.Blocks(title="Video Analysis Tool", css="""
151
  Analyze YouTube videos - get frame-by-frame descriptions with timestamps and audio transcription.
152
  """)
153
 
154
- # NEW: Display Public IP and Hostname
155
- # We use a gr.Markdown component to display the text.
156
- # The key here is that get_public_ip_and_hostname is NOT directly an input/output
157
- # of a button. It's called once when the app loads, or its output is static.
158
- # To prevent it from being in MCP API, we typically don't expose it via gr.Interface
159
- # or explicitly set show_api=False for the component if it were interactive.
160
- # Here, it's a simple call rendered in Markdown, so it won't be exposed.
161
- gr.Markdown(
162
- f"<div class='ip-info'>{get_public_ip_and_hostname()}</div>",
163
- # This component itself does not expose an API endpoint if it's just static Markdown
164
- # or updated via a gr.State and not directly via a `fn` in `click` with `show_api=True`.
165
- # The key is that the function `get_public_ip_and_hostname` is called
166
- # during the UI definition, not as an API endpoint.
167
- )
168
-
169
  with gr.Row():
170
  youtube_url = gr.Textbox(
171
  label="YouTube Video URL",
 
2
  import torch
3
  import os
4
  import json
 
 
5
 
6
  # Import your modules
7
  from downloader import download_youtube_video
 
13
  device, dtype = get_device_and_dtype()
14
 
15
  # Default prompt
16
+ DEFAULT_PROMPT = """
17
+ Present the frame analysis in the following format, focusing on the details in the frame:
18
+ FRAME: {timestamp} \n
19
+ OBJECTS: List of objects with their count, for example: Bengal tiger - 1, Volvo car - 1, Person - 2 (male, female). Don't count the mention of an object in the text with the video as a separate object. \n
20
+ TEXT: This is where you place the text that is present in the frame. Just bring the text present in the frame from left to right, top to bottom. \n
21
+ BACKGROUND: Description of background and surroundings, e.g.: Muddy brown water. A road in the distance. An abandoned building on the horizon. Describe only what is visible. \n
22
+ ACTION: A detailed description of what is happening in the frame, for example: A Bengal tiger is swimming in murky water, its head and part of its back are visible above the surface. A blue Volvo car is driving along the road in the distance. A part of a tree is visible in the right part of the frame.
23
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # --- OPTIMIZED FUNCTION, RETURNING JSON STRING ---
26
  def analyze_video_data(youtube_url: str, prompt: str, quality: int, time_step: float) -> str:
 
39
  str: A JSON formatted string containing the analysis results.
40
  The JSON structure includes:
41
  - "status": "success" if the analysis was successful, "error" otherwise.
42
+ - "message": A brief description of the outcome (e.g., "Analysis completed successfully."
43
+ or an error message).
44
  - "frame_analysis": A list of dictionaries, where each dictionary represents a frame
45
  and contains "timestamp" and "description".
46
  - "audio_transcription": The transcribed text of the video's audio.
 
51
  """
52
 
53
  results = {
54
+ "status": "success",
55
+ "message": "Analysis completed successfully.",
56
  "frame_analysis": [],
57
  "audio_transcription": ""
58
  }
59
 
60
  try:
61
+
62
+ # YouTube
63
+ cookies = os.getenv("YOUTUBE_COOKIES")
64
+
65
  # 1. Download video
66
  video_data = download_youtube_video(
67
  url=youtube_url,
68
+ video_quality=quality,
69
+ youtube_cookies=cookies
70
  )
71
 
72
  # 2. Extract frames
 
73
  frames_dict = extract_frames_with_timestamps(
74
  video_path=video_data['video_path'],
75
  output_dir=video_data['data_path'],
 
80
  # 3. Generate descriptions for frames
81
  descriptions = generate_frame_descriptions(
82
  frames_dict=frames_dict,
83
+ custom_prompt=prompt, # Now `prompt` can contain the {timestamp} placeholder
84
  device=device,
85
  torch_dtype=dtype
86
  )
 
97
  })
98
 
99
  results["audio_transcription"] = transcription_text
100
+
101
+ print("Video processing complete")
102
 
103
  # Return formatted JSON string
104
  return json.dumps(results, indent=2, ensure_ascii=False)
 
107
  error_message = f"Processing error: {str(e)}"
108
  print(f"An error occurred during video analysis: {e}") # For debugging
109
 
110
+ results["status"] = "error"
111
+ results["message"] = error_message
112
+ results["frame_analysis"] = []
113
+ results["audio_transcription"] = ""
114
 
115
+ # In case of error, return JSON string with error details
116
  return json.dumps(results, indent=2, ensure_ascii=False)
117
 
118
  # Create Gradio interface
 
121
  .output-box {border-radius: 8px !important; margin-top:15px;}
122
  .results-output {background:#f8f8f8 !important; padding:15px !important;}
123
  h1 {color: #1a73e8 !important;}
 
 
 
 
 
 
124
  """) as demo:
125
 
126
  gr.Markdown("""
 
128
  Analyze YouTube videos - get frame-by-frame descriptions with timestamps and audio transcription.
129
  """)
130
 
131
+ # Top row: Video URL, prompt, analysis parameters, and analyze button
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  with gr.Row():
133
  youtube_url = gr.Textbox(
134
  label="YouTube Video URL",
app_srv/audio_processing.py CHANGED
@@ -49,5 +49,5 @@ if __name__ == "__main__":
49
 
50
  selected_device, selected_dtype = get_device_and_dtype()
51
 
52
- result = transcribe_audio("/workspaces/Video_Analyser/app_srv/temp/a28af289-377d-468d-b0eb-ed0f7dcd2ab3/audio.mp3.mp3", selected_device, selected_dtype)
53
  print(result)
 
49
 
50
  selected_device, selected_dtype = get_device_and_dtype()
51
 
52
+ result = transcribe_audio("/workspaces/Video_Analyser/app_srv/downloads/45677153-510d-4f47-95ee-c1b4b0843433/audio.mp3.mp3", selected_device, selected_dtype)
53
  print(result)
app_srv/downloader.py CHANGED
@@ -9,7 +9,8 @@ from datetime import datetime
9
 
10
  def download_youtube_video(url: str,
11
  base_dir: str = None,
12
- video_quality: int = 720) -> Dict[str, str]:
 
13
  """
14
  Downloads video and audio from YouTube, saving them to a unique GUID folder.
15
  Metadata is saved in JSON format including download datetime and timezone.
@@ -75,6 +76,7 @@ def download_youtube_video(url: str,
75
  'quiet': True,
76
  'no_warnings': True,
77
  'restrict_filenames': True,
 
78
  }
79
 
80
  with yt_dlp.YoutubeDL(video_opts) as ydl:
@@ -90,6 +92,7 @@ def download_youtube_video(url: str,
90
  'preferredcodec': 'mp3',
91
  'preferredquality': '128',
92
  }],
 
93
  }
94
 
95
  with yt_dlp.YoutubeDL(audio_opts) as ydl:
 
9
 
10
  def download_youtube_video(url: str,
11
  base_dir: str = None,
12
+ video_quality: int = 720,
13
+ youtube_cookies: str = "") -> Dict[str, str]:
14
  """
15
  Downloads video and audio from YouTube, saving them to a unique GUID folder.
16
  Metadata is saved in JSON format including download datetime and timezone.
 
76
  'quiet': True,
77
  'no_warnings': True,
78
  'restrict_filenames': True,
79
+ 'cookiefile': youtube_cookies,
80
  }
81
 
82
  with yt_dlp.YoutubeDL(video_opts) as ydl:
 
92
  'preferredcodec': 'mp3',
93
  'preferredquality': '128',
94
  }],
95
+ 'cookiefile': youtube_cookies,
96
  }
97
 
98
  with yt_dlp.YoutubeDL(audio_opts) as ydl:
app_srv/test.ipynb CHANGED
@@ -425,49 +425,10 @@
425
  },
426
  {
427
  "cell_type": "code",
428
- "execution_count": 6,
429
  "id": "e6d0e5fd",
430
  "metadata": {},
431
- "outputs": [
432
- {
433
- "name": "stdout",
434
- "output_type": "stream",
435
- "text": [
436
- "Используемое устройство: cuda:0\n"
437
- ]
438
- },
439
- {
440
- "name": "stderr",
441
- "output_type": "stream",
442
- "text": [
443
- "Device set to use cuda:0\n"
444
- ]
445
- },
446
- {
447
- "name": "stdout",
448
- "output_type": "stream",
449
- "text": [
450
- "Начало транскрипции файла: /workspaces/Video_Analyser/app_srv/temp/a6fba6eb-038e-4f4e-bcb7-f41d87ee1422/audio.mp3.mp3\n"
451
- ]
452
- },
453
- {
454
- "name": "stderr",
455
- "output_type": "stream",
456
- "text": [
457
- "/usr/local/lib/python3.12/dist-packages/transformers/models/whisper/generation_whisper.py:573: FutureWarning: The input name `inputs` is deprecated. Please make sure to use `input_features` instead.\n",
458
- " warnings.warn(\n",
459
- "Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.\n"
460
- ]
461
- },
462
- {
463
- "name": "stdout",
464
- "output_type": "stream",
465
- "text": [
466
- "Транскрипция завершена.\n",
467
- "{0.0: \"with their signature orange fur and black stripes, Tigers have become icons of beauty, power, and the importance of conservation. Tigers have evolved into six subspecies. to six subspecies. The tiger's tail of evolution can be traced back to about two million years ago when the earliest known tiger ancestor left Africa and ventured into Asia. Over time, the big cat split into nine subspecies with six still alive today. The most numerous subspecies is the Bengal tiger, accounting for approximately 50% of the tiger population worldwide. Tigers are the world's largest cats. On average, the big cats weigh about 450 pounds, but the largest is the Siberian tiger subspecies, measuring up to 13 feet long and weighing up to 660 pounds. This extra weight is primarily because of large powerful muscles. Unlike lions, the second largest of the big cats, tigers have more muscle mass and are therefore heavier. Tigers have webbed toes, often living near bodies of water, tigers have adapted for a semi-aquatic lifestyle. They appear to enjoy being in the water, unlike most cats, and are excellent swimmers using their powerful muscles to propel their bodies in the water. They've also evolved to have webbing between their toes. The webbing allows the big cats to push around a greater volume of water with each stroke of their paws. White tigers are a type of Bengal tiger. The classic black and orange coloration of most tigers is caused by the pigments U melanin, which turns for black, and pheomelanin, which turns for black, and pheomelanin, which turns fur orange. The production of pheomelanin is triggered by the gene SLC45A2. White tigers carry a mutated version of this gene which prevents them from producing orange pigmentation. Fewer than 4,000 tigers remain in the wild. 
At the turn of the 20th century, approximately 100,000 tigers roamed the wild, living as far west as Turkey and as far north as Russia, but due to deforestation, human development, and poaching, their range severely decreased and within 100 years, the world's tiger population declined by about 96%. by about 96%. Over the past few decades, programs have been put in place to protect tigers and their habitats with the help of the global community, tiger populations may slowly rebound. slowly rebound. Music. you Thank you. you you you you you you Music playing you Music playing you you you Music playing you you\"}\n"
468
- ]
469
- }
470
- ],
471
  "source": [
472
  "import torch\n",
473
  "from transformers import pipeline\n",
@@ -537,20 +498,10 @@
537
  },
538
  {
539
  "cell_type": "code",
540
- "execution_count": 22,
541
  "id": "ca9a4832",
542
  "metadata": {},
543
- "outputs": [
544
- {
545
- "name": "stderr",
546
- "output_type": "stream",
547
- "text": [
548
- "Device set to use cuda:0\n",
549
- "/usr/local/lib/python3.12/dist-packages/transformers/models/whisper/generation_whisper.py:573: FutureWarning: The input name `inputs` is deprecated. Please make sure to use `input_features` instead.\n",
550
- " warnings.warn(\n"
551
- ]
552
- }
553
- ],
554
  "source": [
555
  "from transformers import pipeline\n",
556
  "import librosa\n",
@@ -571,21 +522,10 @@
571
  },
572
  {
573
  "cell_type": "code",
574
- "execution_count": 23,
575
  "id": "7cd4e28e",
576
  "metadata": {},
577
- "outputs": [
578
- {
579
- "data": {
580
- "text/plain": [
581
- "\" With their signature orange fur and black stripes, tigers have become icons of beauty, power, and the importance of conservation. Tigers have evolved into six subspecies. The tiger's tale of evolution can be traced back to about two million years ago when the earliest known tiger ancestor left Africa and ventured into Asia. Over time, the big cats split into nine subspecies, with six still alive today. The most numerous subspecies is the Bengal tiger, accounting for approximately 50% of the tiger population worldwide. Tigers are the world's largest cats. On average, the big cats weigh about 450 pounds, but the largest is the Siberian tiger subspecies, measuring up to 13 feet long and weighing up to 660 pounds. This extra weight is primarily because of large powerful muscles. Unlike lions, the second largest of the big cats, tigers have more muscle mass and are therefore heavier. Tigers have webbed toes. Even living near bodies of water, tigers have adapted for a semi-aquatic lifestyle. They appear to enjoy being in the water, unlike most cats, and are excellent swimmers, using their powerful muscles to propel their bodies in the water. They've also evolved to have webbing between their toes. The webbing allows the big cats to push around a greater volume of water with each stroke of their paws. White tigers are a type of Bengal tiger. The classic black and orange coloration of most tigers is caused by the pigments U-melanin which turns for black and pheomelanin which turns for orange. The production of pheomelanin is triggered by the gene SLC45A2. White tigers carry a mutated version of this gene which prevents them from producing orange pigmentation. Fewer than 4,000 tigers remain in the wild. At the turn of the 20th century, approximately 100,000 tigers roamed the wild, living as far west as Turkey and as far north as Russia. 
But due to deforestation, human development, and poaching, their range severely decreased, within 100 years, the world's tiger population declined by about 96%. Over the past few decades, programs have been put in place to protect tigers and their habitats. With the help of the global community, tiger populations may slowly rebound. Thank you very much for watching this video, and I'll see you in the next one.\""
582
- ]
583
- },
584
- "execution_count": 23,
585
- "metadata": {},
586
- "output_type": "execute_result"
587
- }
588
- ],
589
  "source": [
590
  "result"
591
  ]
 
425
  },
426
  {
427
  "cell_type": "code",
428
+ "execution_count": null,
429
  "id": "e6d0e5fd",
430
  "metadata": {},
431
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  "source": [
433
  "import torch\n",
434
  "from transformers import pipeline\n",
 
498
  },
499
  {
500
  "cell_type": "code",
501
+ "execution_count": null,
502
  "id": "ca9a4832",
503
  "metadata": {},
504
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
505
  "source": [
506
  "from transformers import pipeline\n",
507
  "import librosa\n",
 
522
  },
523
  {
524
  "cell_type": "code",
525
+ "execution_count": null,
526
  "id": "7cd4e28e",
527
  "metadata": {},
528
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
529
  "source": [
530
  "result"
531
  ]
packages.txt CHANGED
@@ -12,4 +12,5 @@ python3-setuptools
12
  python-is-python3
13
  wget
14
  zlib1g
15
- net-tools
 
 
12
  python-is-python3
13
  wget
14
  zlib1g
15
+ net-tools
16
+ curl
requirements.txt CHANGED
@@ -14,4 +14,5 @@ ipykernel==6.29.5
14
  ipywidgets==8.1.7
15
  yt-dlp==2025.5.22
16
  qwen-vl-utils==0.0.11
17
- librosa==0.11.0
 
 
14
  ipywidgets==8.1.7
15
  yt-dlp==2025.5.22
16
  qwen-vl-utils==0.0.11
17
+ librosa==0.11.0
18
+ gradio_client