Artyom Boyko committed on
Commit 5e4771d · 1 Parent(s): 8e0748d

Testing the alpha version of the app.

Files changed (2)
  1. app_srv/app_srv.py +151 -31
  2. app_srv/video_processing.py +2 -4
app_srv/app_srv.py CHANGED
@@ -1,31 +1,151 @@
- import gradio as gr
- import torch
- from downloader import download_youtube_video
- from video_processing import extract_frames_with_timestamps, generate_frame_descriptions
- from audio_processing import transcribe_audio
- from model_api import clear_gpu_cache, get_device_and_dtype
-
- # Detect CUDA
- selected_device, selected_dtype = get_device_and_dtype()
-
- def describe_video(youtube_video_url: str, temp_dir: str = None, quality: int = 720, video_time_step: float = 2, prompt: str = None):
-
-     video_data = download_youtube_video(youtube_video_url, base_dir=temp_dir, video_quality=quality)
-     frames = extract_frames_with_timestamps(video_path=video_data['video_path'], output_dir=video_data['data_path'], time_step=video_time_step)
-     video_description = generate_frame_descriptions(frames_dict=frames, custom_prompt=prompt, device=selected_device, torch_dtype=selected_dtype)
-     audio_text = transcribe_audio(video_data['audio_path'])
-
-     return video_description, audio_text
-
- # Launch the application
- if __name__ == "__main__":
-
-     url = "https://www.youtube.com/watch?v=FK3dav4bA4s&t=1s"
-     text = "Count the tigers of different species in the frame. Return only number in your answer. ANSWER:"
-
-
-
-     video, audio = describe_video(url, temp_dir="./app_srv/temp", quality=720, video_time_step=10, prompt=text)
-
-     print(video)
-     print(audio)
+ import gradio as gr
+ import torch
+ import os
+ from PIL import Image
+ import base64
+ from io import BytesIO
+ from pathlib import Path
+ from downloader import download_youtube_video
+ from video_processing import extract_frames_with_timestamps, generate_frame_descriptions
+ from audio_processing import transcribe_audio
+ from model_api import get_device_and_dtype
+
+ # Initialize the device and data type
+ device, dtype = get_device_and_dtype()
+
+ # Default prompt
+ DEFAULT_PROMPT = "Analyze the frame, describe what objects are in the frame, how many there are, the background and the action taking place."
+
+ def process_video(youtube_url: str, prompt: str, quality: int, time_step: float):
+     """Main video processing function."""
+     try:
+
+         # 1. Download the video
+         video_data = download_youtube_video(
+             url=youtube_url,
+             video_quality=quality
+         )
+
+         # 2. Extract frames using CUDA
+         frames = extract_frames_with_timestamps(
+             video_path=video_data['video_path'],
+             output_dir=video_data['data_path'],
+             time_step=time_step,
+             hw_device="cuda"
+         )
+
+         # 3. Generate frame descriptions
+         descriptions = generate_frame_descriptions(
+             frames_dict=frames,
+             custom_prompt=prompt,
+             device=device,
+             torch_dtype=dtype
+         )
+
+         # 4. Transcribe the audio
+         transcription = transcribe_audio(video_data['audio_path'])
+
+         # 5. Format the results
+         results_html = []
+         for timestamp, frame_path in frames.items():
+             # Get the description for the current frame
+             frame_desc = descriptions.get(timestamp, "No description available")
+
+             # Process the image
+             if os.path.exists(frame_path):
+                 with Image.open(frame_path) as img:
+                     img.thumbnail((400, 400))
+                     buffered = BytesIO()
+                     img.save(buffered, format="JPEG", quality=85)
+                     img_base64 = base64.b64encode(buffered.getvalue()).decode()
+                     img_html = f'<img src="data:image/jpeg;base64,{img_base64}" style="max-height:300px; border-radius:5px; border:1px solid #ddd;">'
+             else:
+                 img_html = '<div style="color:red; padding:10px;">Image not found</div>'
+
+             # Build the HTML block for this frame
+             frame_html = f"""
+             <div style="border:1px solid #e0e0e0; border-radius:8px; padding:15px; margin-bottom:20px; background:#f8f8f8;">
+                 <div style="display:flex; gap:20px; align-items:flex-start;">
+                     <div style="flex:1; min-width:300px; display:flex; justify-content:center; align-items:center;">
+                         {img_html}
+                     </div>
+                     <div style="flex:2;">
+                         <h3 style="margin-top:0; color:#222; font-size:16px; font-weight:600;">Timestamp: {timestamp}</h3>
+                         <div style="background:#fff; padding:15px; border-radius:6px; border-left:4px solid #4285f4;
+                                     color:#333; font-size:14px; line-height:1.5; box-shadow:0 1px 3px rgba(0,0,0,0.1);">
+                             {frame_desc}
+                         </div>
+                     </div>
+                 </div>
+             </div>
+             """
+             results_html.append(frame_html)
+
+         return "\n".join(results_html), transcription
+
+     except Exception as e:
+         return f"❌ Processing error: {str(e)}", ""
+
+ # Build the Gradio interface
+ with gr.Blocks(title="Video Analysis Tool", css="""
+     .gradio-container {max-width: 1200px !important}
+     .frame-results {max-height: 70vh; overflow-y: auto; padding-right:10px;}
+     .output-box {border-radius: 8px !important; margin-top:15px;}
+     .audio-output {background:#f8f8f8 !important; padding:15px !important;}
+     h1 {color: #1a73e8 !important;}
+ """) as demo:
+
+     gr.Markdown("""
+     # 🎥 Video Analysis Tool
+     Analyze YouTube videos - get frame-by-frame descriptions with timestamps
+     """)
+
+     with gr.Row():
+         with gr.Column(scale=1, min_width=400):
+             youtube_url = gr.Textbox(
+                 label="YouTube Video URL",
+                 value="https://www.youtube.com/watch?v=FK3dav4bA4s&t=1s",
+                 lines=1
+             )
+             prompt = gr.Textbox(
+                 label="Analysis Prompt",
+                 value=DEFAULT_PROMPT,
+                 lines=5,
+                 max_lines=10
+             )
+             with gr.Row():
+                 quality = gr.Dropdown(
+                     label="Video Quality",
+                     choices=[144, 240, 360, 480, 720, 1080, 1440, 2160],
+                     value=720
+                 )
+                 time_step = gr.Slider(
+                     label="Frame Interval (seconds)",
+                     minimum=0.5,
+                     maximum=30,
+                     step=0.5,
+                     value=2
+                 )
+             submit_btn = gr.Button("Analyze Video", variant="primary")
+
+         with gr.Column(scale=2):
+             video_output = gr.HTML(
+                 label="Frame Analysis Results",
+                 elem_classes=["frame-results", "output-box"]
+             )
+             audio_output = gr.Textbox(
+                 label="Audio Transcription",
+                 interactive=False,
+                 lines=10,
+                 max_lines=15,
+                 elem_classes=["output-box", "audio-output"]
+             )
+
+     submit_btn.click(
+         fn=process_video,
+         inputs=[youtube_url, prompt, quality, time_step],
+         outputs=[video_output, audio_output]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
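
For anyone reviewing the alpha, here is a minimal sketch of the helper interfaces that `process_video` relies on, inferred purely from its call sites in this diff; the real signatures live in `downloader.py`, `video_processing.py`, `audio_processing.py`, and `model_api.py` and may differ in parameter names and return types.

```python
# Inferred helper interfaces (sketch only; check the real modules for exact signatures).
from typing import Dict, Tuple
import torch

def get_device_and_dtype() -> Tuple[str, torch.dtype]:
    """Assumed to return something like ("cuda", torch.float16) or ("cpu", torch.float32)."""
    ...

def download_youtube_video(url: str, video_quality: int = 720) -> Dict[str, str]:
    """Assumed to return at least 'video_path', 'audio_path' and 'data_path' keys,
    since process_video indexes the result with those names."""
    ...

def extract_frames_with_timestamps(video_path: str, output_dir: str,
                                   time_step: float, hw_device: str = "cuda") -> Dict[str, str]:
    """Assumed to map a timestamp key to the saved frame's file path."""
    ...

def generate_frame_descriptions(frames_dict: Dict[str, str], custom_prompt: str,
                                device: str, torch_dtype: torch.dtype) -> Dict[str, str]:
    """Assumed to map the same timestamp keys to generated descriptions."""
    ...

def transcribe_audio(audio_path: str) -> str:
    """Assumed to return the transcript as plain text."""
    ...
```

In particular, `process_video` assumes the frames and descriptions dictionaries share the same timestamp keys, since it looks up `descriptions.get(timestamp, ...)` for every extracted frame.
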
app_srv/video_processing.py CHANGED
@@ -153,7 +153,5 @@ def generate_frame_descriptions(frames_dict: Dict, custom_prompt: str = None, de
  if __name__ == "__main__":
      video_url = "https://www.youtube.com/watch?v=L1vXCYZAYYM"
      video_data = download_youtube_video(video_url)
-     frames = extract_frames_with_timestamps(video_path=video_data['video_path'], output_dir=video_data['data_path'], time_step=5)
-     video_description = generate_frame_descriptions(frames)
-     print(type(video_description))
-     print(video_description)
+     frames = extract_frames_with_timestamps(video_path=video_data['video_path'], output_dir=video_data['data_path'], time_step=10)
+     print(frames)
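
As an illustration of what the trimmed-down `__main__` smoke test might print, assuming `extract_frames_with_timestamps` maps timestamps to saved frame paths (the keying that `process_video` in app_srv.py depends on), the output would look roughly like the hypothetical example below; the actual key format and paths are defined in video_processing.py.

```python
# Hypothetical shape of the `frames` mapping printed by the smoke test.
# Keys and paths are illustrative only; the real format is determined by
# extract_frames_with_timestamps in video_processing.py.
frames = {
    "00:00:00": "data/frame_0000.jpg",
    "00:00:10": "data/frame_0001.jpg",
    "00:00:20": "data/frame_0002.jpg",
}
print(frames)
```
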