Artyom Boyko committed on
Commit 5e4771d · 1 Parent(s): 8e0748d

Testing the alpha version of the app.

Files changed (2)
  1. app_srv/app_srv.py +151 -31
  2. app_srv/video_processing.py +2 -4
app_srv/app_srv.py CHANGED
@@ -1,31 +1,151 @@
- import gradio as gr
- import torch
- from downloader import download_youtube_video
- from video_processing import extract_frames_with_timestamps, generate_frame_descriptions
- from audio_processing import transcribe_audio
- from model_api import clear_gpu_cache, get_device_and_dtype
-
- # Detect CUDA
- selected_device, selected_dtype = get_device_and_dtype()
-
- def describe_video(youtube_video_url: str, temp_dir: str = None, quality: int = 720, video_time_step: float = 2, prompt: str = None):
-
-     video_data = download_youtube_video(youtube_video_url, base_dir=temp_dir, video_quality=quality)
-     frames = extract_frames_with_timestamps(video_path=video_data['video_path'], output_dir=video_data['data_path'], time_step=video_time_step)
-     video_description = generate_frame_descriptions(frames_dict=frames, custom_prompt=prompt, device=selected_device, torch_dtype=selected_dtype)
-     audio_text = transcribe_audio(video_data['audio_path'])
-
-     return video_description, audio_text
-
- # Launch the application
- if __name__ == "__main__":
-
-     url = "https://www.youtube.com/watch?v=FK3dav4bA4s&t=1s"
-     text = "Count the tigers of different species in the frame. Return only number in your answer. ANSWER:"
-
-
-
-     video, audio = describe_video(url, temp_dir="./app_srv/temp", quality=720, video_time_step=10, prompt=text)
-
-     print(video)
-     print(audio)
+ import gradio as gr
+ import torch
+ import os
+ from PIL import Image
+ import base64
+ from io import BytesIO
+ from pathlib import Path
+ from downloader import download_youtube_video
+ from video_processing import extract_frames_with_timestamps, generate_frame_descriptions
+ from audio_processing import transcribe_audio
+ from model_api import get_device_and_dtype
+
+ # Initialize the device and data type
+ device, dtype = get_device_and_dtype()
+
+ # Default prompt
+ DEFAULT_PROMPT = "Analyze the frame, describe what objects are in the frame, how many there are, the background and the action taking place."
+
+ def process_video(youtube_url: str, prompt: str, quality: int, time_step: float):
+     """Main video processing function."""
+     try:
+
+         # 1. Download the video
+         video_data = download_youtube_video(
+             url=youtube_url,
+             video_quality=quality
+         )
+
+         # 2. Extract frames using CUDA
+         frames = extract_frames_with_timestamps(
+             video_path=video_data['video_path'],
+             output_dir=video_data['data_path'],
+             time_step=time_step,
+             hw_device="cuda"
+         )
+
+         # 3. Generate frame descriptions
+         descriptions = generate_frame_descriptions(
+             frames_dict=frames,
+             custom_prompt=prompt,
+             device=device,
+             torch_dtype=dtype
+         )
+
+         # 4. Transcribe the audio
+         transcription = transcribe_audio(video_data['audio_path'])
+
+         # 5. Format the results
+         results_html = []
+         for timestamp, frame_path in frames.items():
+             # Get the description for the current frame
+             frame_desc = descriptions.get(timestamp, "No description available")
+
+             # Process the image
+             if os.path.exists(frame_path):
+                 with Image.open(frame_path) as img:
+                     img.thumbnail((400, 400))
+                     buffered = BytesIO()
+                     img.save(buffered, format="JPEG", quality=85)
+                     img_base64 = base64.b64encode(buffered.getvalue()).decode()
+                     img_html = f'<img src="data:image/jpeg;base64,{img_base64}" style="max-height:300px; border-radius:5px; border:1px solid #ddd;">'
+             else:
+                 img_html = '<div style="color:red; padding:10px;">Image not found</div>'
+
+             # Build the HTML block for this frame
+             frame_html = f"""
+             <div style="border:1px solid #e0e0e0; border-radius:8px; padding:15px; margin-bottom:20px; background:#f8f8f8;">
+                 <div style="display:flex; gap:20px; align-items:flex-start;">
+                     <div style="flex:1; min-width:300px; display:flex; justify-content:center; align-items:center;">
+                         {img_html}
+                     </div>
+                     <div style="flex:2;">
+                         <h3 style="margin-top:0; color:#222; font-size:16px; font-weight:600;">Timestamp: {timestamp}</h3>
+                         <div style="background:#fff; padding:15px; border-radius:6px; border-left:4px solid #4285f4;
+                                     color:#333; font-size:14px; line-height:1.5; box-shadow:0 1px 3px rgba(0,0,0,0.1);">
+                             {frame_desc}
+                         </div>
+                     </div>
+                 </div>
+             </div>
+             """
+             results_html.append(frame_html)
+
+         return "\n".join(results_html), transcription
+
+     except Exception as e:
+         return f"❌ Processing error: {str(e)}", ""
+
+ # Build the Gradio interface
+ with gr.Blocks(title="Video Analysis Tool", css="""
+     .gradio-container {max-width: 1200px !important}
+     .frame-results {max-height: 70vh; overflow-y: auto; padding-right:10px;}
+     .output-box {border-radius: 8px !important; margin-top:15px;}
+     .audio-output {background:#f8f8f8 !important; padding:15px !important;}
+     h1 {color: #1a73e8 !important;}
+ """) as demo:
+
+     gr.Markdown("""
+     # 🎥 Video Analysis Tool
+     Analyze YouTube videos - get frame-by-frame descriptions with timestamps
+     """)
+
+     with gr.Row():
+         with gr.Column(scale=1, min_width=400):
+             youtube_url = gr.Textbox(
+                 label="YouTube Video URL",
+                 value="https://www.youtube.com/watch?v=FK3dav4bA4s&t=1s",
+                 lines=1
+             )
+             prompt = gr.Textbox(
+                 label="Analysis Prompt",
+                 value=DEFAULT_PROMPT,
+                 lines=5,
+                 max_lines=10
+             )
+             with gr.Row():
+                 quality = gr.Dropdown(
+                     label="Video Quality",
+                     choices=[144, 240, 360, 480, 720, 1080, 1440, 2160],
+                     value=720
+                 )
+                 time_step = gr.Slider(
+                     label="Frame Interval (seconds)",
+                     minimum=0.5,
+                     maximum=30,
+                     step=0.5,
+                     value=2
+                 )
+             submit_btn = gr.Button("Analyze Video", variant="primary")
+
+         with gr.Column(scale=2):
+             video_output = gr.HTML(
+                 label="Frame Analysis Results",
+                 elem_classes=["frame-results", "output-box"]
+             )
+             audio_output = gr.Textbox(
+                 label="Audio Transcription",
+                 interactive=False,
+                 lines=10,
+                 max_lines=15,
+                 elem_classes=["output-box", "audio-output"]
+             )
+
+     submit_btn.click(
+         fn=process_video,
+         inputs=[youtube_url, prompt, quality, time_step],
+         outputs=[video_output, audio_output]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
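
For anyone reviewing the alpha, here is a minimal sketch of the helper interfaces that `process_video` relies on, inferred purely from its call sites in this diff; the real signatures live in `downloader.py`, `video_processing.py`, `audio_processing.py`, and `model_api.py` and may differ in parameter names and return types.

```python
# Inferred helper interfaces (sketch only; check the real modules for exact signatures).
from typing import Dict, Tuple
import torch

def get_device_and_dtype() -> Tuple[str, torch.dtype]:
    """Assumed to return something like ("cuda", torch.float16) or ("cpu", torch.float32)."""
    ...

def download_youtube_video(url: str, video_quality: int = 720) -> Dict[str, str]:
    """Assumed to return at least 'video_path', 'audio_path' and 'data_path' keys,
    since process_video indexes the result with those names."""
    ...

def extract_frames_with_timestamps(video_path: str, output_dir: str,
                                   time_step: float, hw_device: str = "cuda") -> Dict[str, str]:
    """Assumed to map a timestamp key to the saved frame's file path."""
    ...

def generate_frame_descriptions(frames_dict: Dict[str, str], custom_prompt: str,
                                device: str, torch_dtype: torch.dtype) -> Dict[str, str]:
    """Assumed to map the same timestamp keys to generated descriptions."""
    ...

def transcribe_audio(audio_path: str) -> str:
    """Assumed to return the transcript as plain text."""
    ...
```

In particular, `process_video` assumes the frames and descriptions dictionaries share the same timestamp keys, since it looks up `descriptions.get(timestamp, ...)` for every extracted frame.
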
app_srv/video_processing.py CHANGED
@@ -153,7 +153,5 @@ def generate_frame_descriptions(frames_dict: Dict, custom_prompt: str = None, de
  if __name__ == "__main__":
      video_url = "https://www.youtube.com/watch?v=L1vXCYZAYYM"
      video_data = download_youtube_video(video_url)
-     frames = extract_frames_with_timestamps(video_path=video_data['video_path'], output_dir=video_data['data_path'], time_step=5)
-     video_description = generate_frame_descriptions(frames)
-     print(type(video_description))
-     print(video_description)
+     frames = extract_frames_with_timestamps(video_path=video_data['video_path'], output_dir=video_data['data_path'], time_step=10)
+     print(frames)
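
As an illustration of what the trimmed-down `__main__` smoke test might print, assuming `extract_frames_with_timestamps` maps timestamps to saved frame paths (the keying that `process_video` in app_srv.py depends on), the output would look roughly like the hypothetical example below; the actual key format and paths are defined in video_processing.py.

```python
# Hypothetical shape of the `frames` mapping printed by the smoke test.
# Keys and paths are illustrative only; the real format is determined by
# extract_frames_with_timestamps in video_processing.py.
frames = {
    "00:00:00": "data/frame_0000.jpg",
    "00:00:10": "data/frame_0001.jpg",
    "00:00:20": "data/frame_0002.jpg",
}
print(frames)
```
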