Bug fixes
Browse files- .hf-space +4 -1
- Dockerfile +3 -1
- docker-compose.yml +3 -0
- extract_signed_segments_from_annotations.py +37 -214
- flask_app.py +745 -17
- get_transcription_with_amazon.py +47 -8
- templates/player.html +25 -2
.hf-space
CHANGED
@@ -7,8 +7,11 @@ config:
|
|
7 |
- PORT=7860
|
8 |
- SPACE_ID=true
|
9 |
- BYPASS_AUTH=true
|
|
|
|
|
|
|
10 |
resources:
|
11 |
cpu: 1
|
12 |
memory: 1
|
13 |
gpu: null
|
14 |
-
restarts: true
|
|
|
7 |
- PORT=7860
|
8 |
- SPACE_ID=true
|
9 |
- BYPASS_AUTH=true
|
10 |
+
- S3_BUCKET=sorenson-ai-sb-scratch
|
11 |
+
- S3_VIDEO_PREFIX=awilkinson/kylie_dataset_videos_for_alignment_webapp/
|
12 |
+
- USE_S3_FOR_VIDEOS=true
|
13 |
resources:
|
14 |
cpu: 1
|
15 |
memory: 1
|
16 |
gpu: null
|
17 |
+
restarts: true
|
Dockerfile
CHANGED
@@ -16,10 +16,12 @@ COPY . .
|
|
16 |
ENV PYTHONUNBUFFERED=1
|
17 |
ENV PORT=7860
|
18 |
ENV SPACE_ID="true"
|
19 |
-
# Add explicit environment variable to enable authentication bypass for troubleshooting
|
20 |
ENV BYPASS_AUTH="true"
|
21 |
ENV SECRET_KEY="f7290fc27f11dbf14be6cd348638ad62"
|
22 |
ENV DEBUG="True"
|
|
|
|
|
|
|
23 |
|
24 |
# Create necessary directories
|
25 |
RUN mkdir -p data/videos data/annotations data/temp data/word_timestamps data/alignments data/transcripts
|
|
|
16 |
ENV PYTHONUNBUFFERED=1
|
17 |
ENV PORT=7860
|
18 |
ENV SPACE_ID="true"
|
|
|
19 |
ENV BYPASS_AUTH="true"
|
20 |
ENV SECRET_KEY="f7290fc27f11dbf14be6cd348638ad62"
|
21 |
ENV DEBUG="True"
|
22 |
+
ENV S3_BUCKET="sorenson-ai-sb-scratch"
|
23 |
+
ENV S3_VIDEO_PREFIX="awilkinson/kylie_dataset_videos_for_alignment_webapp/"
|
24 |
+
ENV USE_S3_FOR_VIDEOS="true"
|
25 |
|
26 |
# Create necessary directories
|
27 |
RUN mkdir -p data/videos data/annotations data/temp data/word_timestamps data/alignments data/transcripts
|
docker-compose.yml
CHANGED
@@ -13,5 +13,8 @@ services:
|
|
13 |
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
|
14 |
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
|
15 |
- AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION:-us-west-2}
|
|
|
|
|
|
|
16 |
volumes:
|
17 |
- ./data:/app/data
|
|
|
13 |
- AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
|
14 |
- AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
|
15 |
- AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION:-us-west-2}
|
16 |
+
- S3_BUCKET=sorenson-ai-sb-scratch
|
17 |
+
- S3_VIDEO_PREFIX=awilkinson/kylie_dataset_videos_for_alignment_webapp/
|
18 |
+
- USE_S3_FOR_VIDEOS=true
|
19 |
volumes:
|
20 |
- ./data:/app/data
|
extract_signed_segments_from_annotations.py
CHANGED
@@ -1,212 +1,3 @@
|
|
1 |
-
# #!/usr/bin/env python3
|
2 |
-
|
3 |
-
# import json
|
4 |
-
# import os
|
5 |
-
# import pathlib
|
6 |
-
# import subprocess
|
7 |
-
# from dataclasses import dataclass
|
8 |
-
# from decimal import Decimal
|
9 |
-
# from typing import List, Optional, Tuple
|
10 |
-
# import argparse
|
11 |
-
|
12 |
-
# try:
|
13 |
-
# from tqdm import tqdm
|
14 |
-
# except ImportError:
|
15 |
-
# def tqdm(iterable, **kwargs):
|
16 |
-
# return iterable
|
17 |
-
# def write(msg):
|
18 |
-
# print(msg)
|
19 |
-
|
20 |
-
# @dataclass
|
21 |
-
# class VideoClip:
|
22 |
-
# """Represents a video clip with timing information."""
|
23 |
-
# start_time: float
|
24 |
-
# end_time: float
|
25 |
-
# clip_path: str
|
26 |
-
# original_video: str
|
27 |
-
# index: int
|
28 |
-
|
29 |
-
# @property
|
30 |
-
# def duration(self) -> float:
|
31 |
-
# return self.end_time - self.start_time
|
32 |
-
|
33 |
-
# class ClipExtractor:
|
34 |
-
# """Handles extraction of video clips based on annotation timestamps."""
|
35 |
-
|
36 |
-
# def __init__(self, base_dir: str) -> None:
|
37 |
-
# """Initialize with project base directory."""
|
38 |
-
# self.base_dir = base_dir
|
39 |
-
# self.temp_dir = os.path.join(base_dir, "data", "temp")
|
40 |
-
# self.videos_dir = os.path.join(base_dir, "data", "videos")
|
41 |
-
# self.annotations_dir = os.path.join(base_dir, "data", "annotations")
|
42 |
-
# self.metadata_dir = os.path.join(base_dir, "data", "segment_metadata")
|
43 |
-
# os.makedirs(self.temp_dir, exist_ok=True)
|
44 |
-
# os.makedirs(self.metadata_dir, exist_ok=True)
|
45 |
-
|
46 |
-
# def validate_timestamps(self, timestamps: List[float]) -> Tuple[bool, Optional[str]]:
|
47 |
-
# if not timestamps:
|
48 |
-
# return False, "No timestamps found in annotation file."
|
49 |
-
# for i in range(len(timestamps) - 1):
|
50 |
-
# if timestamps[i] >= timestamps[i + 1]:
|
51 |
-
# return False, (f"Invalid timestamp order: {str(round(Decimal(timestamps[i]), 3))} seconds "
|
52 |
-
# f"followed by {str(round(Decimal(timestamps[i + 1]), 3))} seconds")
|
53 |
-
# return True, None
|
54 |
-
|
55 |
-
# def get_video_duration(self, video_path: str) -> float:
|
56 |
-
# try:
|
57 |
-
# cmd = [
|
58 |
-
# "ffprobe",
|
59 |
-
# "-v", "error",
|
60 |
-
# "-show_entries", "format=duration",
|
61 |
-
# "-of", "default=noprint_wrappers=1:nokey=1",
|
62 |
-
# video_path
|
63 |
-
# ]
|
64 |
-
# output = subprocess.check_output(cmd).decode().strip()
|
65 |
-
# return float(output)
|
66 |
-
# except subprocess.CalledProcessError as e:
|
67 |
-
# raise RuntimeError(f"Failed to get video duration. Error: {str(e)}")
|
68 |
-
|
69 |
-
# def extract_clip(self, video_path: str, start_time: float, end_time: float, output_path: str) -> bool:
|
70 |
-
# try:
|
71 |
-
# cmd = [
|
72 |
-
# "ffmpeg",
|
73 |
-
# "-i", video_path,
|
74 |
-
# "-ss", str(start_time),
|
75 |
-
# "-t", str(end_time - start_time),
|
76 |
-
# "-c:v", "libx264",
|
77 |
-
# "-c:a", "aac",
|
78 |
-
# "-y",
|
79 |
-
# output_path
|
80 |
-
# ]
|
81 |
-
# subprocess.run(cmd, check=True, capture_output=True)
|
82 |
-
# if not os.path.exists(output_path):
|
83 |
-
# print(f"Warning: ffmpeg completed but file not found: {output_path}")
|
84 |
-
# return False
|
85 |
-
# file_size = os.path.getsize(output_path)
|
86 |
-
# print(f"Created clip: {output_path} ({file_size} bytes)")
|
87 |
-
# return True
|
88 |
-
# except subprocess.CalledProcessError as e:
|
89 |
-
# print(f"Error extracting clip. Details: {str(e)}")
|
90 |
-
# return False
|
91 |
-
|
92 |
-
# def extract_clips_from_annotations(self, video_id: str, progress_callback=None) -> List[VideoClip]:
|
93 |
-
# # Determine paths
|
94 |
-
# video_path = os.path.join(self.videos_dir, f"{video_id}.mp4")
|
95 |
-
# annotation_path = os.path.join(self.annotations_dir, f"{video_id}_annotations.json")
|
96 |
-
# if not os.path.exists(video_path):
|
97 |
-
# raise FileNotFoundError(f"Video file not found: {video_path}")
|
98 |
-
# if not os.path.exists(annotation_path):
|
99 |
-
# raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
|
100 |
-
|
101 |
-
# with open(annotation_path, "r") as f:
|
102 |
-
# annotations = json.load(f)
|
103 |
-
# timestamps = sorted(annotations["timestamps"])
|
104 |
-
# is_valid, error_message = self.validate_timestamps(timestamps)
|
105 |
-
# if not is_valid:
|
106 |
-
# raise ValueError(f"Invalid timestamps in annotation file. {error_message}")
|
107 |
-
# video_duration = self.get_video_duration(video_path)
|
108 |
-
# if timestamps[-1] > video_duration:
|
109 |
-
# raise ValueError(
|
110 |
-
# f"Final timestamp ({str(round(Decimal(timestamps[-1]), 3))} seconds) " +
|
111 |
-
# f"exceeds video duration ({str(round(Decimal(video_duration), 3))} seconds)"
|
112 |
-
# )
|
113 |
-
|
114 |
-
# # Create segments using only consecutive pairs from the annotated boundaries.
|
115 |
-
# segments = [(timestamps[i], timestamps[i+1]) for i in range(len(timestamps)-1)]
|
116 |
-
# total_clips = len(segments)
|
117 |
-
|
118 |
-
# # Check metadata to see if segmentation is up-to-date.
|
119 |
-
# metadata_file = os.path.join(self.metadata_dir, f"{video_id}_metadata.json")
|
120 |
-
# use_cached = False
|
121 |
-
# if os.path.exists(metadata_file):
|
122 |
-
# with open(metadata_file, "r") as meta_f:
|
123 |
-
# try:
|
124 |
-
# meta_data = json.load(meta_f)
|
125 |
-
# if meta_data.get("segments") == segments:
|
126 |
-
# use_cached = True
|
127 |
-
# except Exception as ex:
|
128 |
-
# use_cached = False
|
129 |
-
|
130 |
-
# if use_cached:
|
131 |
-
# clips = []
|
132 |
-
# for i, (start, end) in enumerate(segments):
|
133 |
-
# clip_path = os.path.join(self.temp_dir, f"{video_id}_clip_{i:03d}.mp4")
|
134 |
-
# if not os.path.exists(clip_path):
|
135 |
-
# use_cached = False
|
136 |
-
# break
|
137 |
-
# clips.append(VideoClip(start, end, clip_path, video_id, i))
|
138 |
-
# if use_cached:
|
139 |
-
# if progress_callback:
|
140 |
-
# progress_callback(total_clips, total_clips)
|
141 |
-
# else:
|
142 |
-
# print("Using cached segmentation as boundaries haven't changed.")
|
143 |
-
# return clips
|
144 |
-
|
145 |
-
# # If metadata is missing, boundaries have changed, or some clip file is missing, re-run segmentation.
|
146 |
-
# clips = []
|
147 |
-
# current_clip = 0
|
148 |
-
# use_tqdm = progress_callback is None
|
149 |
-
# if use_tqdm:
|
150 |
-
# pbar = tqdm(total=total_clips, desc="Extracting clips")
|
151 |
-
# for segment in segments:
|
152 |
-
# start, end = segment
|
153 |
-
# clip_path = os.path.join(self.temp_dir, f"{video_id}_clip_{current_clip:03d}.mp4")
|
154 |
-
# if self.extract_clip(video_path, start, end, clip_path):
|
155 |
-
# clips.append(VideoClip(start, end, clip_path, video_id, current_clip))
|
156 |
-
# current_clip += 1
|
157 |
-
# if progress_callback:
|
158 |
-
# progress_callback(current_clip, total_clips)
|
159 |
-
# elif use_tqdm:
|
160 |
-
# pbar.update(1)
|
161 |
-
# if use_tqdm:
|
162 |
-
# pbar.close()
|
163 |
-
# # Save segmentation metadata for future use.
|
164 |
-
# meta_data = {"segments": segments}
|
165 |
-
# with open(metadata_file, "w") as meta_f:
|
166 |
-
# json.dump(meta_data, meta_f, indent=4)
|
167 |
-
# return clips
|
168 |
-
|
169 |
-
# def cleanup_clips(self, clips: List[VideoClip]) -> None:
|
170 |
-
# if not clips:
|
171 |
-
# return
|
172 |
-
# print("\nCleaning up temporary files...")
|
173 |
-
# for clip in clips:
|
174 |
-
# if os.path.exists(clip.clip_path):
|
175 |
-
# try:
|
176 |
-
# print(f"Removing: {clip.clip_path}")
|
177 |
-
# os.remove(clip.clip_path)
|
178 |
-
# except OSError as e:
|
179 |
-
# print(f"Warning: Failed to remove clip {clip.clip_path}. Error: {str(e)}")
|
180 |
-
# else:
|
181 |
-
# print(f"Warning: File not found for cleanup: {clip.clip_path}")
|
182 |
-
|
183 |
-
# def main() -> None:
|
184 |
-
# parser = argparse.ArgumentParser(description="Extract video clips based on annotations for a given video file ID.")
|
185 |
-
# parser.add_argument("video_id", help="Video file ID (without extension)")
|
186 |
-
# args = parser.parse_args()
|
187 |
-
|
188 |
-
# base_dir = os.path.join(
|
189 |
-
# str(pathlib.Path.home()),
|
190 |
-
# "andrew_messaround",
|
191 |
-
# "vsl_speech_to_signing_alignment",
|
192 |
-
# "boundary_annotation_webapp"
|
193 |
-
# )
|
194 |
-
# extractor = ClipExtractor(base_dir)
|
195 |
-
# try:
|
196 |
-
# clips = extractor.extract_clips_from_annotations(args.video_id)
|
197 |
-
# print(f"\nSuccessfully extracted {len(clips)} clips:")
|
198 |
-
# for clip in clips:
|
199 |
-
# print(f"Clip {clip.index}: {round(clip.start_time, 2)}s → {round(clip.end_time, 2)}s")
|
200 |
-
# print(f"Duration: {round(clip.duration, 2)}s")
|
201 |
-
# print(f"Path: {clip.clip_path}\n")
|
202 |
-
# except Exception as e:
|
203 |
-
# print(f"Error: {str(e)}")
|
204 |
-
|
205 |
-
# if __name__ == "__main__":
|
206 |
-
# main()
|
207 |
-
|
208 |
-
#!/usr/bin/env python3
|
209 |
-
|
210 |
import json
|
211 |
import os
|
212 |
import pathlib
|
@@ -215,6 +6,8 @@ from dataclasses import dataclass
|
|
215 |
from decimal import Decimal
|
216 |
from typing import List, Optional, Tuple
|
217 |
import argparse
|
|
|
|
|
218 |
|
219 |
try:
|
220 |
from tqdm import tqdm
|
@@ -226,6 +19,20 @@ except ImportError:
|
|
226 |
|
227 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
@dataclass
|
230 |
class VideoClip:
|
231 |
"""Represents a video clip with timing information."""
|
@@ -300,15 +107,31 @@ class ClipExtractor:
|
|
300 |
print(f"Error extracting clip. Details: {str(e)}")
|
301 |
return False
|
302 |
|
303 |
-
def extract_clips_from_annotations(self, video_id
|
|
|
304 |
# Determine paths
|
305 |
-
|
|
|
306 |
annotation_path = os.path.join(self.annotations_dir, f"{video_id}_annotations.json")
|
307 |
-
|
308 |
-
raise FileNotFoundError(f"Video file not found: {video_path}")
|
309 |
if not os.path.exists(annotation_path):
|
310 |
raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
|
311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
312 |
with open(annotation_path, "r") as f:
|
313 |
annotations = json.load(f)
|
314 |
timestamps = sorted(annotations["timestamps"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
import pathlib
|
|
|
6 |
from decimal import Decimal
|
7 |
from typing import List, Optional, Tuple
|
8 |
import argparse
|
9 |
+
import boto3
|
10 |
+
from botocore.exceptions import ClientError
|
11 |
|
12 |
try:
|
13 |
from tqdm import tqdm
|
|
|
19 |
|
20 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
21 |
|
22 |
+
# Add these constants at the top of the file
|
23 |
+
S3_BUCKET = "sorenson-ai-sb-scratch"
|
24 |
+
S3_VIDEO_PREFIX = "awilkinson/kylie_dataset_videos_for_alignment_webapp/"
|
25 |
+
USE_S3_FOR_VIDEOS = True # Set to True to use S3, False to use local files
|
26 |
+
|
27 |
+
def get_s3_client():
|
28 |
+
"""Get a boto3 S3 client."""
|
29 |
+
return boto3.client(
|
30 |
+
's3',
|
31 |
+
region_name=os.environ.get('AWS_DEFAULT_REGION', 'us-west-2'),
|
32 |
+
aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
|
33 |
+
aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY')
|
34 |
+
)
|
35 |
+
|
36 |
@dataclass
|
37 |
class VideoClip:
|
38 |
"""Represents a video clip with timing information."""
|
|
|
107 |
print(f"Error extracting clip. Details: {str(e)}")
|
108 |
return False
|
109 |
|
110 |
+
def extract_clips_from_annotations(self, video_id, progress_callback=None) -> List[VideoClip]:
|
111 |
+
"""Extract clips based on annotation timestamps, handling S3 videos."""
|
112 |
# Determine paths
|
113 |
+
video_filename = f"{video_id}.mp4"
|
114 |
+
video_path = os.path.join(self.videos_dir, video_filename)
|
115 |
annotation_path = os.path.join(self.annotations_dir, f"{video_id}_annotations.json")
|
116 |
+
|
|
|
117 |
if not os.path.exists(annotation_path):
|
118 |
raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
|
119 |
+
|
120 |
+
# Check if we need to download the video from S3
|
121 |
+
if USE_S3_FOR_VIDEOS and not os.path.exists(video_path):
|
122 |
+
print(f"Video not found locally. Downloading from S3: {video_id}")
|
123 |
+
s3_client = get_s3_client()
|
124 |
+
try:
|
125 |
+
s3_key = f"{S3_VIDEO_PREFIX}{video_filename}"
|
126 |
+
s3_client.download_file(S3_BUCKET, s3_key, video_path)
|
127 |
+
print(f"Video downloaded successfully: {video_path}")
|
128 |
+
except ClientError as e:
|
129 |
+
raise FileNotFoundError(f"Video file not found in S3: {s3_key}. Error: {str(e)}")
|
130 |
+
|
131 |
+
if not os.path.exists(video_path):
|
132 |
+
raise FileNotFoundError(f"Video file not found: {video_path}")
|
133 |
+
|
134 |
+
# Now continue with the original extraction process
|
135 |
with open(annotation_path, "r") as f:
|
136 |
annotations = json.load(f)
|
137 |
timestamps = sorted(annotations["timestamps"])
|
flask_app.py
CHANGED
@@ -1,9 +1,591 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from flask import Flask, render_template, jsonify, request, send_from_directory, send_file, redirect, url_for, session
|
2 |
import os, json, threading, time, signal, sys
|
3 |
from datetime import datetime
|
4 |
from extract_signed_segments_from_annotations import ClipExtractor, VideoClip
|
5 |
import logging
|
6 |
from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
# Load environment variables
|
9 |
load_dotenv()
|
@@ -47,6 +629,11 @@ WORD_TIMESTAMPS_DIR = os.path.abspath("data/word_timestamps")
|
|
47 |
ALIGNMENTS_DIR = os.path.abspath("data/alignments")
|
48 |
TRANSCRIPTS_DIR = os.path.abspath("data/transcripts")
|
49 |
|
|
|
|
|
|
|
|
|
|
|
50 |
# Ensure all required directories exist
|
51 |
for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALIGNMENTS_DIR, TRANSCRIPTS_DIR]:
|
52 |
os.makedirs(directory, exist_ok=True)
|
@@ -55,6 +642,82 @@ for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALI
|
|
55 |
clip_extraction_status = {}
|
56 |
transcription_progress_status = {}
|
57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
# Graceful shutdown handler
|
59 |
def graceful_shutdown(signum, frame):
|
60 |
"""Handle graceful shutdown on signals."""
|
@@ -116,7 +779,19 @@ def run_transcription(video_id):
|
|
116 |
transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
|
117 |
return
|
118 |
|
119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
transcription_progress_status[video_id] = {"status": "started", "percent": 10}
|
121 |
|
122 |
# Check if AWS credentials are available
|
@@ -177,7 +852,6 @@ def auth_callback():
|
|
177 |
return render_template('error.html', message="Authentication failed. No username provided.")
|
178 |
return redirect(url_for('login'))
|
179 |
|
180 |
-
# Replace the health check route with this improved version
|
181 |
@app.route('/health')
|
182 |
def health_check():
|
183 |
"""Health check endpoint for container verification."""
|
@@ -187,7 +861,10 @@ def health_check():
|
|
187 |
"DEBUG": os.environ.get('DEBUG', 'Not set'),
|
188 |
"SPACE_ID": os.environ.get('SPACE_ID', 'Not set'),
|
189 |
"BYPASS_AUTH": os.environ.get('BYPASS_AUTH', 'Not set'),
|
190 |
-
"SECRET_KEY": os.environ.get('SECRET_KEY', 'Not set')[:5] + '...' if os.environ.get('SECRET_KEY') else 'Not set'
|
|
|
|
|
|
|
191 |
}
|
192 |
|
193 |
logger.info(f"Health check called. Environment: {env_vars}")
|
@@ -301,6 +978,11 @@ def debug_info():
|
|
301 |
"app_config": {k: str(v) for k, v in app.config.items() if k in
|
302 |
['SESSION_COOKIE_SECURE', 'SESSION_COOKIE_HTTPONLY',
|
303 |
'SESSION_COOKIE_SAMESITE', 'PERMANENT_SESSION_LIFETIME']},
|
|
|
|
|
|
|
|
|
|
|
304 |
}
|
305 |
return jsonify(info)
|
306 |
|
@@ -315,10 +997,14 @@ def index():
|
|
315 |
@login_required
|
316 |
def select_video():
|
317 |
"""Page to select a video for annotation."""
|
318 |
-
if
|
319 |
-
|
320 |
-
|
321 |
-
|
|
|
|
|
|
|
|
|
322 |
return render_template('select_video.html', video_ids=video_ids, user=session.get('user'))
|
323 |
|
324 |
@app.route('/player/<video_id>')
|
@@ -331,20 +1017,44 @@ def player(video_id):
|
|
331 |
@login_required
|
332 |
def get_videos():
|
333 |
"""API endpoint to get available videos."""
|
334 |
-
if
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
340 |
|
341 |
@app.route('/video/<path:filename>')
|
342 |
@login_required
|
343 |
def serve_video(filename):
|
344 |
-
"""Serve a video file."""
|
345 |
-
|
346 |
-
|
347 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
348 |
|
349 |
@app.route('/save_annotations', methods=['POST'])
|
350 |
@login_required
|
@@ -532,6 +1242,15 @@ def save_alignments():
|
|
532 |
@login_required
|
533 |
def extract_clips_for_video(video_id):
|
534 |
"""Extract clips and start transcription for a video."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
535 |
status = clip_extraction_status.get(video_id, {})
|
536 |
if status.get("percent", 0) < 100:
|
537 |
thread = threading.Thread(target=run_clip_extraction, args=(video_id,))
|
@@ -563,7 +1282,16 @@ if __name__ == '__main__':
|
|
563 |
print(f"- Running in HF Space: {is_hf_space}")
|
564 |
print(f"- Auth bypass: {bypass_auth}")
|
565 |
print(f"- Port: {os.getenv('PORT', 5000)}")
|
|
|
|
|
|
|
566 |
print(f"- Available videos: {os.listdir(VIDEO_DIR) if os.path.exists(VIDEO_DIR) else 'None'}")
|
|
|
|
|
|
|
|
|
|
|
|
|
567 |
print("=" * 50)
|
568 |
|
569 |
port = int(os.getenv('PORT', 5000))
|
|
|
1 |
+
# from flask import Flask, render_template, jsonify, request, send_from_directory, send_file, redirect, url_for, session
|
2 |
+
# import os, json, threading, time, signal, sys
|
3 |
+
# from datetime import datetime
|
4 |
+
# from extract_signed_segments_from_annotations import ClipExtractor, VideoClip
|
5 |
+
# import logging
|
6 |
+
# from dotenv import load_dotenv
|
7 |
+
|
8 |
+
# # Load environment variables
|
9 |
+
# load_dotenv()
|
10 |
+
|
11 |
+
# # Add this near the top with other environment variables
|
12 |
+
# bypass_auth = os.getenv('BYPASS_AUTH', 'false').lower() == 'true'
|
13 |
+
|
14 |
+
# # Configure logging first
|
15 |
+
# logging.basicConfig(
|
16 |
+
# level=logging.INFO,
|
17 |
+
# format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
18 |
+
# )
|
19 |
+
# logger = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
# # Hugging Face specific configuration
|
22 |
+
# is_hf_space = os.getenv('SPACE_ID') is not None
|
23 |
+
# if is_hf_space:
|
24 |
+
# logger.info("Running in Hugging Face Spaces environment")
|
25 |
+
# # Allow insecure transport for development in HF
|
26 |
+
# os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
|
27 |
+
# # Ensure port is set correctly
|
28 |
+
# os.environ['PORT'] = '7860'
|
29 |
+
|
30 |
+
|
31 |
+
|
32 |
+
# app = Flask(__name__)
|
33 |
+
# app.secret_key = os.getenv('SECRET_KEY', 'dev_key_for_testing')
|
34 |
+
|
35 |
+
# # Configure session for HF
|
36 |
+
# if is_hf_space:
|
37 |
+
# app.config['SESSION_COOKIE_SECURE'] = False
|
38 |
+
# app.config['SESSION_COOKIE_HTTPONLY'] = True
|
39 |
+
# app.config['SESSION_COOKIE_SAMESITE'] = None # Add this line
|
40 |
+
# app.config['PERMANENT_SESSION_LIFETIME'] = 86400 # 24 hours
|
41 |
+
|
42 |
+
# # Directory paths
|
43 |
+
# VIDEO_DIR = os.path.abspath("data/videos")
|
44 |
+
# ANNOTATIONS_DIR = os.path.abspath("data/annotations")
|
45 |
+
# TEMP_DIR = os.path.abspath("data/temp")
|
46 |
+
# WORD_TIMESTAMPS_DIR = os.path.abspath("data/word_timestamps")
|
47 |
+
# ALIGNMENTS_DIR = os.path.abspath("data/alignments")
|
48 |
+
# TRANSCRIPTS_DIR = os.path.abspath("data/transcripts")
|
49 |
+
|
50 |
+
# # Ensure all required directories exist
|
51 |
+
# for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALIGNMENTS_DIR, TRANSCRIPTS_DIR]:
|
52 |
+
# os.makedirs(directory, exist_ok=True)
|
53 |
+
|
54 |
+
# # Global dictionaries for progress tracking
|
55 |
+
# clip_extraction_status = {}
|
56 |
+
# transcription_progress_status = {}
|
57 |
+
|
58 |
+
# # Graceful shutdown handler
|
59 |
+
# def graceful_shutdown(signum, frame):
|
60 |
+
# """Handle graceful shutdown on signals."""
|
61 |
+
# logger.info(f"Received signal {signum}, shutting down gracefully...")
|
62 |
+
# # Clean up as needed here
|
63 |
+
# sys.exit(0)
|
64 |
+
|
65 |
+
# # Register signal handlers
|
66 |
+
# signal.signal(signal.SIGTERM, graceful_shutdown)
|
67 |
+
# signal.signal(signal.SIGINT, graceful_shutdown)
|
68 |
+
|
69 |
+
# # Login required decorator
|
70 |
+
# def login_required(f):
|
71 |
+
# from functools import wraps
|
72 |
+
# @wraps(f)
|
73 |
+
# def decorated_function(*args, **kwargs):
|
74 |
+
# if 'user' not in session:
|
75 |
+
# logger.info(f"User not in session, redirecting to login")
|
76 |
+
# return redirect(url_for('login'))
|
77 |
+
# return f(*args, **kwargs)
|
78 |
+
# return decorated_function
|
79 |
+
|
80 |
+
# # Allow specific users (for testing)
|
81 |
+
# def is_allowed_user(username):
|
82 |
+
# allowed_users_env = os.getenv('ALLOWED_USERS', 'Perilon') # Default to your username
|
83 |
+
# allowed_users = [user.strip() for user in allowed_users_env.split(',')]
|
84 |
+
# return username in allowed_users or not is_hf_space # Allow all users in local dev
|
85 |
+
|
86 |
+
# def update_extraction_progress(video_id, current, total):
|
87 |
+
# percent = int((current / total) * 100)
|
88 |
+
# clip_extraction_status[video_id] = {"current": current, "total": total, "percent": percent}
|
89 |
+
|
90 |
+
# def run_clip_extraction(video_id):
|
91 |
+
# try:
|
92 |
+
# base_dir = app.root_path
|
93 |
+
# extractor = ClipExtractor(base_dir)
|
94 |
+
# extractor.extract_clips_from_annotations(
|
95 |
+
# video_id,
|
96 |
+
# progress_callback=lambda current, total: update_extraction_progress(video_id, current, total)
|
97 |
+
# )
|
98 |
+
# if video_id in clip_extraction_status:
|
99 |
+
# status = clip_extraction_status[video_id]
|
100 |
+
# if status.get("percent", 0) < 100:
|
101 |
+
# update_extraction_progress(video_id, status["total"], status["total"])
|
102 |
+
# else:
|
103 |
+
# update_extraction_progress(video_id, 1, 1)
|
104 |
+
# except Exception as e:
|
105 |
+
# logger.error(f"Error during clip extraction for {video_id}: {str(e)}")
|
106 |
+
# clip_extraction_status[video_id] = {"error": str(e)}
|
107 |
+
|
108 |
+
# def run_transcription(video_id):
|
109 |
+
# try:
|
110 |
+
# base_dir = app.root_path
|
111 |
+
# output_path = os.path.join(WORD_TIMESTAMPS_DIR, f"{video_id}_word_timestamps.json")
|
112 |
+
|
113 |
+
# # Check if transcription already exists and is valid.
|
114 |
+
# if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
|
115 |
+
# logger.info(f"Using cached transcription for video {video_id}.")
|
116 |
+
# transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
|
117 |
+
# return
|
118 |
+
|
119 |
+
# video_path = os.path.join(base_dir, "data", "videos", f"{video_id}.mp4")
|
120 |
+
# transcription_progress_status[video_id] = {"status": "started", "percent": 10}
|
121 |
+
|
122 |
+
# # Check if AWS credentials are available
|
123 |
+
# if not os.environ.get('AWS_ACCESS_KEY_ID') or not os.environ.get('AWS_SECRET_ACCESS_KEY'):
|
124 |
+
# logger.warning("AWS credentials not found. Transcription will not work properly.")
|
125 |
+
# transcription_progress_status[video_id] = {
|
126 |
+
# "status": "error",
|
127 |
+
# "percent": 0,
|
128 |
+
# "message": "AWS credentials missing"
|
129 |
+
# }
|
130 |
+
# return
|
131 |
+
|
132 |
+
# # Run transcription via the imported function from get_transcription_with_amazon.py
|
133 |
+
# from get_transcription_with_amazon import get_word_timestamps
|
134 |
+
# word_timestamps = get_word_timestamps(video_path)
|
135 |
+
|
136 |
+
# with open(output_path, "w") as f:
|
137 |
+
# json.dump(word_timestamps, f, indent=4)
|
138 |
+
|
139 |
+
# transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
|
140 |
+
# except Exception as e:
|
141 |
+
# logger.error(f"Error during transcription for {video_id}: {str(e)}")
|
142 |
+
# transcription_progress_status[video_id] = {"status": "error", "percent": 0, "message": str(e)}
|
143 |
+
|
144 |
+
# # Authentication routes
|
145 |
+
# @app.route('/login')
|
146 |
+
# def login():
|
147 |
+
# """Handle login for both local and HF environments."""
|
148 |
+
# logger.info(f"Login route called. Headers: {dict(request.headers)}")
|
149 |
+
|
150 |
+
# if is_hf_space:
|
151 |
+
# username = request.headers.get('X-Spaces-Username')
|
152 |
+
# logger.info(f"Username from headers in login: {username}")
|
153 |
+
|
154 |
+
# if username and is_allowed_user(username):
|
155 |
+
# session['user'] = {'name': username, 'is_hf': True}
|
156 |
+
# return redirect(url_for('index'))
|
157 |
+
# else:
|
158 |
+
# # Redirect to the HF auth endpoint
|
159 |
+
# return redirect('/auth')
|
160 |
+
# else:
|
161 |
+
# # For local development
|
162 |
+
# session['user'] = {'name': 'LocalDeveloper', 'is_mock': True}
|
163 |
+
# return redirect(url_for('index'))
|
164 |
+
|
165 |
+
# @app.route('/auth/callback')
|
166 |
+
# def auth_callback():
|
167 |
+
# """This route will be called by Hugging Face after successful authentication."""
|
168 |
+
# logger.info(f"Auth callback called. Headers: {dict(request.headers)}")
|
169 |
+
|
170 |
+
# if is_hf_space:
|
171 |
+
# # In Hugging Face Spaces, the user info is available in the request headers
|
172 |
+
# username = request.headers.get('X-Spaces-Username')
|
173 |
+
# if username:
|
174 |
+
# session['user'] = {'name': username, 'is_hf': True}
|
175 |
+
# return redirect(url_for('index'))
|
176 |
+
# else:
|
177 |
+
# return render_template('error.html', message="Authentication failed. No username provided.")
|
178 |
+
# return redirect(url_for('login'))
|
179 |
+
|
180 |
+
# # Replace the health check route with this improved version
|
181 |
+
# @app.route('/health')
|
182 |
+
# def health_check():
|
183 |
+
# """Health check endpoint for container verification."""
|
184 |
+
# # Log environment variables for debugging
|
185 |
+
# env_vars = {
|
186 |
+
# "FLASK_ENV": os.environ.get('FLASK_ENV', 'production'),
|
187 |
+
# "DEBUG": os.environ.get('DEBUG', 'Not set'),
|
188 |
+
# "SPACE_ID": os.environ.get('SPACE_ID', 'Not set'),
|
189 |
+
# "BYPASS_AUTH": os.environ.get('BYPASS_AUTH', 'Not set'),
|
190 |
+
# "SECRET_KEY": os.environ.get('SECRET_KEY', 'Not set')[:5] + '...' if os.environ.get('SECRET_KEY') else 'Not set'
|
191 |
+
# }
|
192 |
+
|
193 |
+
# logger.info(f"Health check called. Environment: {env_vars}")
|
194 |
+
|
195 |
+
# # Get session information for debugging
|
196 |
+
# session_info = dict(session) if session else None
|
197 |
+
# session_keys = list(session.keys()) if session else []
|
198 |
+
|
199 |
+
# return jsonify({
|
200 |
+
# "status": "healthy",
|
201 |
+
# "environment": env_vars,
|
202 |
+
# "session_keys": session_keys,
|
203 |
+
# "is_hf_space": is_hf_space,
|
204 |
+
# "bypass_auth": bypass_auth,
|
205 |
+
# "directories": {
|
206 |
+
# "videos": os.path.exists(VIDEO_DIR),
|
207 |
+
# "annotations": os.path.exists(ANNOTATIONS_DIR),
|
208 |
+
# "temp": os.path.exists(TEMP_DIR)
|
209 |
+
# }
|
210 |
+
# })
|
211 |
+
|
212 |
+
# @app.route('/auth')
|
213 |
+
# def auth():
|
214 |
+
# """This route handles HF authentication."""
|
215 |
+
# logger.info(f"Auth route called. Headers: {dict(request.headers)}")
|
216 |
+
|
217 |
+
# # Force bypass auth to be true for debugging
|
218 |
+
# bypass_auth = True
|
219 |
+
|
220 |
+
# # If bypass is enabled, authenticate immediately
|
221 |
+
# if bypass_auth:
|
222 |
+
# logger.info("Auth bypass enabled, setting default user")
|
223 |
+
# session['user'] = {'name': 'Perilon', 'is_hf': True}
|
224 |
+
# return redirect(url_for('index'))
|
225 |
+
|
226 |
+
# # Normal authentication logic
|
227 |
+
# username = request.headers.get('X-Spaces-Username')
|
228 |
+
# logger.info(f"Username from headers in auth: {username}")
|
229 |
+
|
230 |
+
# if is_hf_space and username and is_allowed_user(username):
|
231 |
+
# logger.info(f"Setting user in session: {username}")
|
232 |
+
# session['user'] = {'name': username, 'is_hf': True}
|
233 |
+
# return redirect(url_for('index'))
|
234 |
+
# elif not is_hf_space:
|
235 |
+
# # For local development
|
236 |
+
# session['user'] = {'name': 'LocalDeveloper', 'is_mock': True}
|
237 |
+
# return redirect(url_for('index'))
|
238 |
+
# else:
|
239 |
+
# # For HF with no valid username yet
|
240 |
+
# return render_template('error.html', message=
|
241 |
+
# "Waiting for Hugging Face authentication. If you continue to see this message, "
|
242 |
+
# "please make sure you're logged into Hugging Face and your username is allowed.")
|
243 |
+
|
244 |
+
# @app.before_request
|
245 |
+
# def check_auth():
|
246 |
+
# """Check authentication before processing requests."""
|
247 |
+
# # Skip authentication for certain routes and static files
|
248 |
+
# if request.path in ['/login', '/logout', '/auth', '/auth/callback', '/debug', '/health'] or request.path.startswith('/static/'):
|
249 |
+
# return
|
250 |
+
|
251 |
+
# # Force bypass auth to be true for debugging
|
252 |
+
# bypass_auth = True
|
253 |
+
|
254 |
+
# # Log all request paths to help troubleshoot
|
255 |
+
# logger.debug(f"Request path: {request.path}, User in session: {'user' in session}")
|
256 |
+
|
257 |
+
# if bypass_auth:
|
258 |
+
# # Set default user for bypass mode if not already set
|
259 |
+
# if 'user' not in session:
|
260 |
+
# session['user'] = {'name': 'Perilon', 'is_hf': True}
|
261 |
+
# return
|
262 |
+
|
263 |
+
# if is_hf_space:
|
264 |
+
# # Check for HF username header
|
265 |
+
# username = request.headers.get('X-Spaces-Username')
|
266 |
+
|
267 |
+
# if 'user' in session:
|
268 |
+
# logger.debug(f"User in session: {session['user']}")
|
269 |
+
# return
|
270 |
+
|
271 |
+
# if username and is_allowed_user(username):
|
272 |
+
# logger.info(f"Setting user from headers: {username}")
|
273 |
+
# session['user'] = {'name': username, 'is_hf': True}
|
274 |
+
# return
|
275 |
+
|
276 |
+
# # No valid user in session or headers
|
277 |
+
# logger.info(f"No authenticated user, redirecting to /auth")
|
278 |
+
# return redirect('/auth')
|
279 |
+
# elif 'user' not in session:
|
280 |
+
# return redirect(url_for('login'))
|
281 |
+
|
282 |
+
# @app.route('/logout')
|
283 |
+
# def logout():
|
284 |
+
# """Clear session and redirect to login."""
|
285 |
+
# session.clear() # Clear the entire session
|
286 |
+
# if is_hf_space:
|
287 |
+
# return redirect('/auth/logout')
|
288 |
+
# return redirect(url_for('login'))
|
289 |
+
|
290 |
+
# @app.route('/debug')
|
291 |
+
# def debug_info():
|
292 |
+
# """Return debug information."""
|
293 |
+
# cookies = {key: request.cookies.get(key) for key in request.cookies.keys()}
|
294 |
+
|
295 |
+
# info = {
|
296 |
+
# "session": dict(session) if session else None,
|
297 |
+
# "headers": dict(request.headers),
|
298 |
+
# "cookies": cookies,
|
299 |
+
# "is_hf_space": is_hf_space,
|
300 |
+
# "allowed_users": os.getenv('ALLOWED_USERS', 'Perilon'),
|
301 |
+
# "app_config": {k: str(v) for k, v in app.config.items() if k in
|
302 |
+
# ['SESSION_COOKIE_SECURE', 'SESSION_COOKIE_HTTPONLY',
|
303 |
+
# 'SESSION_COOKIE_SAMESITE', 'PERMANENT_SESSION_LIFETIME']},
|
304 |
+
# }
|
305 |
+
# return jsonify(info)
|
306 |
+
|
307 |
+
# # Main application routes
|
308 |
+
# @app.route('/')
|
309 |
+
# @login_required
|
310 |
+
# def index():
|
311 |
+
# """Main entry point, redirects to video selection."""
|
312 |
+
# return redirect(url_for('select_video'))
|
313 |
+
|
314 |
+
# @app.route('/select_video')
|
315 |
+
# @login_required
|
316 |
+
# def select_video():
|
317 |
+
# """Page to select a video for annotation."""
|
318 |
+
# if not os.path.exists(VIDEO_DIR):
|
319 |
+
# return render_template('error.html', message="Video directory not found.")
|
320 |
+
# videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith('.mp4')]
|
321 |
+
# video_ids = [os.path.splitext(v)[0] for v in videos]
|
322 |
+
# return render_template('select_video.html', video_ids=video_ids, user=session.get('user'))
|
323 |
+
|
324 |
+
# @app.route('/player/<video_id>')
|
325 |
+
# @login_required
|
326 |
+
# def player(video_id):
|
327 |
+
# """Video player page for annotation."""
|
328 |
+
# return render_template('player.html', video_id=video_id, user=session.get('user'))
|
329 |
+
|
330 |
+
# @app.route('/videos')
|
331 |
+
# @login_required
|
332 |
+
# def get_videos():
|
333 |
+
# """API endpoint to get available videos."""
|
334 |
+
# if not os.path.exists(VIDEO_DIR):
|
335 |
+
# return jsonify({'error': 'Video directory not found'}), 404
|
336 |
+
# videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith(('.mp4', '.avi', '.mov'))]
|
337 |
+
# if not videos:
|
338 |
+
# return jsonify({'error': 'No videos found'}), 404
|
339 |
+
# return jsonify(videos)
|
340 |
+
|
341 |
+
# @app.route('/video/<path:filename>')
|
342 |
+
# @login_required
|
343 |
+
# def serve_video(filename):
|
344 |
+
# """Serve a video file."""
|
345 |
+
# if not os.path.exists(os.path.join(VIDEO_DIR, filename)):
|
346 |
+
# return jsonify({'error': 'Video not found'}), 404
|
347 |
+
# return send_from_directory(VIDEO_DIR, filename)
|
348 |
+
|
349 |
+
# @app.route('/save_annotations', methods=['POST'])
|
350 |
+
# @login_required
|
351 |
+
# def save_annotations():
|
352 |
+
# """Save annotation data."""
|
353 |
+
# data = request.json
|
354 |
+
# if not data or 'video' not in data or 'timestamps' not in data:
|
355 |
+
# return jsonify({'success': False, 'message': 'Invalid data'}), 400
|
356 |
+
|
357 |
+
# annotation_file = os.path.join(ANNOTATIONS_DIR, f"{data['video']}_annotations.json")
|
358 |
+
# annotation_data = {
|
359 |
+
# "video_name": data['video'] + ".mp4",
|
360 |
+
# "timestamps": sorted(data['timestamps']),
|
361 |
+
# "annotation_date": datetime.now().isoformat(),
|
362 |
+
# "annotated_by": session.get('user', {}).get('name', 'unknown')
|
363 |
+
# }
|
364 |
+
# with open(annotation_file, 'w') as f:
|
365 |
+
# json.dump(annotation_data, f, indent=4)
|
366 |
+
# return jsonify({'success': True, 'message': 'Annotations saved successfully'})
|
367 |
+
|
368 |
+
# @app.route('/get_annotations/<path:video_name>')
|
369 |
+
# @login_required
|
370 |
+
# def get_annotations(video_name):
|
371 |
+
# """Get annotations for a video."""
|
372 |
+
# annotation_file = os.path.join(ANNOTATIONS_DIR, f"{video_name}_annotations.json")
|
373 |
+
# if not os.path.exists(annotation_file):
|
374 |
+
# return jsonify({'error': 'No annotations found'}), 404
|
375 |
+
# with open(annotation_file, 'r') as f:
|
376 |
+
# annotations = json.load(f)
|
377 |
+
# return jsonify(annotations)
|
378 |
+
|
379 |
+
# @app.route("/alignment/<video_id>")
|
380 |
+
# @login_required
|
381 |
+
# def alignment_mode(video_id):
|
382 |
+
# """Page for aligning sign language with transcribed text."""
|
383 |
+
# annotation_file = os.path.join(ANNOTATIONS_DIR, f"{video_id}_annotations.json")
|
384 |
+
# if not os.path.exists(annotation_file):
|
385 |
+
# return render_template("error.html", message="No annotations found for this video. Please annotate the video first.")
|
386 |
+
# with open(annotation_file, 'r') as f:
|
387 |
+
# annotations = json.load(f)
|
388 |
+
# return render_template(
|
389 |
+
# "alignment.html",
|
390 |
+
# video_id=video_id,
|
391 |
+
# total_clips=len(annotations['timestamps']) - 1,
|
392 |
+
# user=session.get('user')
|
393 |
+
# )
|
394 |
+
|
395 |
+
# @app.route("/api/transcript/<video_id>")
|
396 |
+
# @login_required
|
397 |
+
# def get_transcript(video_id):
|
398 |
+
# """Get transcript for a video."""
|
399 |
+
# timestamps_file = os.path.join(WORD_TIMESTAMPS_DIR, f"{video_id}_word_timestamps.json")
|
400 |
+
# logger.info(f"Attempting to load word timestamps from: {timestamps_file}")
|
401 |
+
# if not os.path.exists(timestamps_file):
|
402 |
+
# logger.warning(f"Word timestamps file not found: {timestamps_file}")
|
403 |
+
# return jsonify({
|
404 |
+
# "status": "error",
|
405 |
+
# "message": "No word timestamps found for this video"
|
406 |
+
# }), 404
|
407 |
+
# try:
|
408 |
+
# with open(timestamps_file, 'r') as f:
|
409 |
+
# word_data = json.load(f)
|
410 |
+
# full_text = " ".join(item["punctuated_word"] for item in word_data)
|
411 |
+
# words_with_times = [{
|
412 |
+
# "word": item["punctuated_word"],
|
413 |
+
# "start": float(item["start_time"]),
|
414 |
+
# "end": float(item["end_time"])
|
415 |
+
# } for item in word_data]
|
416 |
+
# logger.info(f"Successfully created transcript ({len(full_text)} characters)")
|
417 |
+
# return jsonify({
|
418 |
+
# "status": "success",
|
419 |
+
# "text": full_text,
|
420 |
+
# "words": words_with_times
|
421 |
+
# })
|
422 |
+
# except Exception as e:
|
423 |
+
# logger.error(f"Error processing word timestamps: {str(e)}")
|
424 |
+
# return jsonify({
|
425 |
+
# "status": "error",
|
426 |
+
# "message": f"Error processing word timestamps: {str(e)}"
|
427 |
+
# }), 500
|
428 |
+
|
429 |
+
# @app.route("/api/word_timestamps/<video_id>")
|
430 |
+
# @login_required
|
431 |
+
# def get_word_timestamps(video_id):
|
432 |
+
# """Get word-level timestamps for a video."""
|
433 |
+
# timestamps_file = os.path.join(WORD_TIMESTAMPS_DIR, f"{video_id}_word_timestamps.json")
|
434 |
+
# logger.info(f"Attempting to load word timestamps from: {timestamps_file}")
|
435 |
+
# if not os.path.exists(timestamps_file):
|
436 |
+
# logger.warning(f"Word timestamps file not found: {timestamps_file}")
|
437 |
+
# return jsonify({
|
438 |
+
# "status": "error",
|
439 |
+
# "message": "No word timestamps found for this video"
|
440 |
+
# }), 404
|
441 |
+
# try:
|
442 |
+
# with open(timestamps_file, 'r') as f:
|
443 |
+
# word_data = json.load(f)
|
444 |
+
# logger.info(f"Successfully loaded {len(word_data)} word timestamps")
|
445 |
+
# return jsonify({
|
446 |
+
# "status": "success",
|
447 |
+
# "words": word_data
|
448 |
+
# })
|
449 |
+
# except Exception as e:
|
450 |
+
# logger.error(f"Error processing word timestamps: {str(e)}")
|
451 |
+
# return jsonify({
|
452 |
+
# "status": "error",
|
453 |
+
# "message": f"Error processing word timestamps: {str(e)}"
|
454 |
+
# }), 500
|
455 |
+
|
456 |
+
# @app.route("/api/clips/<video_id>")
|
457 |
+
# @login_required
|
458 |
+
# def get_video_clips(video_id):
|
459 |
+
# """Get clips for a video."""
|
460 |
+
# try:
|
461 |
+
# annotation_file = os.path.join(ANNOTATIONS_DIR, f"{video_id}_annotations.json")
|
462 |
+
# if not os.path.exists(annotation_file):
|
463 |
+
# raise FileNotFoundError("Annotations not found")
|
464 |
+
# with open(annotation_file, 'r') as f:
|
465 |
+
# annotations = json.load(f)
|
466 |
+
# timestamps = annotations['timestamps']
|
467 |
+
# clips = []
|
468 |
+
# for i in range(len(timestamps)-1):
|
469 |
+
# clips.append({
|
470 |
+
# "index": i,
|
471 |
+
# "start": timestamps[i],
|
472 |
+
# "end": timestamps[i+1],
|
473 |
+
# "path": f"/clip/{video_id}/{i}"
|
474 |
+
# })
|
475 |
+
# return jsonify({
|
476 |
+
# "status": "success",
|
477 |
+
# "clips": clips
|
478 |
+
# })
|
479 |
+
# except Exception as e:
|
480 |
+
# logger.error(f"Error getting clips: {str(e)}")
|
481 |
+
# return jsonify({
|
482 |
+
# "status": "error",
|
483 |
+
# "message": str(e)
|
484 |
+
# }), 500
|
485 |
+
|
486 |
+
# @app.route("/clip/<video_id>/<int:clip_index>")
|
487 |
+
# @login_required
|
488 |
+
# def serve_clip(video_id, clip_index):
|
489 |
+
# """Serve a specific clip."""
|
490 |
+
# clip_path = os.path.join(
|
491 |
+
# TEMP_DIR,
|
492 |
+
# f"{video_id}_clip_{clip_index:03d}.mp4"
|
493 |
+
# )
|
494 |
+
# logger.info(f"Attempting to serve clip: {clip_path}")
|
495 |
+
# if not os.path.exists(clip_path):
|
496 |
+
# logger.error(f"Clip not found: {clip_path}")
|
497 |
+
# return jsonify({
|
498 |
+
# "status": "error",
|
499 |
+
# "message": "Clip not found"
|
500 |
+
# }), 404
|
501 |
+
# return send_file(clip_path, mimetype="video/mp4")
|
502 |
+
|
503 |
+
# @app.route("/api/save_alignments", methods=["POST"])
|
504 |
+
# @login_required
|
505 |
+
# def save_alignments():
|
506 |
+
# """Save alignment data."""
|
507 |
+
# try:
|
508 |
+
# data = request.json
|
509 |
+
# if not data or 'video_id' not in data or 'alignments' not in data:
|
510 |
+
# return jsonify({'success': False, 'message': 'Invalid data'}), 400
|
511 |
+
|
512 |
+
# # Add user information to the alignments
|
513 |
+
# for alignment in data['alignments']:
|
514 |
+
# if alignment:
|
515 |
+
# alignment['aligned_by'] = session.get('user', {}).get('name', 'unknown')
|
516 |
+
|
517 |
+
# output_path = os.path.join(ALIGNMENTS_DIR, f"{data['video_id']}.json")
|
518 |
+
# with open(output_path, "w") as f:
|
519 |
+
# json.dump(data['alignments'], f, indent=2)
|
520 |
+
# return jsonify({
|
521 |
+
# "success": True,
|
522 |
+
# "message": "Alignments saved successfully"
|
523 |
+
# })
|
524 |
+
# except Exception as e:
|
525 |
+
# logger.error(f"Error saving alignments: {str(e)}")
|
526 |
+
# return jsonify({
|
527 |
+
# "success": False,
|
528 |
+
# "message": str(e)
|
529 |
+
# }), 500
|
530 |
+
|
531 |
+
# @app.route("/api/extract_clips/<video_id>")
|
532 |
+
# @login_required
|
533 |
+
# def extract_clips_for_video(video_id):
|
534 |
+
# """Extract clips and start transcription for a video."""
|
535 |
+
# status = clip_extraction_status.get(video_id, {})
|
536 |
+
# if status.get("percent", 0) < 100:
|
537 |
+
# thread = threading.Thread(target=run_clip_extraction, args=(video_id,))
|
538 |
+
# thread.start()
|
539 |
+
# if video_id not in transcription_progress_status or transcription_progress_status.get(video_id, {}).get("percent", 0) < 100:
|
540 |
+
# thread_trans = threading.Thread(target=run_transcription, args=(video_id,))
|
541 |
+
# thread_trans.start()
|
542 |
+
# return jsonify({"status": "started"})
|
543 |
+
|
544 |
+
# @app.route("/api/clip_progress/<video_id>")
|
545 |
+
# @login_required
|
546 |
+
# def clip_progress(video_id):
|
547 |
+
# """Get clip extraction progress."""
|
548 |
+
# progress = clip_extraction_status.get(video_id, {"current": 0, "total": 0, "percent": 0})
|
549 |
+
# return jsonify(progress)
|
550 |
+
|
551 |
+
# @app.route("/api/transcription_progress/<video_id>")
|
552 |
+
# @login_required
|
553 |
+
# def transcription_progress(video_id):
|
554 |
+
# """Get transcription progress."""
|
555 |
+
# progress = transcription_progress_status.get(video_id, {"status": "not started", "percent": 0})
|
556 |
+
# return jsonify(progress)
|
557 |
+
|
558 |
+
# if __name__ == '__main__':
|
559 |
+
# try:
|
560 |
+
# # Print diagnostic information
|
561 |
+
# print("=" * 50)
|
562 |
+
# print(f"Starting app with configuration:")
|
563 |
+
# print(f"- Running in HF Space: {is_hf_space}")
|
564 |
+
# print(f"- Auth bypass: {bypass_auth}")
|
565 |
+
# print(f"- Port: {os.getenv('PORT', 5000)}")
|
566 |
+
# print(f"- Available videos: {os.listdir(VIDEO_DIR) if os.path.exists(VIDEO_DIR) else 'None'}")
|
567 |
+
# print("=" * 50)
|
568 |
+
|
569 |
+
# port = int(os.getenv('PORT', 5000))
|
570 |
+
# app.run(host='0.0.0.0', port=port, debug=True)
|
571 |
+
# except Exception as e:
|
572 |
+
# print(f"Error starting the application: {e}")
|
573 |
+
# import traceback
|
574 |
+
# traceback.print_exc()
|
575 |
+
|
576 |
+
|
577 |
from flask import Flask, render_template, jsonify, request, send_from_directory, send_file, redirect, url_for, session
|
578 |
import os, json, threading, time, signal, sys
|
579 |
from datetime import datetime
|
580 |
from extract_signed_segments_from_annotations import ClipExtractor, VideoClip
|
581 |
import logging
|
582 |
from dotenv import load_dotenv
|
583 |
+
import boto3
|
584 |
+
from botocore.exceptions import ClientError
|
585 |
+
import tempfile
|
586 |
+
import uuid
|
587 |
+
import requests
|
588 |
+
from urllib.parse import urlparse
|
589 |
|
590 |
# Load environment variables
|
591 |
load_dotenv()
|
|
|
629 |
ALIGNMENTS_DIR = os.path.abspath("data/alignments")
|
630 |
TRANSCRIPTS_DIR = os.path.abspath("data/transcripts")
|
631 |
|
632 |
+
# S3 configuration
|
633 |
+
S3_BUCKET = os.getenv('S3_BUCKET', "sorenson-ai-sb-scratch")
|
634 |
+
S3_VIDEO_PREFIX = os.getenv('S3_VIDEO_PREFIX', "awilkinson/kylie_dataset_videos_for_alignment_webapp/")
|
635 |
+
USE_S3_FOR_VIDEOS = os.getenv('USE_S3_FOR_VIDEOS', 'true').lower() == 'true'
|
636 |
+
|
637 |
# Ensure all required directories exist
|
638 |
for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALIGNMENTS_DIR, TRANSCRIPTS_DIR]:
|
639 |
os.makedirs(directory, exist_ok=True)
|
|
|
642 |
clip_extraction_status = {}
|
643 |
transcription_progress_status = {}
|
644 |
|
645 |
+
# S3 helper functions
|
646 |
+
def get_s3_client():
    """Build and return a boto3 S3 client configured from environment variables."""
    region = os.environ.get('AWS_DEFAULT_REGION', 'us-west-2')
    access_key = os.environ.get('AWS_ACCESS_KEY_ID')
    secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
    return boto3.client(
        's3',
        region_name=region,
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
    )
|
654 |
+
|
655 |
+
def list_s3_videos():
    """List IDs of all .mp4 videos under S3_VIDEO_PREFIX in S3_BUCKET.

    Returns:
        list[str]: video IDs (object filenames without the .mp4 extension).
        An empty list is returned when nothing is found or on S3 errors.
    """
    try:
        s3_client = get_s3_client()
        # Use a paginator: a single list_objects_v2 call returns at most 1000
        # keys, which would silently truncate larger buckets.
        paginator = s3_client.get_paginator('list_objects_v2')
        videos = []
        for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=S3_VIDEO_PREFIX):
            for item in page.get('Contents', []):
                key = item['Key']
                if key.endswith('.mp4'):
                    # Keep just the bare filename without extension as the video ID.
                    filename = os.path.basename(key)
                    videos.append(os.path.splitext(filename)[0])

        if not videos:
            logger.warning(f"No videos found in S3 bucket {S3_BUCKET} with prefix {S3_VIDEO_PREFIX}")
        return videos
    except ClientError as e:
        logger.error(f"Error listing videos from S3: {str(e)}")
        return []
|
682 |
+
|
683 |
+
def download_video_from_s3(video_id):
    """Download a video from S3 into VIDEO_DIR and return its local path.

    The local copy acts as a cache: it is reused only when it exists and is
    non-empty. The download is written to a temporary ".part" file and then
    renamed atomically, so an interrupted download never leaves a partial
    file at the final path that a later call would mistake for a complete
    cached video.

    Args:
        video_id: video identifier (S3 object is f"{video_id}.mp4").

    Returns:
        str | None: local file path on success, None on failure.
    """
    video_filename = f"{video_id}.mp4"
    s3_key = f"{S3_VIDEO_PREFIX}{video_filename}"
    local_path = os.path.join(VIDEO_DIR, video_filename)

    # Reuse the cached copy only if it is non-empty; a zero-byte file is
    # almost certainly the residue of an earlier failed download.
    if os.path.exists(local_path) and os.path.getsize(local_path) > 0:
        logger.info(f"Video {video_id} already exists locally.")
        return local_path

    tmp_path = local_path + ".part"
    try:
        logger.info(f"Downloading video {video_id} from S3...")
        s3_client = get_s3_client()
        s3_client.download_file(S3_BUCKET, s3_key, tmp_path)
        # Atomic rename: the final path only ever holds a complete file.
        os.replace(tmp_path, local_path)
        logger.info(f"Video {video_id} downloaded successfully to {local_path}")
        return local_path
    except ClientError as e:
        logger.error(f"Error downloading video from S3: {str(e)}")
        # Remove any partial download so it cannot be mistaken for a cache hit.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        return None
|
703 |
+
|
704 |
+
def generate_presigned_url(video_id, expiration=3600):
    """Return a presigned GET URL for the video's S3 object, or None on error.

    Args:
        video_id: video identifier (S3 object is f"{video_id}.mp4").
        expiration: URL lifetime in seconds (default one hour).
    """
    video_filename = f"{video_id}.mp4"
    s3_key = f"{S3_VIDEO_PREFIX}{video_filename}"

    try:
        client = get_s3_client()
        return client.generate_presigned_url(
            'get_object',
            Params={'Bucket': S3_BUCKET, 'Key': s3_key},
            ExpiresIn=expiration,
        )
    except ClientError as e:
        logger.error(f"Error generating presigned URL: {str(e)}")
        return None
|
720 |
+
|
721 |
# Graceful shutdown handler
|
722 |
def graceful_shutdown(signum, frame):
|
723 |
"""Handle graceful shutdown on signals."""
|
|
|
779 |
transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
|
780 |
return
|
781 |
|
782 |
+
# Download video from S3 if needed
|
783 |
+
if USE_S3_FOR_VIDEOS:
|
784 |
+
video_path = download_video_from_s3(video_id)
|
785 |
+
if not video_path:
|
786 |
+
transcription_progress_status[video_id] = {
|
787 |
+
"status": "error",
|
788 |
+
"percent": 0,
|
789 |
+
"message": f"Failed to download video {video_id} from S3"
|
790 |
+
}
|
791 |
+
return
|
792 |
+
else:
|
793 |
+
video_path = os.path.join(base_dir, "data", "videos", f"{video_id}.mp4")
|
794 |
+
|
795 |
transcription_progress_status[video_id] = {"status": "started", "percent": 10}
|
796 |
|
797 |
# Check if AWS credentials are available
|
|
|
852 |
return render_template('error.html', message="Authentication failed. No username provided.")
|
853 |
return redirect(url_for('login'))
|
854 |
|
|
|
855 |
@app.route('/health')
|
856 |
def health_check():
|
857 |
"""Health check endpoint for container verification."""
|
|
|
861 |
"DEBUG": os.environ.get('DEBUG', 'Not set'),
|
862 |
"SPACE_ID": os.environ.get('SPACE_ID', 'Not set'),
|
863 |
"BYPASS_AUTH": os.environ.get('BYPASS_AUTH', 'Not set'),
|
864 |
+
"SECRET_KEY": os.environ.get('SECRET_KEY', 'Not set')[:5] + '...' if os.environ.get('SECRET_KEY') else 'Not set',
|
865 |
+
"S3_BUCKET": os.environ.get('S3_BUCKET', 'Not set'),
|
866 |
+
"S3_VIDEO_PREFIX": os.environ.get('S3_VIDEO_PREFIX', 'Not set'),
|
867 |
+
"USE_S3_FOR_VIDEOS": os.environ.get('USE_S3_FOR_VIDEOS', 'Not set')
|
868 |
}
|
869 |
|
870 |
logger.info(f"Health check called. Environment: {env_vars}")
|
|
|
978 |
"app_config": {k: str(v) for k, v in app.config.items() if k in
|
979 |
['SESSION_COOKIE_SECURE', 'SESSION_COOKIE_HTTPONLY',
|
980 |
'SESSION_COOKIE_SAMESITE', 'PERMANENT_SESSION_LIFETIME']},
|
981 |
+
"s3_config": {
|
982 |
+
"S3_BUCKET": S3_BUCKET,
|
983 |
+
"S3_VIDEO_PREFIX": S3_VIDEO_PREFIX,
|
984 |
+
"USE_S3_FOR_VIDEOS": USE_S3_FOR_VIDEOS
|
985 |
+
}
|
986 |
}
|
987 |
return jsonify(info)
|
988 |
|
|
|
997 |
@login_required
|
998 |
def select_video():
    """Render the video-selection page, listing videos from S3 or local disk."""
    if USE_S3_FOR_VIDEOS:
        video_ids = list_s3_videos()
    else:
        if not os.path.exists(VIDEO_DIR):
            return render_template('error.html', message="Video directory not found.")
        # Strip the .mp4 extension so the template works with bare IDs.
        video_ids = [
            os.path.splitext(name)[0]
            for name in os.listdir(VIDEO_DIR)
            if name.endswith('.mp4')
        ]

    return render_template('select_video.html', video_ids=video_ids, user=session.get('user'))
|
1009 |
|
1010 |
@app.route('/player/<video_id>')
|
|
|
1017 |
@login_required
|
1018 |
def get_videos():
    """API endpoint listing the available video files.

    Returns a JSON array of filenames, or a JSON error payload with a 404
    status when no videos (or no video directory) can be found.
    """
    if USE_S3_FOR_VIDEOS:
        s3_ids = list_s3_videos()
        if not s3_ids:
            return jsonify({'error': 'No videos found in S3'}), 404
        # S3 listing yields bare IDs; re-append .mp4 so clients see filenames.
        return jsonify([f"{vid}.mp4" for vid in s3_ids])

    # Local-disk behavior.
    if not os.path.exists(VIDEO_DIR):
        return jsonify({'error': 'Video directory not found'}), 404
    local_files = [name for name in os.listdir(VIDEO_DIR)
                   if name.endswith(('.mp4', '.avi', '.mov'))]
    if not local_files:
        return jsonify({'error': 'No videos found'}), 404
    return jsonify(local_files)
|
1034 |
|
1035 |
@app.route('/video/<path:filename>')
|
1036 |
@login_required
|
1037 |
def serve_video(filename):
    """Serve a video file from S3 or local storage.

    For S3-backed deployments this first tries to redirect the client to a
    presigned URL (so bytes stream straight from S3 instead of through the
    app); if presigning fails it falls back to downloading the object to
    local disk and serving that copy.

    Args:
        filename: Video filename as requested by the client (e.g. "abc.mp4").

    Returns:
        A redirect response, a file response, or a JSON error with 404.
    """
    video_id = os.path.splitext(filename)[0]  # Strip extension to get the ID

    if USE_S3_FOR_VIDEOS:
        # Option 1: Generate a presigned URL and redirect
        presigned_url = generate_presigned_url(video_id)
        if presigned_url:
            return redirect(presigned_url)

        # Option 2 (fallback): Download from S3 to local temporary storage and serve
        local_path = download_video_from_s3(video_id)
        if local_path and os.path.exists(local_path):
            # Serve from wherever the download actually landed instead of
            # assuming it is inside VIDEO_DIR (the original code did, which
            # breaks if the download directory ever differs).
            return send_from_directory(os.path.dirname(local_path),
                                       os.path.basename(local_path))

        return jsonify({'error': 'Video not found in S3'}), 404

    # Original local file behavior
    if not os.path.exists(os.path.join(VIDEO_DIR, filename)):
        return jsonify({'error': 'Video not found'}), 404
    return send_from_directory(VIDEO_DIR, filename)
|
1058 |
|
1059 |
@app.route('/save_annotations', methods=['POST'])
|
1060 |
@login_required
|
|
|
1242 |
@login_required
|
1243 |
def extract_clips_for_video(video_id):
|
1244 |
"""Extract clips and start transcription for a video."""
|
1245 |
+
# If using S3, ensure the video is downloaded first
|
1246 |
+
if USE_S3_FOR_VIDEOS:
|
1247 |
+
video_path = download_video_from_s3(video_id)
|
1248 |
+
if not video_path:
|
1249 |
+
return jsonify({
|
1250 |
+
"status": "error",
|
1251 |
+
"message": f"Failed to download video {video_id} from S3"
|
1252 |
+
}), 404
|
1253 |
+
|
1254 |
status = clip_extraction_status.get(video_id, {})
|
1255 |
if status.get("percent", 0) < 100:
|
1256 |
thread = threading.Thread(target=run_clip_extraction, args=(video_id,))
|
|
|
1282 |
print(f"- Running in HF Space: {is_hf_space}")
|
1283 |
print(f"- Auth bypass: {bypass_auth}")
|
1284 |
print(f"- Port: {os.getenv('PORT', 5000)}")
|
1285 |
+
print(f"- S3 for videos: {USE_S3_FOR_VIDEOS}")
|
1286 |
+
print(f"- S3 bucket: {S3_BUCKET}")
|
1287 |
+
print(f"- S3 prefix: {S3_VIDEO_PREFIX}")
|
1288 |
print(f"- Available videos: {os.listdir(VIDEO_DIR) if os.path.exists(VIDEO_DIR) else 'None'}")
|
1289 |
+
if USE_S3_FOR_VIDEOS:
|
1290 |
+
try:
|
1291 |
+
s3_videos = list_s3_videos()
|
1292 |
+
print(f"- Available S3 videos: {s3_videos if s3_videos else 'None'}")
|
1293 |
+
except Exception as e:
|
1294 |
+
print(f"- Error listing S3 videos: {str(e)}")
|
1295 |
print("=" * 50)
|
1296 |
|
1297 |
port = int(os.getenv('PORT', 5000))
|
get_transcription_with_amazon.py
CHANGED
@@ -8,6 +8,41 @@ import requests
|
|
8 |
import time
|
9 |
from decimal import Decimal
|
10 |
from typing import Any, Dict, List
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
def extract_audio(video_path: str) -> str:
|
13 |
"""Extract audio from video file using ffmpeg.
|
@@ -113,17 +148,21 @@ def main() -> None:
|
|
113 |
)
|
114 |
video_filename = args.video_id + ".mp4" # Source video file (with .mp4)
|
115 |
video_path = os.path.join(base_dir, "data", "videos", video_filename)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
word_timestamps = get_word_timestamps(video_path)
|
117 |
output_dir = os.path.join(base_dir, "data", "word_timestamps")
|
118 |
os.makedirs(output_dir, exist_ok=True)
|
119 |
output_path = os.path.join(output_dir, args.video_id + "_word_timestamps.json")
|
120 |
with open(output_path, "w") as f:
|
121 |
json.dump(word_timestamps, f, indent=4)
|
122 |
-
print(f"Word timestamps saved to: {output_path}")
|
123 |
-
|
124 |
-
if __name__ == "__main__":
|
125 |
-
import argparse
|
126 |
-
parser = argparse.ArgumentParser(description="Get word timestamps for a given video file ID.")
|
127 |
-
parser.add_argument("video_id", help="Video file ID (without extension)")
|
128 |
-
args = parser.parse_args()
|
129 |
-
main()
|
|
|
8 |
import time
|
9 |
from decimal import Decimal
|
10 |
from typing import Any, Dict, List
|
11 |
+
from botocore.exceptions import ClientError
|
12 |
+
|
13 |
+
# S3 configuration for fetching source videos. Overridable via environment
# variables so this script stays consistent with flask_app.py and the
# container config (.hf-space / Dockerfile), which set the same names.
S3_BUCKET = os.environ.get("S3_BUCKET", "sorenson-ai-sb-scratch")
S3_VIDEO_PREFIX = os.environ.get(
    "S3_VIDEO_PREFIX", "awilkinson/kylie_dataset_videos_for_alignment_webapp/"
)
# "true" (default) fetches videos from S3; anything else uses local files only.
USE_S3_FOR_VIDEOS = os.environ.get("USE_S3_FOR_VIDEOS", "true").lower() == "true"
|
16 |
+
|
17 |
+
def get_s3_client():
    """Construct a boto3 S3 client from the AWS environment variables."""
    env = os.environ
    credentials = {
        'region_name': env.get('AWS_DEFAULT_REGION', 'us-west-2'),
        'aws_access_key_id': env.get('AWS_ACCESS_KEY_ID'),
        'aws_secret_access_key': env.get('AWS_SECRET_ACCESS_KEY'),
    }
    return boto3.client('s3', **credentials)
|
25 |
+
|
26 |
+
def download_video_from_s3(video_id, output_dir):
    """Fetch the given video from S3 into output_dir, reusing any local copy.

    Args:
        video_id: Base name of the video (without the .mp4 extension).
        output_dir: Directory where the .mp4 file should be placed.

    Returns:
        The local file path on success, or None if the download failed.
    """
    filename = f"{video_id}.mp4"
    destination = os.path.join(output_dir, filename)

    # Skip the network round-trip when a copy is already on disk.
    if os.path.exists(destination):
        print(f"Video {video_id} already exists locally.")
        return destination

    key = f"{S3_VIDEO_PREFIX}{filename}"
    try:
        print(f"Downloading video {video_id} from S3...")
        get_s3_client().download_file(S3_BUCKET, key, destination)
        print(f"Video {video_id} downloaded successfully to {destination}")
        return destination
    except ClientError as e:
        print(f"Error downloading video from S3: {str(e)}")
        return None
|
46 |
|
47 |
def extract_audio(video_path: str) -> str:
|
48 |
"""Extract audio from video file using ffmpeg.
|
|
|
148 |
)
|
149 |
video_filename = args.video_id + ".mp4" # Source video file (with .mp4)
|
150 |
video_path = os.path.join(base_dir, "data", "videos", video_filename)
|
151 |
+
|
152 |
+
# Check if we need to download from S3
|
153 |
+
if USE_S3_FOR_VIDEOS and not os.path.exists(video_path):
|
154 |
+
videos_dir = os.path.join(base_dir, "data", "videos")
|
155 |
+
os.makedirs(videos_dir, exist_ok=True)
|
156 |
+
download_video_from_s3(args.video_id, videos_dir)
|
157 |
+
|
158 |
+
if not os.path.exists(video_path):
|
159 |
+
print(f"Error: Video file not found: {video_path}")
|
160 |
+
return
|
161 |
+
|
162 |
word_timestamps = get_word_timestamps(video_path)
|
163 |
output_dir = os.path.join(base_dir, "data", "word_timestamps")
|
164 |
os.makedirs(output_dir, exist_ok=True)
|
165 |
output_path = os.path.join(output_dir, args.video_id + "_word_timestamps.json")
|
166 |
with open(output_path, "w") as f:
|
167 |
json.dump(word_timestamps, f, indent=4)
|
168 |
+
print(f"Word timestamps saved to: {output_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
templates/player.html
CHANGED
@@ -137,8 +137,10 @@
|
|
137 |
// Use the provided template video_id if available; it should be the base ID (without .mp4)
|
138 |
const templateVideoId = "{{ video_id|default('') }}";
|
139 |
let currentVideo = "";
|
|
|
140 |
if (templateVideoId) {
|
141 |
currentVideo = templateVideoId;
|
|
|
142 |
} else {
|
143 |
// Fallback: use /videos API and remove the .mp4 extension
|
144 |
fetch('/videos')
|
@@ -150,8 +152,7 @@
|
|
150 |
}
|
151 |
if (videos.length > 0) {
|
152 |
currentVideo = videos[0].replace(/\.mp4$/, "");
|
153 |
-
|
154 |
-
document.getElementById('video').load();
|
155 |
}
|
156 |
})
|
157 |
.catch(error => {
|
@@ -311,6 +312,28 @@
|
|
311 |
div.textContent = timestamps.map(t => t.toFixed(2)).join(', ');
|
312 |
}
|
313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
314 |
video.addEventListener('timeupdate', () => {
|
315 |
currentTimeDisplay.textContent = `Current Time: ${video.currentTime.toFixed(2)}`;
|
316 |
});
|
|
|
137 |
// Use the provided template video_id if available; it should be the base ID (without .mp4)
|
138 |
const templateVideoId = "{{ video_id|default('') }}";
|
139 |
let currentVideo = "";
|
140 |
+
|
141 |
if (templateVideoId) {
|
142 |
currentVideo = templateVideoId;
|
143 |
+
loadVideoSource(currentVideo);
|
144 |
} else {
|
145 |
// Fallback: use /videos API and remove the .mp4 extension
|
146 |
fetch('/videos')
|
|
|
152 |
}
|
153 |
if (videos.length > 0) {
|
154 |
currentVideo = videos[0].replace(/\.mp4$/, "");
|
155 |
+
loadVideoSource(currentVideo);
|
|
|
156 |
}
|
157 |
})
|
158 |
.catch(error => {
|
|
|
312 |
div.textContent = timestamps.map(t => t.toFixed(2)).join(', ');
|
313 |
}
|
314 |
|
315 |
+
function loadVideoSource(videoId) {
    // Resolve the playable URL for this video and point the player at it.
    const videoUrl = `/video/${videoId}.mp4`;
    const sourceEl = document.getElementById('video-source');
    const playerEl = document.getElementById('video');

    fetch(videoUrl)
        .then(response => {
            if (!response.redirected && !response.ok) {
                throw new Error('Video not found');
            }
            // A redirected response means the app bounced us to a presigned
            // S3 URL; otherwise the app served the file directly, so the
            // original app-relative path is the right source.
            sourceEl.src = response.redirected ? response.url : videoUrl;
            playerEl.load();
        })
        .catch(error => {
            document.getElementById('error-message').textContent = 'Error loading video: ' + error;
        });
}
|
336 |
+
|
337 |
video.addEventListener('timeupdate', () => {
|
338 |
currentTimeDisplay.textContent = `Current Time: ${video.currentTime.toFixed(2)}`;
|
339 |
});
|