Bug fixes
Browse files- .hf-space +4 -1
- Dockerfile +3 -1
- docker-compose.yml +3 -0
- extract_signed_segments_from_annotations.py +37 -214
- flask_app.py +745 -17
- get_transcription_with_amazon.py +47 -8
- templates/player.html +25 -2
@@ -7,8 +7,11 @@ config:
7 |
- PORT=7860
8 |
- SPACE_ID=true
9 |
10 |
11 |
cpu: 1
12 |
memory: 1
13 |
gpu: null
14 |
restarts: true
7 |
- PORT=7860
8 |
- SPACE_ID=true
9 |
10 |
- S3_BUCKET=sorenson-ai-sb-scratch
11 |
- S3_VIDEO_PREFIX=awilkinson/kylie_dataset_videos_for_alignment_webapp/
12 |
13 |
14 |
cpu: 1
15 |
memory: 1
16 |
gpu: null
17 |
restarts: true
@@ -16,10 +16,12 @@ COPY . .
16 |
17 |
18 |
19 |
# Add explicit environment variable to enable authentication bypass for troubleshooting
20 |
21 |
ENV SECRET_KEY="f7290fc27f11dbf14be6cd348638ad62"
22 |
23 |
24 |
# Create necessary directories
25 |
RUN mkdir -p data/videos data/annotations data/temp data/word_timestamps data/alignments data/transcripts
16 |
17 |
18 |
19 |
20 |
ENV SECRET_KEY="f7290fc27f11dbf14be6cd348638ad62"
21 |
22 |
ENV S3_BUCKET="sorenson-ai-sb-scratch"
23 |
ENV S3_VIDEO_PREFIX="awilkinson/kylie_dataset_videos_for_alignment_webapp/"
24 |
25 |
26 |
# Create necessary directories
27 |
RUN mkdir -p data/videos data/annotations data/temp data/word_timestamps data/alignments data/transcripts
@@ -13,5 +13,8 @@ services:
13 |
14 |
15 |
16 |
17 |
- ./data:/app/data
13 |
14 |
15 |
16 |
- S3_BUCKET=sorenson-ai-sb-scratch
17 |
- S3_VIDEO_PREFIX=awilkinson/kylie_dataset_videos_for_alignment_webapp/
18 |
19 |
20 |
- ./data:/app/data
@@ -1,212 +1,3 @@
1 |
# #!/usr/bin/env python3
2 |
3 |
# import json
4 |
# import os
5 |
# import pathlib
6 |
# import subprocess
7 |
# from dataclasses import dataclass
8 |
# from decimal import Decimal
9 |
# from typing import List, Optional, Tuple
10 |
# import argparse
11 |
12 |
# try:
13 |
# from tqdm import tqdm
14 |
# except ImportError:
15 |
# def tqdm(iterable, **kwargs):
16 |
# return iterable
17 |
# def write(msg):
18 |
# print(msg)
19 |
20 |
# @dataclass
21 |
# class VideoClip:
22 |
# """Represents a video clip with timing information."""
23 |
# start_time: float
24 |
# end_time: float
25 |
# clip_path: str
26 |
# original_video: str
27 |
# index: int
28 |
29 |
# @property
30 |
# def duration(self) -> float:
31 |
# return self.end_time - self.start_time
32 |
33 |
# class ClipExtractor:
34 |
# """Handles extraction of video clips based on annotation timestamps."""
35 |
36 |
# def __init__(self, base_dir: str) -> None:
37 |
# """Initialize with project base directory."""
38 |
# self.base_dir = base_dir
39 |
# self.temp_dir = os.path.join(base_dir, "data", "temp")
40 |
# self.videos_dir = os.path.join(base_dir, "data", "videos")
41 |
# self.annotations_dir = os.path.join(base_dir, "data", "annotations")
42 |
# self.metadata_dir = os.path.join(base_dir, "data", "segment_metadata")
43 |
# os.makedirs(self.temp_dir, exist_ok=True)
44 |
# os.makedirs(self.metadata_dir, exist_ok=True)
45 |
46 |
# def validate_timestamps(self, timestamps: List[float]) -> Tuple[bool, Optional[str]]:
47 |
# if not timestamps:
48 |
# return False, "No timestamps found in annotation file."
49 |
# for i in range(len(timestamps) - 1):
50 |
# if timestamps[i] >= timestamps[i + 1]:
51 |
# return False, (f"Invalid timestamp order: {str(round(Decimal(timestamps[i]), 3))} seconds "
52 |
# f"followed by {str(round(Decimal(timestamps[i + 1]), 3))} seconds")
53 |
# return True, None
54 |
55 |
# def get_video_duration(self, video_path: str) -> float:
56 |
# try:
57 |
# cmd = [
58 |
# "ffprobe",
59 |
# "-v", "error",
60 |
# "-show_entries", "format=duration",
61 |
# "-of", "default=noprint_wrappers=1:nokey=1",
62 |
# video_path
63 |
# ]
64 |
# output = subprocess.check_output(cmd).decode().strip()
65 |
# return float(output)
66 |
# except subprocess.CalledProcessError as e:
67 |
# raise RuntimeError(f"Failed to get video duration. Error: {str(e)}")
68 |
69 |
# def extract_clip(self, video_path: str, start_time: float, end_time: float, output_path: str) -> bool:
70 |
# try:
71 |
# cmd = [
72 |
# "ffmpeg",
73 |
# "-i", video_path,
74 |
# "-ss", str(start_time),
75 |
# "-t", str(end_time - start_time),
76 |
# "-c:v", "libx264",
77 |
# "-c:a", "aac",
78 |
# "-y",
79 |
# output_path
80 |
# ]
81 |
# subprocess.run(cmd, check=True, capture_output=True)
82 |
# if not os.path.exists(output_path):
83 |
# print(f"Warning: ffmpeg completed but file not found: {output_path}")
84 |
# return False
85 |
# file_size = os.path.getsize(output_path)
86 |
# print(f"Created clip: {output_path} ({file_size} bytes)")
87 |
# return True
88 |
# except subprocess.CalledProcessError as e:
89 |
# print(f"Error extracting clip. Details: {str(e)}")
90 |
# return False
91 |
92 |
# def extract_clips_from_annotations(self, video_id: str, progress_callback=None) -> List[VideoClip]:
93 |
# # Determine paths
94 |
# video_path = os.path.join(self.videos_dir, f"{video_id}.mp4")
95 |
# annotation_path = os.path.join(self.annotations_dir, f"{video_id}_annotations.json")
96 |
# if not os.path.exists(video_path):
97 |
# raise FileNotFoundError(f"Video file not found: {video_path}")
98 |
# if not os.path.exists(annotation_path):
99 |
# raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
100 |
101 |
# with open(annotation_path, "r") as f:
102 |
# annotations = json.load(f)
103 |
# timestamps = sorted(annotations["timestamps"])
104 |
# is_valid, error_message = self.validate_timestamps(timestamps)
105 |
# if not is_valid:
106 |
# raise ValueError(f"Invalid timestamps in annotation file. {error_message}")
107 |
# video_duration = self.get_video_duration(video_path)
108 |
# if timestamps[-1] > video_duration:
109 |
# raise ValueError(
110 |
# f"Final timestamp ({str(round(Decimal(timestamps[-1]), 3))} seconds) " +
111 |
# f"exceeds video duration ({str(round(Decimal(video_duration), 3))} seconds)"
112 |
# )
113 |
114 |
# # Create segments using only consecutive pairs from the annotated boundaries.
115 |
# segments = [(timestamps[i], timestamps[i+1]) for i in range(len(timestamps)-1)]
116 |
# total_clips = len(segments)
117 |
118 |
# # Check metadata to see if segmentation is up-to-date.
119 |
# metadata_file = os.path.join(self.metadata_dir, f"{video_id}_metadata.json")
120 |
# use_cached = False
121 |
# if os.path.exists(metadata_file):
122 |
# with open(metadata_file, "r") as meta_f:
123 |
# try:
124 |
# meta_data = json.load(meta_f)
125 |
# if meta_data.get("segments") == segments:
126 |
# use_cached = True
127 |
# except Exception as ex:
128 |
# use_cached = False
129 |
130 |
# if use_cached:
131 |
# clips = []
132 |
# for i, (start, end) in enumerate(segments):
133 |
# clip_path = os.path.join(self.temp_dir, f"{video_id}_clip_{i:03d}.mp4")
134 |
# if not os.path.exists(clip_path):
135 |
# use_cached = False
136 |
# break
137 |
# clips.append(VideoClip(start, end, clip_path, video_id, i))
138 |
# if use_cached:
139 |
# if progress_callback:
140 |
# progress_callback(total_clips, total_clips)
141 |
# else:
142 |
# print("Using cached segmentation as boundaries haven't changed.")
143 |
# return clips
144 |
145 |
# # If metadata is missing, boundaries have changed, or some clip file is missing, re-run segmentation.
146 |
# clips = []
147 |
# current_clip = 0
148 |
# use_tqdm = progress_callback is None
149 |
# if use_tqdm:
150 |
# pbar = tqdm(total=total_clips, desc="Extracting clips")
151 |
# for segment in segments:
152 |
# start, end = segment
153 |
# clip_path = os.path.join(self.temp_dir, f"{video_id}_clip_{current_clip:03d}.mp4")
154 |
# if self.extract_clip(video_path, start, end, clip_path):
155 |
# clips.append(VideoClip(start, end, clip_path, video_id, current_clip))
156 |
# current_clip += 1
157 |
# if progress_callback:
158 |
# progress_callback(current_clip, total_clips)
159 |
# elif use_tqdm:
160 |
# pbar.update(1)
161 |
# if use_tqdm:
162 |
# pbar.close()
163 |
# # Save segmentation metadata for future use.
164 |
# meta_data = {"segments": segments}
165 |
# with open(metadata_file, "w") as meta_f:
166 |
# json.dump(meta_data, meta_f, indent=4)
167 |
# return clips
168 |
169 |
# def cleanup_clips(self, clips: List[VideoClip]) -> None:
170 |
# if not clips:
171 |
# return
172 |
# print("\nCleaning up temporary files...")
173 |
# for clip in clips:
174 |
# if os.path.exists(clip.clip_path):
175 |
# try:
176 |
# print(f"Removing: {clip.clip_path}")
177 |
# os.remove(clip.clip_path)
178 |
# except OSError as e:
179 |
# print(f"Warning: Failed to remove clip {clip.clip_path}. Error: {str(e)}")
180 |
# else:
181 |
# print(f"Warning: File not found for cleanup: {clip.clip_path}")
182 |
183 |
# def main() -> None:
184 |
# parser = argparse.ArgumentParser(description="Extract video clips based on annotations for a given video file ID.")
185 |
# parser.add_argument("video_id", help="Video file ID (without extension)")
186 |
# args = parser.parse_args()
187 |
188 |
# base_dir = os.path.join(
189 |
# str(pathlib.Path.home()),
190 |
# "andrew_messaround",
191 |
# "vsl_speech_to_signing_alignment",
192 |
# "boundary_annotation_webapp"
193 |
# )
194 |
# extractor = ClipExtractor(base_dir)
195 |
# try:
196 |
# clips = extractor.extract_clips_from_annotations(args.video_id)
197 |
# print(f"\nSuccessfully extracted {len(clips)} clips:")
198 |
# for clip in clips:
199 |
# print(f"Clip {clip.index}: {round(clip.start_time, 2)}s → {round(clip.end_time, 2)}s")
200 |
# print(f"Duration: {round(clip.duration, 2)}s")
201 |
# print(f"Path: {clip.clip_path}\n")
202 |
# except Exception as e:
203 |
# print(f"Error: {str(e)}")
204 |
205 |
# if __name__ == "__main__":
206 |
# main()
207 |
208 |
#!/usr/bin/env python3
209 |
210 |
import json
211 |
import os
212 |
import pathlib
@@ -215,6 +6,8 @@ from dataclasses import dataclass
215 |
from decimal import Decimal
216 |
from typing import List, Optional, Tuple
217 |
import argparse
218 |
219 |
220 |
from tqdm import tqdm
@@ -226,6 +19,20 @@ except ImportError:
226 |
227 |
from concurrent.futures import ThreadPoolExecutor, as_completed
228 |
229 |
230 |
class VideoClip:
231 |
"""Represents a video clip with timing information."""
@@ -300,15 +107,31 @@ class ClipExtractor:
300 |
print(f"Error extracting clip. Details: {str(e)}")
301 |
return False
302 |
303 |
def extract_clips_from_annotations(self, video_id
304 |
# Determine paths
305 |
306 |
annotation_path = os.path.join(self.annotations_dir, f"{video_id}_annotations.json")
307 |
308 |
raise FileNotFoundError(f"Video file not found: {video_path}")
309 |
if not os.path.exists(annotation_path):
310 |
raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
311 |
312 |
with open(annotation_path, "r") as f:
313 |
annotations = json.load(f)
314 |
timestamps = sorted(annotations["timestamps"])
1 |
import json
2 |
import os
3 |
import pathlib
6 |
from decimal import Decimal
7 |
from typing import List, Optional, Tuple
8 |
import argparse
9 |
import boto3
10 |
from botocore.exceptions import ClientError
11 |
12 |
13 |
from tqdm import tqdm
19 |
20 |
from concurrent.futures import ThreadPoolExecutor, as_completed
21 |
22 |
# S3 location of the source videos, plus the switch that enables fetching them from S3
23 |
S3_BUCKET = "sorenson-ai-sb-scratch"
24 |
S3_VIDEO_PREFIX = "awilkinson/kylie_dataset_videos_for_alignment_webapp/"
25 |
USE_S3_FOR_VIDEOS = True # Set to True to use S3, False to use local files
26 |
27 |
def get_s3_client():
28 |
"""Get a boto3 S3 client."""
29 |
return boto3.client(
30 |
31 |
region_name=os.environ.get('AWS_DEFAULT_REGION', 'us-west-2'),
32 |
33 |
34 |
35 |
36 |
37 |
class VideoClip:
38 |
"""Represents a video clip with timing information."""
107 |
print(f"Error extracting clip. Details: {str(e)}")
108 |
return False
109 |
110 |
def extract_clips_from_annotations(self, video_id, progress_callback=None) -> List[VideoClip]:
111 |
"""Extract clips based on annotation timestamps, handling S3 videos."""
112 |
# Determine paths
113 |
video_filename = f"{video_id}.mp4"
114 |
video_path = os.path.join(self.videos_dir, video_filename)
115 |
annotation_path = os.path.join(self.annotations_dir, f"{video_id}_annotations.json")
116 |
117 |
if not os.path.exists(annotation_path):
118 |
raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
119 |
120 |
# Check if we need to download the video from S3
121 |
if USE_S3_FOR_VIDEOS and not os.path.exists(video_path):
122 |
print(f"Video not found locally. Downloading from S3: {video_id}")
123 |
s3_client = get_s3_client()
124 |
125 |
s3_key = f"{S3_VIDEO_PREFIX}{video_filename}"
126 |
s3_client.download_file(S3_BUCKET, s3_key, video_path)
127 |
print(f"Video downloaded successfully: {video_path}")
128 |
except ClientError as e:
129 |
raise FileNotFoundError(f"Video file not found in S3: {s3_key}. Error: {str(e)}")
130 |
131 |
if not os.path.exists(video_path):
132 |
raise FileNotFoundError(f"Video file not found: {video_path}")
133 |
134 |
# Now continue with the original extraction process
135 |
with open(annotation_path, "r") as f:
136 |
annotations = json.load(f)
137 |
timestamps = sorted(annotations["timestamps"])
@@ -1,9 +1,591 @@
1 |
from flask import Flask, render_template, jsonify, request, send_from_directory, send_file, redirect, url_for, session
2 |
import os, json, threading, time, signal, sys
3 |
from datetime import datetime
4 |
from extract_signed_segments_from_annotations import ClipExtractor, VideoClip
5 |
import logging
6 |
from dotenv import load_dotenv
7 |
8 |
# Load environment variables
9 |
@@ -47,6 +629,11 @@ WORD_TIMESTAMPS_DIR = os.path.abspath("data/word_timestamps")
47 |
ALIGNMENTS_DIR = os.path.abspath("data/alignments")
48 |
TRANSCRIPTS_DIR = os.path.abspath("data/transcripts")
49 |
50 |
# Ensure all required directories exist
51 |
52 |
os.makedirs(directory, exist_ok=True)
@@ -55,6 +642,82 @@ for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALI
55 |
clip_extraction_status = {}
56 |
transcription_progress_status = {}
57 |
58 |
# Graceful shutdown handler
59 |
def graceful_shutdown(signum, frame):
60 |
"""Handle graceful shutdown on signals."""
@@ -116,7 +779,19 @@ def run_transcription(video_id):
116 |
transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
117 |
118 |
119 |
120 |
transcription_progress_status[video_id] = {"status": "started", "percent": 10}
121 |
122 |
# Check if AWS credentials are available
@@ -177,7 +852,6 @@ def auth_callback():
177 |
return render_template('error.html', message="Authentication failed. No username provided.")
178 |
return redirect(url_for('login'))
179 |
180 |
# Health check route (container verification and environment diagnostics)
181 |
182 |
def health_check():
183 |
"""Health check endpoint for container verification."""
@@ -187,7 +861,10 @@ def health_check():
187 |
"DEBUG": os.environ.get('DEBUG', 'Not set'),
188 |
"SPACE_ID": os.environ.get('SPACE_ID', 'Not set'),
189 |
"BYPASS_AUTH": os.environ.get('BYPASS_AUTH', 'Not set'),
190 |
"SECRET_KEY": os.environ.get('SECRET_KEY', 'Not set')[:5] + '...' if os.environ.get('SECRET_KEY') else 'Not set'
191 |
192 |
193 |
logger.info(f"Health check called. Environment: {env_vars}")
@@ -301,6 +978,11 @@ def debug_info():
301 |
"app_config": {k: str(v) for k, v in app.config.items() if k in
302 |
303 |
304 |
305 |
return jsonify(info)
306 |
@@ -315,10 +997,14 @@ def index():
315 |
316 |
def select_video():
317 |
"""Page to select a video for annotation."""
318 |
319 |
320 |
321 |
322 |
return render_template('select_video.html', video_ids=video_ids, user=session.get('user'))
323 |
324 |
@@ -331,20 +1017,44 @@ def player(video_id):
331 |
332 |
def get_videos():
333 |
"""API endpoint to get available videos."""
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
def serve_video(filename):
344 |
"""Serve a video file."""
345 |
346 |
347 |
348 |
349 |
@app.route('/save_annotations', methods=['POST'])
350 |
@@ -532,6 +1242,15 @@ def save_alignments():
532 |
533 |
def extract_clips_for_video(video_id):
534 |
"""Extract clips and start transcription for a video."""
535 |
status = clip_extraction_status.get(video_id, {})
536 |
if status.get("percent", 0) < 100:
537 |
thread = threading.Thread(target=run_clip_extraction, args=(video_id,))
@@ -563,7 +1282,16 @@ if __name__ == '__main__':
563 |
print(f"- Running in HF Space: {is_hf_space}")
564 |
print(f"- Auth bypass: {bypass_auth}")
565 |
print(f"- Port: {os.getenv('PORT', 5000)}")
566 |
print(f"- Available videos: {os.listdir(VIDEO_DIR) if os.path.exists(VIDEO_DIR) else 'None'}")
567 |
print("=" * 50)
568 |
569 |
port = int(os.getenv('PORT', 5000))
1 |
# from flask import Flask, render_template, jsonify, request, send_from_directory, send_file, redirect, url_for, session
2 |
# import os, json, threading, time, signal, sys
3 |
# from datetime import datetime
4 |
# from extract_signed_segments_from_annotations import ClipExtractor, VideoClip
5 |
# import logging
6 |
# from dotenv import load_dotenv
7 |
8 |
# # Load environment variables
9 |
# load_dotenv()
10 |
11 |
# # Auth bypass flag, read from the BYPASS_AUTH environment variable
12 |
# bypass_auth = os.getenv('BYPASS_AUTH', 'false').lower() == 'true'
13 |
14 |
# # Configure logging first
15 |
# logging.basicConfig(
16 |
# level=logging.INFO,
17 |
# format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
18 |
# )
19 |
# logger = logging.getLogger(__name__)
20 |
21 |
# # Hugging Face specific configuration
22 |
# is_hf_space = os.getenv('SPACE_ID') is not None
23 |
# if is_hf_space:
24 |
# logger.info("Running in Hugging Face Spaces environment")
25 |
# # Allow insecure transport for development in HF
26 |
27 |
# # Ensure port is set correctly
28 |
# os.environ['PORT'] = '7860'
29 |
30 |
31 |
32 |
# app = Flask(__name__)
33 |
# app.secret_key = os.getenv('SECRET_KEY', 'dev_key_for_testing')
34 |
35 |
# # Configure session for HF
36 |
# if is_hf_space:
37 |
# app.config['SESSION_COOKIE_SECURE'] = False
38 |
# app.config['SESSION_COOKIE_HTTPONLY'] = True
39 |
# app.config['SESSION_COOKIE_SAMESITE'] = None # Add this line
40 |
# app.config['PERMANENT_SESSION_LIFETIME'] = 86400 # 24 hours
41 |
42 |
# # Directory paths
43 |
# VIDEO_DIR = os.path.abspath("data/videos")
44 |
# ANNOTATIONS_DIR = os.path.abspath("data/annotations")
45 |
# TEMP_DIR = os.path.abspath("data/temp")
46 |
# WORD_TIMESTAMPS_DIR = os.path.abspath("data/word_timestamps")
47 |
# ALIGNMENTS_DIR = os.path.abspath("data/alignments")
48 |
# TRANSCRIPTS_DIR = os.path.abspath("data/transcripts")
49 |
50 |
# # Ensure all required directories exist
51 |
52 |
# os.makedirs(directory, exist_ok=True)
53 |
54 |
# # Global dictionaries for progress tracking
55 |
# clip_extraction_status = {}
56 |
# transcription_progress_status = {}
57 |
58 |
# # Graceful shutdown handler
59 |
# def graceful_shutdown(signum, frame):
60 |
# """Handle graceful shutdown on signals."""
61 |
# logger.info(f"Received signal {signum}, shutting down gracefully...")
62 |
# # Clean up as needed here
63 |
# sys.exit(0)
64 |
65 |
# # Register signal handlers
66 |
# signal.signal(signal.SIGTERM, graceful_shutdown)
67 |
# signal.signal(signal.SIGINT, graceful_shutdown)
68 |
69 |
# # Login required decorator
70 |
# def login_required(f):
71 |
# from functools import wraps
72 |
# @wraps(f)
73 |
# def decorated_function(*args, **kwargs):
74 |
# if 'user' not in session:
75 |
# logger.info(f"User not in session, redirecting to login")
76 |
# return redirect(url_for('login'))
77 |
# return f(*args, **kwargs)
78 |
# return decorated_function
79 |
80 |
# # Allow specific users (for testing)
81 |
# def is_allowed_user(username):
82 |
# allowed_users_env = os.getenv('ALLOWED_USERS', 'Perilon') # Default to your username
83 |
# allowed_users = [user.strip() for user in allowed_users_env.split(',')]
84 |
# return username in allowed_users or not is_hf_space # Allow all users in local dev
85 |
86 |
# def update_extraction_progress(video_id, current, total):
87 |
# percent = int((current / total) * 100)
88 |
# clip_extraction_status[video_id] = {"current": current, "total": total, "percent": percent}
89 |
90 |
# def run_clip_extraction(video_id):
91 |
# try:
92 |
# base_dir = app.root_path
93 |
# extractor = ClipExtractor(base_dir)
94 |
# extractor.extract_clips_from_annotations(
95 |
# video_id,
96 |
# progress_callback=lambda current, total: update_extraction_progress(video_id, current, total)
97 |
# )
98 |
# if video_id in clip_extraction_status:
99 |
# status = clip_extraction_status[video_id]
100 |
# if status.get("percent", 0) < 100:
101 |
# update_extraction_progress(video_id, status["total"], status["total"])
102 |
# else:
103 |
# update_extraction_progress(video_id, 1, 1)
104 |
# except Exception as e:
105 |
# logger.error(f"Error during clip extraction for {video_id}: {str(e)}")
106 |
# clip_extraction_status[video_id] = {"error": str(e)}
107 |
108 |
# def run_transcription(video_id):
109 |
# try:
110 |
# base_dir = app.root_path
111 |
# output_path = os.path.join(WORD_TIMESTAMPS_DIR, f"{video_id}_word_timestamps.json")
112 |
113 |
# # Check if transcription already exists and is valid.
114 |
# if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
115 |
# logger.info(f"Using cached transcription for video {video_id}.")
116 |
# transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
117 |
# return
118 |
119 |
# video_path = os.path.join(base_dir, "data", "videos", f"{video_id}.mp4")
120 |
# transcription_progress_status[video_id] = {"status": "started", "percent": 10}
121 |
122 |
# # Check if AWS credentials are available
123 |
# if not os.environ.get('AWS_ACCESS_KEY_ID') or not os.environ.get('AWS_SECRET_ACCESS_KEY'):
124 |
# logger.warning("AWS credentials not found. Transcription will not work properly.")
125 |
# transcription_progress_status[video_id] = {
126 |
# "status": "error",
127 |
# "percent": 0,
128 |
# "message": "AWS credentials missing"
129 |
# }
130 |
# return
131 |
132 |
# # Run transcription via the imported function from get_transcription_with_amazon.py
133 |
# from get_transcription_with_amazon import get_word_timestamps
134 |
# word_timestamps = get_word_timestamps(video_path)
135 |
136 |
# with open(output_path, "w") as f:
137 |
# json.dump(word_timestamps, f, indent=4)
138 |
139 |
# transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
140 |
# except Exception as e:
141 |
# logger.error(f"Error during transcription for {video_id}: {str(e)}")
142 |
# transcription_progress_status[video_id] = {"status": "error", "percent": 0, "message": str(e)}
143 |
144 |
# # Authentication routes
145 |
# @app.route('/login')
146 |
# def login():
147 |
# """Handle login for both local and HF environments."""
148 |
# logger.info(f"Login route called. Headers: {dict(request.headers)}")
149 |
150 |
# if is_hf_space:
151 |
# username = request.headers.get('X-Spaces-Username')
152 |
# logger.info(f"Username from headers in login: {username}")
153 |
154 |
# if username and is_allowed_user(username):
155 |
# session['user'] = {'name': username, 'is_hf': True}
156 |
# return redirect(url_for('index'))
157 |
# else:
158 |
# # Redirect to the HF auth endpoint
159 |
# return redirect('/auth')
160 |
# else:
161 |
# # For local development
162 |
# session['user'] = {'name': 'LocalDeveloper', 'is_mock': True}
163 |
# return redirect(url_for('index'))
164 |
165 |
# @app.route('/auth/callback')
166 |
# def auth_callback():
167 |
# """This route will be called by Hugging Face after successful authentication."""
168 |
# logger.info(f"Auth callback called. Headers: {dict(request.headers)}")
169 |
170 |
# if is_hf_space:
171 |
# # In Hugging Face Spaces, the user info is available in the request headers
172 |
# username = request.headers.get('X-Spaces-Username')
173 |
# if username:
174 |
# session['user'] = {'name': username, 'is_hf': True}
175 |
# return redirect(url_for('index'))
176 |
# else:
177 |
# return render_template('error.html', message="Authentication failed. No username provided.")
178 |
# return redirect(url_for('login'))
179 |
180 |
# # Health check route (container verification and environment diagnostics)
181 |
# @app.route('/health')
182 |
# def health_check():
183 |
# """Health check endpoint for container verification."""
184 |
# # Log environment variables for debugging
185 |
# env_vars = {
186 |
# "FLASK_ENV": os.environ.get('FLASK_ENV', 'production'),
187 |
# "DEBUG": os.environ.get('DEBUG', 'Not set'),
188 |
# "SPACE_ID": os.environ.get('SPACE_ID', 'Not set'),
189 |
# "BYPASS_AUTH": os.environ.get('BYPASS_AUTH', 'Not set'),
190 |
# "SECRET_KEY": os.environ.get('SECRET_KEY', 'Not set')[:5] + '...' if os.environ.get('SECRET_KEY') else 'Not set'
191 |
# }
192 |
193 |
# logger.info(f"Health check called. Environment: {env_vars}")
194 |
195 |
# # Get session information for debugging
196 |
# session_info = dict(session) if session else None
197 |
# session_keys = list(session.keys()) if session else []
198 |
199 |
# return jsonify({
200 |
# "status": "healthy",
201 |
# "environment": env_vars,
202 |
# "session_keys": session_keys,
203 |
# "is_hf_space": is_hf_space,
204 |
# "bypass_auth": bypass_auth,
205 |
# "directories": {
206 |
# "videos": os.path.exists(VIDEO_DIR),
207 |
# "annotations": os.path.exists(ANNOTATIONS_DIR),
208 |
# "temp": os.path.exists(TEMP_DIR)
209 |
# }
210 |
# })
211 |
212 |
# @app.route('/auth')
213 |
# def auth():
214 |
# """This route handles HF authentication."""
215 |
# logger.info(f"Auth route called. Headers: {dict(request.headers)}")
216 |
217 |
# # Force bypass auth to be true for debugging
218 |
# bypass_auth = True
219 |
220 |
# # If bypass is enabled, authenticate immediately
221 |
# if bypass_auth:
222 |
# logger.info("Auth bypass enabled, setting default user")
223 |
# session['user'] = {'name': 'Perilon', 'is_hf': True}
224 |
# return redirect(url_for('index'))
225 |
226 |
# # Normal authentication logic
227 |
# username = request.headers.get('X-Spaces-Username')
228 |
# logger.info(f"Username from headers in auth: {username}")
229 |
230 |
# if is_hf_space and username and is_allowed_user(username):
231 |
# logger.info(f"Setting user in session: {username}")
232 |
# session['user'] = {'name': username, 'is_hf': True}
233 |
# return redirect(url_for('index'))
234 |
# elif not is_hf_space:
235 |
# # For local development
236 |
# session['user'] = {'name': 'LocalDeveloper', 'is_mock': True}
237 |
# return redirect(url_for('index'))
238 |
# else:
239 |
# # For HF with no valid username yet
240 |
# return render_template('error.html', message=
241 |
# "Waiting for Hugging Face authentication. If you continue to see this message, "
242 |
# "please make sure you're logged into Hugging Face and your username is allowed.")
243 |
244 |
# @app.before_request
245 |
# def check_auth():
246 |
# """Check authentication before processing requests."""
247 |
# # Skip authentication for certain routes and static files
248 |
# if request.path in ['/login', '/logout', '/auth', '/auth/callback', '/debug', '/health'] or request.path.startswith('/static/'):
249 |
# return
250 |
251 |
# # Force bypass auth to be true for debugging
252 |
# bypass_auth = True
253 |
254 |
# # Log all request paths to help troubleshoot
255 |
# logger.debug(f"Request path: {request.path}, User in session: {'user' in session}")
256 |
257 |
# if bypass_auth:
258 |
# # Set default user for bypass mode if not already set
259 |
# if 'user' not in session:
260 |
# session['user'] = {'name': 'Perilon', 'is_hf': True}
261 |
# return
262 |
263 |
# if is_hf_space:
264 |
# # Check for HF username header
265 |
# username = request.headers.get('X-Spaces-Username')
266 |
267 |
# if 'user' in session:
268 |
# logger.debug(f"User in session: {session['user']}")
269 |
# return
270 |
271 |
# if username and is_allowed_user(username):
272 |
# logger.info(f"Setting user from headers: {username}")
273 |
# session['user'] = {'name': username, 'is_hf': True}
274 |
# return
275 |
276 |
# # No valid user in session or headers
277 |
# logger.info(f"No authenticated user, redirecting to /auth")
278 |
# return redirect('/auth')
279 |
# elif 'user' not in session:
280 |
# return redirect(url_for('login'))
281 |
282 |
# @app.route('/logout')
283 |
# def logout():
284 |
# """Clear session and redirect to login."""
285 |
# session.clear() # Clear the entire session
286 |
# if is_hf_space:
287 |
# return redirect('/auth/logout')
288 |
# return redirect(url_for('login'))
289 |
290 |
# @app.route('/debug')
291 |
# def debug_info():
292 |
# """Return debug information."""
293 |
# cookies = {key: request.cookies.get(key) for key in request.cookies.keys()}
294 |
295 |
# info = {
296 |
# "session": dict(session) if session else None,
297 |
# "headers": dict(request.headers),
298 |
# "cookies": cookies,
299 |
# "is_hf_space": is_hf_space,
300 |
# "allowed_users": os.getenv('ALLOWED_USERS', 'Perilon'),
301 |
# "app_config": {k: str(v) for k, v in app.config.items() if k in
302 |
303 |
304 |
# }
305 |
# return jsonify(info)
306 |
307 |
# # Main application routes
308 |
# @app.route('/')
309 |
# @login_required
310 |
# def index():
311 |
# """Main entry point, redirects to video selection."""
312 |
# return redirect(url_for('select_video'))
313 |
314 |
# @app.route('/select_video')
315 |
# @login_required
316 |
# def select_video():
317 |
# """Page to select a video for annotation."""
318 |
# if not os.path.exists(VIDEO_DIR):
319 |
# return render_template('error.html', message="Video directory not found.")
320 |
# videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith('.mp4')]
321 |
# video_ids = [os.path.splitext(v)[0] for v in videos]
322 |
# return render_template('select_video.html', video_ids=video_ids, user=session.get('user'))
323 |
324 |
# @app.route('/player/<video_id>')
325 |
# @login_required
326 |
# def player(video_id):
327 |
# """Video player page for annotation."""
328 |
# return render_template('player.html', video_id=video_id, user=session.get('user'))
329 |
330 |
# @app.route('/videos')
331 |
# @login_required
332 |
# def get_videos():
333 |
# """API endpoint to get available videos."""
334 |
# if not os.path.exists(VIDEO_DIR):
335 |
# return jsonify({'error': 'Video directory not found'}), 404
336 |
# videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith(('.mp4', '.avi', '.mov'))]
337 |
# if not videos:
338 |
# return jsonify({'error': 'No videos found'}), 404
339 |
# return jsonify(videos)
340 |
341 |
# @app.route('/video/<path:filename>')
342 |
# @login_required
343 |
# def serve_video(filename):
344 |
# """Serve a video file."""
345 |
# if not os.path.exists(os.path.join(VIDEO_DIR, filename)):
346 |
# return jsonify({'error': 'Video not found'}), 404
347 |
# return send_from_directory(VIDEO_DIR, filename)
348 |
349 |
# @app.route('/save_annotations', methods=['POST'])
350 |
# @login_required
351 |
# def save_annotations():
352 |
# """Save annotation data."""
353 |
# data = request.json
354 |
# if not data or 'video' not in data or 'timestamps' not in data:
355 |
# return jsonify({'success': False, 'message': 'Invalid data'}), 400
356 |
357 |
# annotation_file = os.path.join(ANNOTATIONS_DIR, f"{data['video']}_annotations.json")
358 |
# annotation_data = {
359 |
# "video_name": data['video'] + ".mp4",
360 |
# "timestamps": sorted(data['timestamps']),
361 |
# "annotation_date": datetime.now().isoformat(),
362 |
# "annotated_by": session.get('user', {}).get('name', 'unknown')
363 |
# }
364 |
# with open(annotation_file, 'w') as f:
365 |
# json.dump(annotation_data, f, indent=4)
366 |
# return jsonify({'success': True, 'message': 'Annotations saved successfully'})
367 |
368 |
# @app.route('/get_annotations/<path:video_name>')
369 |
# @login_required
370 |
# def get_annotations(video_name):
371 |
# """Get annotations for a video."""
372 |
# annotation_file = os.path.join(ANNOTATIONS_DIR, f"{video_name}_annotations.json")
373 |
# if not os.path.exists(annotation_file):
374 |
# return jsonify({'error': 'No annotations found'}), 404
375 |
# with open(annotation_file, 'r') as f:
376 |
# annotations = json.load(f)
377 |
# return jsonify(annotations)
378 |
379 |
# @app.route("/alignment/<video_id>")
380 |
# @login_required
381 |
# def alignment_mode(video_id):
382 |
# """Page for aligning sign language with transcribed text."""
383 |
# annotation_file = os.path.join(ANNOTATIONS_DIR, f"{video_id}_annotations.json")
384 |
# if not os.path.exists(annotation_file):
385 |
# return render_template("error.html", message="No annotations found for this video. Please annotate the video first.")
386 |
# with open(annotation_file, 'r') as f:
387 |
# annotations = json.load(f)
388 |
# return render_template(
389 |
# "alignment.html",
390 |
# video_id=video_id,
391 |
# total_clips=len(annotations['timestamps']) - 1,
392 |
# user=session.get('user')
393 |
# )
394 |
395 |
# @app.route("/api/transcript/<video_id>")
396 |
# @login_required
397 |
# def get_transcript(video_id):
398 |
# """Get transcript for a video."""
399 |
# timestamps_file = os.path.join(WORD_TIMESTAMPS_DIR, f"{video_id}_word_timestamps.json")
400 |
# logger.info(f"Attempting to load word timestamps from: {timestamps_file}")
401 |
# if not os.path.exists(timestamps_file):
402 |
# logger.warning(f"Word timestamps file not found: {timestamps_file}")
403 |
# return jsonify({
404 |
# "status": "error",
405 |
# "message": "No word timestamps found for this video"
406 |
# }), 404
407 |
# try:
408 |
# with open(timestamps_file, 'r') as f:
409 |
# word_data = json.load(f)
410 |
# full_text = " ".join(item["punctuated_word"] for item in word_data)
411 |
# words_with_times = [{
412 |
# "word": item["punctuated_word"],
413 |
# "start": float(item["start_time"]),
414 |
# "end": float(item["end_time"])
415 |
# } for item in word_data]
416 |
# logger.info(f"Successfully created transcript ({len(full_text)} characters)")
417 |
# return jsonify({
418 |
# "status": "success",
419 |
# "text": full_text,
420 |
# "words": words_with_times
421 |
# })
422 |
# except Exception as e:
423 |
# logger.error(f"Error processing word timestamps: {str(e)}")
424 |
# return jsonify({
425 |
# "status": "error",
426 |
# "message": f"Error processing word timestamps: {str(e)}"
427 |
# }), 500
428 |
429 |
# @app.route("/api/word_timestamps/<video_id>")
430 |
# @login_required
431 |
# def get_word_timestamps(video_id):
432 |
# """Get word-level timestamps for a video."""
433 |
# timestamps_file = os.path.join(WORD_TIMESTAMPS_DIR, f"{video_id}_word_timestamps.json")
434 |
# logger.info(f"Attempting to load word timestamps from: {timestamps_file}")
435 |
# if not os.path.exists(timestamps_file):
436 |
# logger.warning(f"Word timestamps file not found: {timestamps_file}")
437 |
# return jsonify({
438 |
# "status": "error",
439 |
# "message": "No word timestamps found for this video"
440 |
# }), 404
441 |
# try:
442 |
# with open(timestamps_file, 'r') as f:
443 |
# word_data = json.load(f)
444 |
# logger.info(f"Successfully loaded {len(word_data)} word timestamps")
445 |
# return jsonify({
446 |
# "status": "success",
447 |
# "words": word_data
448 |
# })
449 |
# except Exception as e:
450 |
# logger.error(f"Error processing word timestamps: {str(e)}")
451 |
# return jsonify({
452 |
# "status": "error",
453 |
# "message": f"Error processing word timestamps: {str(e)}"
454 |
# }), 500
455 |
456 |
# @app.route("/api/clips/<video_id>")
457 |
# @login_required
458 |
# def get_video_clips(video_id):
459 |
# """Get clips for a video."""
460 |
# try:
461 |
# annotation_file = os.path.join(ANNOTATIONS_DIR, f"{video_id}_annotations.json")
462 |
# if not os.path.exists(annotation_file):
463 |
# raise FileNotFoundError("Annotations not found")
464 |
# with open(annotation_file, 'r') as f:
465 |
# annotations = json.load(f)
466 |
# timestamps = annotations['timestamps']
467 |
# clips = []
468 |
# for i in range(len(timestamps)-1):
469 |
# clips.append({
470 |
# "index": i,
471 |
# "start": timestamps[i],
472 |
# "end": timestamps[i+1],
473 |
# "path": f"/clip/{video_id}/{i}"
474 |
# })
475 |
# return jsonify({
476 |
# "status": "success",
477 |
# "clips": clips
478 |
# })
479 |
# except Exception as e:
480 |
# logger.error(f"Error getting clips: {str(e)}")
481 |
# return jsonify({
482 |
# "status": "error",
483 |
# "message": str(e)
484 |
# }), 500
485 |
486 |
# @app.route("/clip/<video_id>/<int:clip_index>")
487 |
# @login_required
488 |
# def serve_clip(video_id, clip_index):
489 |
# """Serve a specific clip."""
490 |
# clip_path = os.path.join(
491 |
492 |
# f"{video_id}_clip_{clip_index:03d}.mp4"
493 |
# )
494 |
# logger.info(f"Attempting to serve clip: {clip_path}")
495 |
# if not os.path.exists(clip_path):
496 |
# logger.error(f"Clip not found: {clip_path}")
497 |
# return jsonify({
498 |
# "status": "error",
499 |
# "message": "Clip not found"
500 |
# }), 404
501 |
# return send_file(clip_path, mimetype="video/mp4")
502 |
503 |
# @app.route("/api/save_alignments", methods=["POST"])
504 |
# @login_required
505 |
# def save_alignments():
506 |
# """Save alignment data."""
507 |
# try:
508 |
# data = request.json
509 |
# if not data or 'video_id' not in data or 'alignments' not in data:
510 |
# return jsonify({'success': False, 'message': 'Invalid data'}), 400
511 |
512 |
# # Add user information to the alignments
513 |
# for alignment in data['alignments']:
514 |
# if alignment:
515 |
# alignment['aligned_by'] = session.get('user', {}).get('name', 'unknown')
516 |
517 |
# output_path = os.path.join(ALIGNMENTS_DIR, f"{data['video_id']}.json")
518 |
# with open(output_path, "w") as f:
519 |
# json.dump(data['alignments'], f, indent=2)
520 |
# return jsonify({
521 |
# "success": True,
522 |
# "message": "Alignments saved successfully"
523 |
# })
524 |
# except Exception as e:
525 |
# logger.error(f"Error saving alignments: {str(e)}")
526 |
# return jsonify({
527 |
# "success": False,
528 |
# "message": str(e)
529 |
# }), 500
530 |
531 |
# @app.route("/api/extract_clips/<video_id>")
532 |
# @login_required
533 |
# def extract_clips_for_video(video_id):
534 |
# """Extract clips and start transcription for a video."""
535 |
# status = clip_extraction_status.get(video_id, {})
536 |
# if status.get("percent", 0) < 100:
537 |
# thread = threading.Thread(target=run_clip_extraction, args=(video_id,))
538 |
# thread.start()
539 |
# if video_id not in transcription_progress_status or transcription_progress_status.get(video_id, {}).get("percent", 0) < 100:
540 |
# thread_trans = threading.Thread(target=run_transcription, args=(video_id,))
541 |
# thread_trans.start()
542 |
# return jsonify({"status": "started"})
543 |
544 |
# @app.route("/api/clip_progress/<video_id>")
545 |
# @login_required
546 |
# def clip_progress(video_id):
547 |
# """Get clip extraction progress."""
548 |
# progress = clip_extraction_status.get(video_id, {"current": 0, "total": 0, "percent": 0})
549 |
# return jsonify(progress)
550 |
551 |
# @app.route("/api/transcription_progress/<video_id>")
552 |
# @login_required
553 |
# def transcription_progress(video_id):
554 |
# """Get transcription progress."""
555 |
# progress = transcription_progress_status.get(video_id, {"status": "not started", "percent": 0})
556 |
# return jsonify(progress)
557 |
558 |
# if __name__ == '__main__':
559 |
# try:
560 |
# # Print diagnostic information
561 |
# print("=" * 50)
562 |
# print(f"Starting app with configuration:")
563 |
# print(f"- Running in HF Space: {is_hf_space}")
564 |
# print(f"- Auth bypass: {bypass_auth}")
565 |
# print(f"- Port: {os.getenv('PORT', 5000)}")
566 |
# print(f"- Available videos: {os.listdir(VIDEO_DIR) if os.path.exists(VIDEO_DIR) else 'None'}")
567 |
# print("=" * 50)
568 |
569 |
# port = int(os.getenv('PORT', 5000))
570 |
# app.run(host='', port=port, debug=True)
571 |
# except Exception as e:
572 |
# print(f"Error starting the application: {e}")
573 |
# import traceback
574 |
# traceback.print_exc()
575 |
576 |
577 |
from flask import Flask, render_template, jsonify, request, send_from_directory, send_file, redirect, url_for, session
578 |
import os, json, threading, time, signal, sys
579 |
from datetime import datetime
580 |
from extract_signed_segments_from_annotations import ClipExtractor, VideoClip
581 |
import logging
582 |
from dotenv import load_dotenv
583 |
import boto3
584 |
from botocore.exceptions import ClientError
585 |
import tempfile
586 |
import uuid
587 |
import requests
588 |
from urllib.parse import urlparse
589 |
590 |
# Load environment variables
591 |
629 |
ALIGNMENTS_DIR = os.path.abspath("data/alignments")
630 |
TRANSCRIPTS_DIR = os.path.abspath("data/transcripts")
631 |
632 |
# S3 configuration
633 |
S3_BUCKET = os.getenv('S3_BUCKET', "sorenson-ai-sb-scratch")
634 |
S3_VIDEO_PREFIX = os.getenv('S3_VIDEO_PREFIX', "awilkinson/kylie_dataset_videos_for_alignment_webapp/")
635 |
USE_S3_FOR_VIDEOS = os.getenv('USE_S3_FOR_VIDEOS', 'true').lower() == 'true'
636 |
637 |
# Ensure all required directories exist
638 |
639 |
os.makedirs(directory, exist_ok=True)
642 |
clip_extraction_status = {}
643 |
transcription_progress_status = {}
644 |
645 |
# S3 helper functions
646 |
def get_s3_client():
647 |
"""Get a boto3 S3 client."""
648 |
return boto3.client(
649 |
650 |
region_name=os.environ.get('AWS_DEFAULT_REGION', 'us-west-2'),
651 |
652 |
653 |
654 |
655 |
def list_s3_videos():
    """List the IDs of all ``.mp4`` videos stored under the configured S3 prefix.

    A video ID is the object's file name without its directory part and
    without the ``.mp4`` extension.

    Returns:
        list[str]: Video IDs found under ``S3_VIDEO_PREFIX`` in ``S3_BUCKET``.
        An empty list is returned when nothing is found or when the S3
        request fails (the failure is logged, not raised).
    """
    # NOTE(review): the rendering this block was recovered from elides the
    # `try:` opener, the listing call's arguments and the append statement.
    # Bucket/Prefix are taken from the warning message below - confirm
    # against the real file.
    from botocore.exceptions import BotoCoreError

    try:
        s3_client = get_s3_client()
        # A single list_objects_v2 call returns at most 1000 keys, so walk
        # every page instead of silently truncating large prefixes.
        paginator = s3_client.get_paginator('list_objects_v2')

        videos = []
        found_any_object = False
        for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=S3_VIDEO_PREFIX):
            for item in page.get('Contents', []):
                found_any_object = True
                key = item['Key']
                if key.endswith('.mp4'):
                    # Keep just the file name, without directory or extension.
                    filename = os.path.basename(key)
                    videos.append(os.path.splitext(filename)[0])

        if not found_any_object:
            logger.warning(f"No videos found in S3 bucket {S3_BUCKET} with prefix {S3_VIDEO_PREFIX}")
        return videos
    except (ClientError, BotoCoreError) as e:
        # ClientError covers service-side failures; BotoCoreError covers
        # client-side ones such as missing credentials or connection errors.
        logger.error(f"Error listing videos from S3: {str(e)}")
        return []
682 |
683 |
def download_video_from_s3(video_id):
    """Download a video from S3 into the local videos directory.

    The object key is ``S3_VIDEO_PREFIX`` + ``"<video_id>.mp4"`` and the file
    is stored as ``VIDEO_DIR/<video_id>.mp4``. If that file already exists the
    download is skipped.

    Args:
        video_id: Video identifier (file name without the ``.mp4`` extension).

    Returns:
        str | None: Local path of the video, or ``None`` when the ID is
        invalid or the download fails (the failure is logged, not raised).
    """
    from botocore.exceptions import BotoCoreError

    # The ID comes from request URLs and is joined into a local path, so
    # reject anything carrying a directory component. IDs produced by
    # list_s3_videos() are bare file names and are unaffected.
    if not video_id or os.path.basename(video_id) != video_id:
        logger.error(f"Invalid video id: {video_id!r}")
        return None

    video_filename = f"{video_id}.mp4"
    s3_key = f"{S3_VIDEO_PREFIX}{video_filename}"
    local_path = os.path.join(VIDEO_DIR, video_filename)

    # Check if the file already exists locally
    if os.path.exists(local_path):
        logger.info(f"Video {video_id} already exists locally.")
        return local_path

    # NOTE(review): the `try:` opener is elided in the rendering this block
    # was recovered from; its position is inferred from the `except` clause.
    try:
        logger.info(f"Downloading video {video_id} from S3...")
        s3_client = get_s3_client()
        s3_client.download_file(S3_BUCKET, s3_key, local_path)
        logger.info(f"Video {video_id} downloaded successfully to {local_path}")
        return local_path
    except (ClientError, BotoCoreError) as e:
        # ClientError: service-side failure (e.g. missing key, access denied).
        # BotoCoreError: client-side failure (e.g. no credentials, no network).
        # Callers only check for None, so both are reported the same way.
        logger.error(f"Error downloading video from S3: {str(e)}")
        return None
703 |
704 |
def generate_presigned_url(video_id, expiration=3600):
    """Generate a presigned URL for direct access to a video stored in S3.

    Args:
        video_id: Video identifier (file name without the ``.mp4`` extension).
        expiration: Lifetime of the URL in seconds.

    Returns:
        str | None: The presigned URL, or ``None`` if it could not be
        generated (the failure is logged, not raised).
    """
    from botocore.exceptions import BotoCoreError

    video_filename = f"{video_id}.mp4"
    s3_key = f"{S3_VIDEO_PREFIX}{video_filename}"

    # NOTE(review): the `try:` opener, the operation name and the expiry
    # argument are elided in the rendering this block was recovered from.
    # 'get_object' and ExpiresIn=expiration are inferred from how the result
    # is used (redirecting the browser to the video) - confirm against the
    # real file.
    try:
        s3_client = get_s3_client()
        url = s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': S3_BUCKET, 'Key': s3_key},
            ExpiresIn=expiration,
        )
        return url
    except (ClientError, BotoCoreError) as e:
        # Signing happens client-side, so failures such as missing
        # credentials surface as BotoCoreError rather than ClientError.
        logger.error(f"Error generating presigned URL: {str(e)}")
        return None
720 |
721 |
# Graceful shutdown handler
722 |
def graceful_shutdown(signum, frame):
723 |
"""Handle graceful shutdown on signals."""
779 |
transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
780 |
781 |
782 |
# Download video from S3 if needed
783 |
784 |
video_path = download_video_from_s3(video_id)
785 |
if not video_path:
786 |
transcription_progress_status[video_id] = {
787 |
"status": "error",
788 |
"percent": 0,
789 |
"message": f"Failed to download video {video_id} from S3"
790 |
791 |
792 |
793 |
video_path = os.path.join(base_dir, "data", "videos", f"{video_id}.mp4")
794 |
795 |
transcription_progress_status[video_id] = {"status": "started", "percent": 10}
796 |
797 |
# Check if AWS credentials are available
852 |
return render_template('error.html', message="Authentication failed. No username provided.")
853 |
return redirect(url_for('login'))
854 |
855 |
856 |
def health_check():
857 |
"""Health check endpoint for container verification."""
861 |
"DEBUG": os.environ.get('DEBUG', 'Not set'),
862 |
"SPACE_ID": os.environ.get('SPACE_ID', 'Not set'),
863 |
"BYPASS_AUTH": os.environ.get('BYPASS_AUTH', 'Not set'),
864 |
"SECRET_KEY": os.environ.get('SECRET_KEY', 'Not set')[:5] + '...' if os.environ.get('SECRET_KEY') else 'Not set',
865 |
"S3_BUCKET": os.environ.get('S3_BUCKET', 'Not set'),
866 |
"S3_VIDEO_PREFIX": os.environ.get('S3_VIDEO_PREFIX', 'Not set'),
867 |
"USE_S3_FOR_VIDEOS": os.environ.get('USE_S3_FOR_VIDEOS', 'Not set')
868 |
869 |
870 |
logger.info(f"Health check called. Environment: {env_vars}")
978 |
"app_config": {k: str(v) for k, v in app.config.items() if k in
979 |
980 |
981 |
"s3_config": {
982 |
983 |
984 |
985 |
986 |
987 |
return jsonify(info)
988 |
997 |
998 |
def select_video():
999 |
"""Page to select a video for annotation."""
1000 |
1001 |
video_ids = list_s3_videos()
1002 |
1003 |
if not os.path.exists(VIDEO_DIR):
1004 |
return render_template('error.html', message="Video directory not found.")
1005 |
videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith('.mp4')]
1006 |
video_ids = [os.path.splitext(v)[0] for v in videos]
1007 |
1008 |
return render_template('select_video.html', video_ids=video_ids, user=session.get('user'))
1009 |
1010 |
1017 |
1018 |
def get_videos():
1019 |
"""API endpoint to get available videos."""
1020 |
1021 |
videos = list_s3_videos()
1022 |
if not videos:
1023 |
return jsonify({'error': 'No videos found in S3'}), 404
1024 |
# Return just the filenames with .mp4 extension for compatibility
1025 |
return jsonify([f"{vid}.mp4" for vid in videos])
1026 |
1027 |
# Original local file behavior
1028 |
if not os.path.exists(VIDEO_DIR):
1029 |
return jsonify({'error': 'Video directory not found'}), 404
1030 |
videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith(('.mp4', '.avi', '.mov'))]
1031 |
if not videos:
1032 |
return jsonify({'error': 'No videos found'}), 404
1033 |
return jsonify(videos)
1034 |
1035 |
1036 |
1037 |
def serve_video(filename):
1038 |
"""Serve a video file from S3 or local storage."""
1039 |
video_id = os.path.splitext(filename)[0] # Remove extension
1040 |
1041 |
1042 |
# Option 1: Generate a presigned URL and redirect
1043 |
presigned_url = generate_presigned_url(video_id)
1044 |
if presigned_url:
1045 |
return redirect(presigned_url)
1046 |
1047 |
# Option 2 (fallback): Download from S3 to local temporary storage and serve
1048 |
local_path = download_video_from_s3(video_id)
1049 |
if local_path and os.path.exists(local_path):
1050 |
return send_from_directory(VIDEO_DIR, filename)
1051 |
1052 |
return jsonify({'error': 'Video not found in S3'}), 404
1053 |
1054 |
# Original local file behavior
1055 |
if not os.path.exists(os.path.join(VIDEO_DIR, filename)):
1056 |
return jsonify({'error': 'Video not found'}), 404
1057 |
return send_from_directory(VIDEO_DIR, filename)
1058 |
1059 |
@app.route('/save_annotations', methods=['POST'])
1060 |
1242 |
1243 |
def extract_clips_for_video(video_id):
1244 |
"""Extract clips and start transcription for a video."""
1245 |
# If using S3, ensure the video is downloaded first
1246 |
1247 |
video_path = download_video_from_s3(video_id)
1248 |
if not video_path:
1249 |
return jsonify({
1250 |
"status": "error",
1251 |
"message": f"Failed to download video {video_id} from S3"
1252 |
}), 404
1253 |
1254 |
status = clip_extraction_status.get(video_id, {})
1255 |
if status.get("percent", 0) < 100:
1256 |
thread = threading.Thread(target=run_clip_extraction, args=(video_id,))
1282 |
print(f"- Running in HF Space: {is_hf_space}")
1283 |
print(f"- Auth bypass: {bypass_auth}")
1284 |
print(f"- Port: {os.getenv('PORT', 5000)}")
1285 |
print(f"- S3 for videos: {USE_S3_FOR_VIDEOS}")
1286 |
print(f"- S3 bucket: {S3_BUCKET}")
1287 |
print(f"- S3 prefix: {S3_VIDEO_PREFIX}")
1288 |
print(f"- Available videos: {os.listdir(VIDEO_DIR) if os.path.exists(VIDEO_DIR) else 'None'}")
1289 |
1290 |
1291 |
s3_videos = list_s3_videos()
1292 |
print(f"- Available S3 videos: {s3_videos if s3_videos else 'None'}")
1293 |
except Exception as e:
1294 |
print(f"- Error listing S3 videos: {str(e)}")
1295 |
print("=" * 50)
1296 |
1297 |
port = int(os.getenv('PORT', 5000))
@@ -8,6 +8,41 @@ import requests
8 |
import time
9 |
from decimal import Decimal
10 |
from typing import Any, Dict, List
11 |
12 |
def extract_audio(video_path: str) -> str:
13 |
"""Extract audio from video file using ffmpeg.
@@ -113,17 +148,21 @@ def main() -> None:
113 |
114 |
video_filename = args.video_id + ".mp4" # Source video file (with .mp4)
115 |
video_path = os.path.join(base_dir, "data", "videos", video_filename)
116 |
word_timestamps = get_word_timestamps(video_path)
117 |
output_dir = os.path.join(base_dir, "data", "word_timestamps")
118 |
os.makedirs(output_dir, exist_ok=True)
119 |
output_path = os.path.join(output_dir, args.video_id + "_word_timestamps.json")
120 |
with open(output_path, "w") as f:
121 |
json.dump(word_timestamps, f, indent=4)
122 |
print(f"Word timestamps saved to: {output_path}")
123 |
124 |
if __name__ == "__main__":
125 |
import argparse
126 |
parser = argparse.ArgumentParser(description="Get word timestamps for a given video file ID.")
127 |
parser.add_argument("video_id", help="Video file ID (without extension)")
128 |
args = parser.parse_args()
129 |
8 |
import time
9 |
from decimal import Decimal
10 |
from typing import Any, Dict, List
11 |
from botocore.exceptions import ClientError
12 |
13 |
S3_BUCKET = "sorenson-ai-sb-scratch"
14 |
S3_VIDEO_PREFIX = "awilkinson/kylie_dataset_videos_for_alignment_webapp/"
15 |
USE_S3_FOR_VIDEOS = True # Set to True to use S3, False to use local files
16 |
17 |
def get_s3_client():
18 |
"""Get a boto3 S3 client."""
19 |
return boto3.client(
20 |
21 |
region_name=os.environ.get('AWS_DEFAULT_REGION', 'us-west-2'),
22 |
23 |
24 |
25 |
26 |
def download_video_from_s3(video_id, output_dir):
    """Download a video from S3 into ``output_dir``.

    The object key is ``S3_VIDEO_PREFIX`` + ``"<video_id>.mp4"``. If the
    target file already exists the download is skipped.

    Args:
        video_id: Video identifier (file name without the ``.mp4`` extension).
        output_dir: Directory in which to store the downloaded file.

    Returns:
        str | None: Local path of the video, or ``None`` if the download
        failed (the failure is printed, not raised).
    """
    from botocore.exceptions import BotoCoreError

    video_filename = f"{video_id}.mp4"
    s3_key = f"{S3_VIDEO_PREFIX}{video_filename}"
    local_path = os.path.join(output_dir, video_filename)

    # Check if the file already exists locally
    if os.path.exists(local_path):
        print(f"Video {video_id} already exists locally.")
        return local_path

    # NOTE(review): the `try:` opener is elided in the rendering this block
    # was recovered from; its position is inferred from the `except` clause.
    try:
        print(f"Downloading video {video_id} from S3...")
        s3_client = get_s3_client()
        s3_client.download_file(S3_BUCKET, s3_key, local_path)
        print(f"Video {video_id} downloaded successfully to {local_path}")
        return local_path
    except (ClientError, BotoCoreError) as e:
        # ClientError: service-side failure (e.g. missing key, access denied).
        # BotoCoreError: client-side failure (e.g. no credentials, no network).
        print(f"Error downloading video from S3: {str(e)}")
        return None
46 |
47 |
def extract_audio(video_path: str) -> str:
48 |
"""Extract audio from video file using ffmpeg.
148 |
149 |
video_filename = args.video_id + ".mp4" # Source video file (with .mp4)
150 |
video_path = os.path.join(base_dir, "data", "videos", video_filename)
151 |
152 |
# Check if we need to download from S3
153 |
if USE_S3_FOR_VIDEOS and not os.path.exists(video_path):
154 |
videos_dir = os.path.join(base_dir, "data", "videos")
155 |
os.makedirs(videos_dir, exist_ok=True)
156 |
download_video_from_s3(args.video_id, videos_dir)
157 |
158 |
if not os.path.exists(video_path):
159 |
print(f"Error: Video file not found: {video_path}")
160 |
161 |
162 |
word_timestamps = get_word_timestamps(video_path)
163 |
output_dir = os.path.join(base_dir, "data", "word_timestamps")
164 |
os.makedirs(output_dir, exist_ok=True)
165 |
output_path = os.path.join(output_dir, args.video_id + "_word_timestamps.json")
166 |
with open(output_path, "w") as f:
167 |
json.dump(word_timestamps, f, indent=4)
168 |
print(f"Word timestamps saved to: {output_path}")
@@ -137,8 +137,10 @@
137 |
// Use the provided template video_id if available; it should be the base ID (without .mp4)
138 |
const templateVideoId = "{{ video_id|default('') }}";
139 |
let currentVideo = "";
140 |
if (templateVideoId) {
141 |
currentVideo = templateVideoId;
142 |
} else {
143 |
// Fallback: use /videos API and remove the .mp4 extension
144 |
@@ -150,8 +152,7 @@
150 |
151 |
if (videos.length > 0) {
152 |
currentVideo = videos[0].replace(/\.mp4$/, "");
153 |
154 |
155 |
156 |
157 |
.catch(error => {
@@ -311,6 +312,28 @@
311 |
div.textContent = timestamps.map(t => t.toFixed(2)).join(', ');
312 |
313 |
314 |
video.addEventListener('timeupdate', () => {
315 |
currentTimeDisplay.textContent = `Current Time: ${video.currentTime.toFixed(2)}`;
316 |
137 |
// Use the provided template video_id if available; it should be the base ID (without .mp4)
138 |
const templateVideoId = "{{ video_id|default('') }}";
139 |
let currentVideo = "";
140 |
141 |
if (templateVideoId) {
142 |
currentVideo = templateVideoId;
143 |
144 |
} else {
145 |
// Fallback: use /videos API and remove the .mp4 extension
146 |
152 |
153 |
if (videos.length > 0) {
154 |
currentVideo = videos[0].replace(/\.mp4$/, "");
155 |
156 |
157 |
158 |
.catch(error => {
312 |
div.textContent = timestamps.map(t => t.toFixed(2)).join(', ');
313 |
314 |
315 |
function loadVideoSource(videoId) {
316 |
const videoUrl = `/video/${videoId}.mp4`;
317 |
318 |
319 |
.then(response => {
320 |
if (response.redirected) {
321 |
// If we've been redirected to a presigned S3 URL
322 |
document.getElementById('video-source').src = response.url;
323 |
324 |
} else if (response.ok) {
325 |
// If it's a direct file response
326 |
document.getElementById('video-source').src = videoUrl;
327 |
328 |
} else {
329 |
throw new Error('Video not found');
330 |
331 |
332 |
.catch(error => {
333 |
document.getElementById('error-message').textContent = 'Error loading video: ' + error;
334 |
335 |
336 |
337 |
video.addEventListener('timeupdate', () => {
338 |
currentTimeDisplay.textContent = `Current Time: ${video.currentTime.toFixed(2)}`;
339 |