Perilon committed on
Commit
2daffd5
·
1 Parent(s): c23fdff
.hf-space CHANGED
@@ -7,8 +7,11 @@ config:
7
  - PORT=7860
8
  - SPACE_ID=true
9
  - BYPASS_AUTH=true
 
 
 
10
  resources:
11
  cpu: 1
12
  memory: 1
13
  gpu: null
14
- restarts: true
 
7
  - PORT=7860
8
  - SPACE_ID=true
9
  - BYPASS_AUTH=true
10
+ - S3_BUCKET=sorenson-ai-sb-scratch
11
+ - S3_VIDEO_PREFIX=awilkinson/kylie_dataset_videos_for_alignment_webapp/
12
+ - USE_S3_FOR_VIDEOS=true
13
  resources:
14
  cpu: 1
15
  memory: 1
16
  gpu: null
17
+ restarts: true
Dockerfile CHANGED
@@ -16,10 +16,12 @@ COPY . .
16
  ENV PYTHONUNBUFFERED=1
17
  ENV PORT=7860
18
  ENV SPACE_ID="true"
19
- # Add explicit environment variable to enable authentication bypass for troubleshooting
20
  ENV BYPASS_AUTH="true"
21
  ENV SECRET_KEY="f7290fc27f11dbf14be6cd348638ad62"
22
  ENV DEBUG="True"
 
 
 
23
 
24
  # Create necessary directories
25
  RUN mkdir -p data/videos data/annotations data/temp data/word_timestamps data/alignments data/transcripts
 
16
  ENV PYTHONUNBUFFERED=1
17
  ENV PORT=7860
18
  ENV SPACE_ID="true"
 
19
  ENV BYPASS_AUTH="true"
20
  ENV SECRET_KEY="f7290fc27f11dbf14be6cd348638ad62"
21
  ENV DEBUG="True"
22
+ ENV S3_BUCKET="sorenson-ai-sb-scratch"
23
+ ENV S3_VIDEO_PREFIX="awilkinson/kylie_dataset_videos_for_alignment_webapp/"
24
+ ENV USE_S3_FOR_VIDEOS="true"
25
 
26
  # Create necessary directories
27
  RUN mkdir -p data/videos data/annotations data/temp data/word_timestamps data/alignments data/transcripts
docker-compose.yml CHANGED
@@ -13,5 +13,8 @@ services:
13
  - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
14
  - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
15
  - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION:-us-west-2}
 
 
 
16
  volumes:
17
  - ./data:/app/data
 
13
  - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
14
  - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
15
  - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION:-us-west-2}
16
+ - S3_BUCKET=sorenson-ai-sb-scratch
17
+ - S3_VIDEO_PREFIX=awilkinson/kylie_dataset_videos_for_alignment_webapp/
18
+ - USE_S3_FOR_VIDEOS=true
19
  volumes:
20
  - ./data:/app/data
extract_signed_segments_from_annotations.py CHANGED
@@ -1,212 +1,3 @@
1
- # #!/usr/bin/env python3
2
-
3
- # import json
4
- # import os
5
- # import pathlib
6
- # import subprocess
7
- # from dataclasses import dataclass
8
- # from decimal import Decimal
9
- # from typing import List, Optional, Tuple
10
- # import argparse
11
-
12
- # try:
13
- # from tqdm import tqdm
14
- # except ImportError:
15
- # def tqdm(iterable, **kwargs):
16
- # return iterable
17
- # def write(msg):
18
- # print(msg)
19
-
20
- # @dataclass
21
- # class VideoClip:
22
- # """Represents a video clip with timing information."""
23
- # start_time: float
24
- # end_time: float
25
- # clip_path: str
26
- # original_video: str
27
- # index: int
28
-
29
- # @property
30
- # def duration(self) -> float:
31
- # return self.end_time - self.start_time
32
-
33
- # class ClipExtractor:
34
- # """Handles extraction of video clips based on annotation timestamps."""
35
-
36
- # def __init__(self, base_dir: str) -> None:
37
- # """Initialize with project base directory."""
38
- # self.base_dir = base_dir
39
- # self.temp_dir = os.path.join(base_dir, "data", "temp")
40
- # self.videos_dir = os.path.join(base_dir, "data", "videos")
41
- # self.annotations_dir = os.path.join(base_dir, "data", "annotations")
42
- # self.metadata_dir = os.path.join(base_dir, "data", "segment_metadata")
43
- # os.makedirs(self.temp_dir, exist_ok=True)
44
- # os.makedirs(self.metadata_dir, exist_ok=True)
45
-
46
- # def validate_timestamps(self, timestamps: List[float]) -> Tuple[bool, Optional[str]]:
47
- # if not timestamps:
48
- # return False, "No timestamps found in annotation file."
49
- # for i in range(len(timestamps) - 1):
50
- # if timestamps[i] >= timestamps[i + 1]:
51
- # return False, (f"Invalid timestamp order: {str(round(Decimal(timestamps[i]), 3))} seconds "
52
- # f"followed by {str(round(Decimal(timestamps[i + 1]), 3))} seconds")
53
- # return True, None
54
-
55
- # def get_video_duration(self, video_path: str) -> float:
56
- # try:
57
- # cmd = [
58
- # "ffprobe",
59
- # "-v", "error",
60
- # "-show_entries", "format=duration",
61
- # "-of", "default=noprint_wrappers=1:nokey=1",
62
- # video_path
63
- # ]
64
- # output = subprocess.check_output(cmd).decode().strip()
65
- # return float(output)
66
- # except subprocess.CalledProcessError as e:
67
- # raise RuntimeError(f"Failed to get video duration. Error: {str(e)}")
68
-
69
- # def extract_clip(self, video_path: str, start_time: float, end_time: float, output_path: str) -> bool:
70
- # try:
71
- # cmd = [
72
- # "ffmpeg",
73
- # "-i", video_path,
74
- # "-ss", str(start_time),
75
- # "-t", str(end_time - start_time),
76
- # "-c:v", "libx264",
77
- # "-c:a", "aac",
78
- # "-y",
79
- # output_path
80
- # ]
81
- # subprocess.run(cmd, check=True, capture_output=True)
82
- # if not os.path.exists(output_path):
83
- # print(f"Warning: ffmpeg completed but file not found: {output_path}")
84
- # return False
85
- # file_size = os.path.getsize(output_path)
86
- # print(f"Created clip: {output_path} ({file_size} bytes)")
87
- # return True
88
- # except subprocess.CalledProcessError as e:
89
- # print(f"Error extracting clip. Details: {str(e)}")
90
- # return False
91
-
92
- # def extract_clips_from_annotations(self, video_id: str, progress_callback=None) -> List[VideoClip]:
93
- # # Determine paths
94
- # video_path = os.path.join(self.videos_dir, f"{video_id}.mp4")
95
- # annotation_path = os.path.join(self.annotations_dir, f"{video_id}_annotations.json")
96
- # if not os.path.exists(video_path):
97
- # raise FileNotFoundError(f"Video file not found: {video_path}")
98
- # if not os.path.exists(annotation_path):
99
- # raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
100
-
101
- # with open(annotation_path, "r") as f:
102
- # annotations = json.load(f)
103
- # timestamps = sorted(annotations["timestamps"])
104
- # is_valid, error_message = self.validate_timestamps(timestamps)
105
- # if not is_valid:
106
- # raise ValueError(f"Invalid timestamps in annotation file. {error_message}")
107
- # video_duration = self.get_video_duration(video_path)
108
- # if timestamps[-1] > video_duration:
109
- # raise ValueError(
110
- # f"Final timestamp ({str(round(Decimal(timestamps[-1]), 3))} seconds) " +
111
- # f"exceeds video duration ({str(round(Decimal(video_duration), 3))} seconds)"
112
- # )
113
-
114
- # # Create segments using only consecutive pairs from the annotated boundaries.
115
- # segments = [(timestamps[i], timestamps[i+1]) for i in range(len(timestamps)-1)]
116
- # total_clips = len(segments)
117
-
118
- # # Check metadata to see if segmentation is up-to-date.
119
- # metadata_file = os.path.join(self.metadata_dir, f"{video_id}_metadata.json")
120
- # use_cached = False
121
- # if os.path.exists(metadata_file):
122
- # with open(metadata_file, "r") as meta_f:
123
- # try:
124
- # meta_data = json.load(meta_f)
125
- # if meta_data.get("segments") == segments:
126
- # use_cached = True
127
- # except Exception as ex:
128
- # use_cached = False
129
-
130
- # if use_cached:
131
- # clips = []
132
- # for i, (start, end) in enumerate(segments):
133
- # clip_path = os.path.join(self.temp_dir, f"{video_id}_clip_{i:03d}.mp4")
134
- # if not os.path.exists(clip_path):
135
- # use_cached = False
136
- # break
137
- # clips.append(VideoClip(start, end, clip_path, video_id, i))
138
- # if use_cached:
139
- # if progress_callback:
140
- # progress_callback(total_clips, total_clips)
141
- # else:
142
- # print("Using cached segmentation as boundaries haven't changed.")
143
- # return clips
144
-
145
- # # If metadata is missing, boundaries have changed, or some clip file is missing, re-run segmentation.
146
- # clips = []
147
- # current_clip = 0
148
- # use_tqdm = progress_callback is None
149
- # if use_tqdm:
150
- # pbar = tqdm(total=total_clips, desc="Extracting clips")
151
- # for segment in segments:
152
- # start, end = segment
153
- # clip_path = os.path.join(self.temp_dir, f"{video_id}_clip_{current_clip:03d}.mp4")
154
- # if self.extract_clip(video_path, start, end, clip_path):
155
- # clips.append(VideoClip(start, end, clip_path, video_id, current_clip))
156
- # current_clip += 1
157
- # if progress_callback:
158
- # progress_callback(current_clip, total_clips)
159
- # elif use_tqdm:
160
- # pbar.update(1)
161
- # if use_tqdm:
162
- # pbar.close()
163
- # # Save segmentation metadata for future use.
164
- # meta_data = {"segments": segments}
165
- # with open(metadata_file, "w") as meta_f:
166
- # json.dump(meta_data, meta_f, indent=4)
167
- # return clips
168
-
169
- # def cleanup_clips(self, clips: List[VideoClip]) -> None:
170
- # if not clips:
171
- # return
172
- # print("\nCleaning up temporary files...")
173
- # for clip in clips:
174
- # if os.path.exists(clip.clip_path):
175
- # try:
176
- # print(f"Removing: {clip.clip_path}")
177
- # os.remove(clip.clip_path)
178
- # except OSError as e:
179
- # print(f"Warning: Failed to remove clip {clip.clip_path}. Error: {str(e)}")
180
- # else:
181
- # print(f"Warning: File not found for cleanup: {clip.clip_path}")
182
-
183
- # def main() -> None:
184
- # parser = argparse.ArgumentParser(description="Extract video clips based on annotations for a given video file ID.")
185
- # parser.add_argument("video_id", help="Video file ID (without extension)")
186
- # args = parser.parse_args()
187
-
188
- # base_dir = os.path.join(
189
- # str(pathlib.Path.home()),
190
- # "andrew_messaround",
191
- # "vsl_speech_to_signing_alignment",
192
- # "boundary_annotation_webapp"
193
- # )
194
- # extractor = ClipExtractor(base_dir)
195
- # try:
196
- # clips = extractor.extract_clips_from_annotations(args.video_id)
197
- # print(f"\nSuccessfully extracted {len(clips)} clips:")
198
- # for clip in clips:
199
- # print(f"Clip {clip.index}: {round(clip.start_time, 2)}s → {round(clip.end_time, 2)}s")
200
- # print(f"Duration: {round(clip.duration, 2)}s")
201
- # print(f"Path: {clip.clip_path}\n")
202
- # except Exception as e:
203
- # print(f"Error: {str(e)}")
204
-
205
- # if __name__ == "__main__":
206
- # main()
207
-
208
- #!/usr/bin/env python3
209
-
210
  import json
211
  import os
212
  import pathlib
@@ -215,6 +6,8 @@ from dataclasses import dataclass
215
  from decimal import Decimal
216
  from typing import List, Optional, Tuple
217
  import argparse
 
 
218
 
219
  try:
220
  from tqdm import tqdm
@@ -226,6 +19,20 @@ except ImportError:
226
 
227
  from concurrent.futures import ThreadPoolExecutor, as_completed
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  @dataclass
230
  class VideoClip:
231
  """Represents a video clip with timing information."""
@@ -300,15 +107,31 @@ class ClipExtractor:
300
  print(f"Error extracting clip. Details: {str(e)}")
301
  return False
302
 
303
- def extract_clips_from_annotations(self, video_id: str, progress_callback=None) -> List[VideoClip]:
 
304
  # Determine paths
305
- video_path = os.path.join(self.videos_dir, f"{video_id}.mp4")
 
306
  annotation_path = os.path.join(self.annotations_dir, f"{video_id}_annotations.json")
307
- if not os.path.exists(video_path):
308
- raise FileNotFoundError(f"Video file not found: {video_path}")
309
  if not os.path.exists(annotation_path):
310
  raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
311
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  with open(annotation_path, "r") as f:
313
  annotations = json.load(f)
314
  timestamps = sorted(annotations["timestamps"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
  import os
3
  import pathlib
 
6
  from decimal import Decimal
7
  from typing import List, Optional, Tuple
8
  import argparse
9
+ import boto3
10
+ from botocore.exceptions import ClientError
11
 
12
  try:
13
  from tqdm import tqdm
 
19
 
20
  from concurrent.futures import ThreadPoolExecutor, as_completed
21
 
22
+ # Add these constants at the top of the file
23
+ S3_BUCKET = "sorenson-ai-sb-scratch"
24
+ S3_VIDEO_PREFIX = "awilkinson/kylie_dataset_videos_for_alignment_webapp/"
25
+ USE_S3_FOR_VIDEOS = True # Set to True to use S3, False to use local files
26
+
27
+ def get_s3_client():
28
+ """Get a boto3 S3 client."""
29
+ return boto3.client(
30
+ 's3',
31
+ region_name=os.environ.get('AWS_DEFAULT_REGION', 'us-west-2'),
32
+ aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
33
+ aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY')
34
+ )
35
+
36
  @dataclass
37
  class VideoClip:
38
  """Represents a video clip with timing information."""
 
107
  print(f"Error extracting clip. Details: {str(e)}")
108
  return False
109
 
110
+ def extract_clips_from_annotations(self, video_id, progress_callback=None) -> List[VideoClip]:
111
+ """Extract clips based on annotation timestamps, handling S3 videos."""
112
  # Determine paths
113
+ video_filename = f"{video_id}.mp4"
114
+ video_path = os.path.join(self.videos_dir, video_filename)
115
  annotation_path = os.path.join(self.annotations_dir, f"{video_id}_annotations.json")
116
+
 
117
  if not os.path.exists(annotation_path):
118
  raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
119
+
120
+ # Check if we need to download the video from S3
121
+ if USE_S3_FOR_VIDEOS and not os.path.exists(video_path):
122
+ print(f"Video not found locally. Downloading from S3: {video_id}")
123
+ s3_client = get_s3_client()
124
+ try:
125
+ s3_key = f"{S3_VIDEO_PREFIX}{video_filename}"
126
+ s3_client.download_file(S3_BUCKET, s3_key, video_path)
127
+ print(f"Video downloaded successfully: {video_path}")
128
+ except ClientError as e:
129
+ raise FileNotFoundError(f"Video file not found in S3: {s3_key}. Error: {str(e)}")
130
+
131
+ if not os.path.exists(video_path):
132
+ raise FileNotFoundError(f"Video file not found: {video_path}")
133
+
134
+ # Now continue with the original extraction process
135
  with open(annotation_path, "r") as f:
136
  annotations = json.load(f)
137
  timestamps = sorted(annotations["timestamps"])
flask_app.py CHANGED
@@ -1,9 +1,591 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from flask import Flask, render_template, jsonify, request, send_from_directory, send_file, redirect, url_for, session
2
  import os, json, threading, time, signal, sys
3
  from datetime import datetime
4
  from extract_signed_segments_from_annotations import ClipExtractor, VideoClip
5
  import logging
6
  from dotenv import load_dotenv
 
 
 
 
 
 
7
 
8
  # Load environment variables
9
  load_dotenv()
@@ -47,6 +629,11 @@ WORD_TIMESTAMPS_DIR = os.path.abspath("data/word_timestamps")
47
  ALIGNMENTS_DIR = os.path.abspath("data/alignments")
48
  TRANSCRIPTS_DIR = os.path.abspath("data/transcripts")
49
 
 
 
 
 
 
50
  # Ensure all required directories exist
51
  for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALIGNMENTS_DIR, TRANSCRIPTS_DIR]:
52
  os.makedirs(directory, exist_ok=True)
@@ -55,6 +642,82 @@ for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALI
55
  clip_extraction_status = {}
56
  transcription_progress_status = {}
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  # Graceful shutdown handler
59
  def graceful_shutdown(signum, frame):
60
  """Handle graceful shutdown on signals."""
@@ -116,7 +779,19 @@ def run_transcription(video_id):
116
  transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
117
  return
118
 
119
- video_path = os.path.join(base_dir, "data", "videos", f"{video_id}.mp4")
 
 
 
 
 
 
 
 
 
 
 
 
120
  transcription_progress_status[video_id] = {"status": "started", "percent": 10}
121
 
122
  # Check if AWS credentials are available
@@ -177,7 +852,6 @@ def auth_callback():
177
  return render_template('error.html', message="Authentication failed. No username provided.")
178
  return redirect(url_for('login'))
179
 
180
- # Replace the health check route with this improved version
181
  @app.route('/health')
182
  def health_check():
183
  """Health check endpoint for container verification."""
@@ -187,7 +861,10 @@ def health_check():
187
  "DEBUG": os.environ.get('DEBUG', 'Not set'),
188
  "SPACE_ID": os.environ.get('SPACE_ID', 'Not set'),
189
  "BYPASS_AUTH": os.environ.get('BYPASS_AUTH', 'Not set'),
190
- "SECRET_KEY": os.environ.get('SECRET_KEY', 'Not set')[:5] + '...' if os.environ.get('SECRET_KEY') else 'Not set'
 
 
 
191
  }
192
 
193
  logger.info(f"Health check called. Environment: {env_vars}")
@@ -301,6 +978,11 @@ def debug_info():
301
  "app_config": {k: str(v) for k, v in app.config.items() if k in
302
  ['SESSION_COOKIE_SECURE', 'SESSION_COOKIE_HTTPONLY',
303
  'SESSION_COOKIE_SAMESITE', 'PERMANENT_SESSION_LIFETIME']},
 
 
 
 
 
304
  }
305
  return jsonify(info)
306
 
@@ -315,10 +997,14 @@ def index():
315
  @login_required
316
  def select_video():
317
  """Page to select a video for annotation."""
318
- if not os.path.exists(VIDEO_DIR):
319
- return render_template('error.html', message="Video directory not found.")
320
- videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith('.mp4')]
321
- video_ids = [os.path.splitext(v)[0] for v in videos]
 
 
 
 
322
  return render_template('select_video.html', video_ids=video_ids, user=session.get('user'))
323
 
324
  @app.route('/player/<video_id>')
@@ -331,20 +1017,44 @@ def player(video_id):
331
  @login_required
332
  def get_videos():
333
  """API endpoint to get available videos."""
334
- if not os.path.exists(VIDEO_DIR):
335
- return jsonify({'error': 'Video directory not found'}), 404
336
- videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith(('.mp4', '.avi', '.mov'))]
337
- if not videos:
338
- return jsonify({'error': 'No videos found'}), 404
339
- return jsonify(videos)
 
 
 
 
 
 
 
 
340
 
341
  @app.route('/video/<path:filename>')
342
  @login_required
343
  def serve_video(filename):
344
- """Serve a video file."""
345
- if not os.path.exists(os.path.join(VIDEO_DIR, filename)):
346
- return jsonify({'error': 'Video not found'}), 404
347
- return send_from_directory(VIDEO_DIR, filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
 
349
  @app.route('/save_annotations', methods=['POST'])
350
  @login_required
@@ -532,6 +1242,15 @@ def save_alignments():
532
  @login_required
533
  def extract_clips_for_video(video_id):
534
  """Extract clips and start transcription for a video."""
 
 
 
 
 
 
 
 
 
535
  status = clip_extraction_status.get(video_id, {})
536
  if status.get("percent", 0) < 100:
537
  thread = threading.Thread(target=run_clip_extraction, args=(video_id,))
@@ -563,7 +1282,16 @@ if __name__ == '__main__':
563
  print(f"- Running in HF Space: {is_hf_space}")
564
  print(f"- Auth bypass: {bypass_auth}")
565
  print(f"- Port: {os.getenv('PORT', 5000)}")
 
 
 
566
  print(f"- Available videos: {os.listdir(VIDEO_DIR) if os.path.exists(VIDEO_DIR) else 'None'}")
 
 
 
 
 
 
567
  print("=" * 50)
568
 
569
  port = int(os.getenv('PORT', 5000))
 
1
+ # from flask import Flask, render_template, jsonify, request, send_from_directory, send_file, redirect, url_for, session
2
+ # import os, json, threading, time, signal, sys
3
+ # from datetime import datetime
4
+ # from extract_signed_segments_from_annotations import ClipExtractor, VideoClip
5
+ # import logging
6
+ # from dotenv import load_dotenv
7
+
8
+ # # Load environment variables
9
+ # load_dotenv()
10
+
11
+ # # Add this near the top with other environment variables
12
+ # bypass_auth = os.getenv('BYPASS_AUTH', 'false').lower() == 'true'
13
+
14
+ # # Configure logging first
15
+ # logging.basicConfig(
16
+ # level=logging.INFO,
17
+ # format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
18
+ # )
19
+ # logger = logging.getLogger(__name__)
20
+
21
+ # # Hugging Face specific configuration
22
+ # is_hf_space = os.getenv('SPACE_ID') is not None
23
+ # if is_hf_space:
24
+ # logger.info("Running in Hugging Face Spaces environment")
25
+ # # Allow insecure transport for development in HF
26
+ # os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
27
+ # # Ensure port is set correctly
28
+ # os.environ['PORT'] = '7860'
29
+
30
+
31
+
32
+ # app = Flask(__name__)
33
+ # app.secret_key = os.getenv('SECRET_KEY', 'dev_key_for_testing')
34
+
35
+ # # Configure session for HF
36
+ # if is_hf_space:
37
+ # app.config['SESSION_COOKIE_SECURE'] = False
38
+ # app.config['SESSION_COOKIE_HTTPONLY'] = True
39
+ # app.config['SESSION_COOKIE_SAMESITE'] = None # Add this line
40
+ # app.config['PERMANENT_SESSION_LIFETIME'] = 86400 # 24 hours
41
+
42
+ # # Directory paths
43
+ # VIDEO_DIR = os.path.abspath("data/videos")
44
+ # ANNOTATIONS_DIR = os.path.abspath("data/annotations")
45
+ # TEMP_DIR = os.path.abspath("data/temp")
46
+ # WORD_TIMESTAMPS_DIR = os.path.abspath("data/word_timestamps")
47
+ # ALIGNMENTS_DIR = os.path.abspath("data/alignments")
48
+ # TRANSCRIPTS_DIR = os.path.abspath("data/transcripts")
49
+
50
+ # # Ensure all required directories exist
51
+ # for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALIGNMENTS_DIR, TRANSCRIPTS_DIR]:
52
+ # os.makedirs(directory, exist_ok=True)
53
+
54
+ # # Global dictionaries for progress tracking
55
+ # clip_extraction_status = {}
56
+ # transcription_progress_status = {}
57
+
58
+ # # Graceful shutdown handler
59
+ # def graceful_shutdown(signum, frame):
60
+ # """Handle graceful shutdown on signals."""
61
+ # logger.info(f"Received signal {signum}, shutting down gracefully...")
62
+ # # Clean up as needed here
63
+ # sys.exit(0)
64
+
65
+ # # Register signal handlers
66
+ # signal.signal(signal.SIGTERM, graceful_shutdown)
67
+ # signal.signal(signal.SIGINT, graceful_shutdown)
68
+
69
+ # # Login required decorator
70
+ # def login_required(f):
71
+ # from functools import wraps
72
+ # @wraps(f)
73
+ # def decorated_function(*args, **kwargs):
74
+ # if 'user' not in session:
75
+ # logger.info(f"User not in session, redirecting to login")
76
+ # return redirect(url_for('login'))
77
+ # return f(*args, **kwargs)
78
+ # return decorated_function
79
+
80
+ # # Allow specific users (for testing)
81
+ # def is_allowed_user(username):
82
+ # allowed_users_env = os.getenv('ALLOWED_USERS', 'Perilon') # Default to your username
83
+ # allowed_users = [user.strip() for user in allowed_users_env.split(',')]
84
+ # return username in allowed_users or not is_hf_space # Allow all users in local dev
85
+
86
+ # def update_extraction_progress(video_id, current, total):
87
+ # percent = int((current / total) * 100)
88
+ # clip_extraction_status[video_id] = {"current": current, "total": total, "percent": percent}
89
+
90
+ # def run_clip_extraction(video_id):
91
+ # try:
92
+ # base_dir = app.root_path
93
+ # extractor = ClipExtractor(base_dir)
94
+ # extractor.extract_clips_from_annotations(
95
+ # video_id,
96
+ # progress_callback=lambda current, total: update_extraction_progress(video_id, current, total)
97
+ # )
98
+ # if video_id in clip_extraction_status:
99
+ # status = clip_extraction_status[video_id]
100
+ # if status.get("percent", 0) < 100:
101
+ # update_extraction_progress(video_id, status["total"], status["total"])
102
+ # else:
103
+ # update_extraction_progress(video_id, 1, 1)
104
+ # except Exception as e:
105
+ # logger.error(f"Error during clip extraction for {video_id}: {str(e)}")
106
+ # clip_extraction_status[video_id] = {"error": str(e)}
107
+
108
+ # def run_transcription(video_id):
109
+ # try:
110
+ # base_dir = app.root_path
111
+ # output_path = os.path.join(WORD_TIMESTAMPS_DIR, f"{video_id}_word_timestamps.json")
112
+
113
+ # # Check if transcription already exists and is valid.
114
+ # if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
115
+ # logger.info(f"Using cached transcription for video {video_id}.")
116
+ # transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
117
+ # return
118
+
119
+ # video_path = os.path.join(base_dir, "data", "videos", f"{video_id}.mp4")
120
+ # transcription_progress_status[video_id] = {"status": "started", "percent": 10}
121
+
122
+ # # Check if AWS credentials are available
123
+ # if not os.environ.get('AWS_ACCESS_KEY_ID') or not os.environ.get('AWS_SECRET_ACCESS_KEY'):
124
+ # logger.warning("AWS credentials not found. Transcription will not work properly.")
125
+ # transcription_progress_status[video_id] = {
126
+ # "status": "error",
127
+ # "percent": 0,
128
+ # "message": "AWS credentials missing"
129
+ # }
130
+ # return
131
+
132
+ # # Run transcription via the imported function from get_transcription_with_amazon.py
133
+ # from get_transcription_with_amazon import get_word_timestamps
134
+ # word_timestamps = get_word_timestamps(video_path)
135
+
136
+ # with open(output_path, "w") as f:
137
+ # json.dump(word_timestamps, f, indent=4)
138
+
139
+ # transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
140
+ # except Exception as e:
141
+ # logger.error(f"Error during transcription for {video_id}: {str(e)}")
142
+ # transcription_progress_status[video_id] = {"status": "error", "percent": 0, "message": str(e)}
143
+
144
+ # # Authentication routes
145
+ # @app.route('/login')
146
+ # def login():
147
+ # """Handle login for both local and HF environments."""
148
+ # logger.info(f"Login route called. Headers: {dict(request.headers)}")
149
+
150
+ # if is_hf_space:
151
+ # username = request.headers.get('X-Spaces-Username')
152
+ # logger.info(f"Username from headers in login: {username}")
153
+
154
+ # if username and is_allowed_user(username):
155
+ # session['user'] = {'name': username, 'is_hf': True}
156
+ # return redirect(url_for('index'))
157
+ # else:
158
+ # # Redirect to the HF auth endpoint
159
+ # return redirect('/auth')
160
+ # else:
161
+ # # For local development
162
+ # session['user'] = {'name': 'LocalDeveloper', 'is_mock': True}
163
+ # return redirect(url_for('index'))
164
+
165
+ # @app.route('/auth/callback')
166
+ # def auth_callback():
167
+ # """This route will be called by Hugging Face after successful authentication."""
168
+ # logger.info(f"Auth callback called. Headers: {dict(request.headers)}")
169
+
170
+ # if is_hf_space:
171
+ # # In Hugging Face Spaces, the user info is available in the request headers
172
+ # username = request.headers.get('X-Spaces-Username')
173
+ # if username:
174
+ # session['user'] = {'name': username, 'is_hf': True}
175
+ # return redirect(url_for('index'))
176
+ # else:
177
+ # return render_template('error.html', message="Authentication failed. No username provided.")
178
+ # return redirect(url_for('login'))
179
+
180
+ # # Replace the health check route with this improved version
181
+ # @app.route('/health')
182
+ # def health_check():
183
+ # """Health check endpoint for container verification."""
184
+ # # Log environment variables for debugging
185
+ # env_vars = {
186
+ # "FLASK_ENV": os.environ.get('FLASK_ENV', 'production'),
187
+ # "DEBUG": os.environ.get('DEBUG', 'Not set'),
188
+ # "SPACE_ID": os.environ.get('SPACE_ID', 'Not set'),
189
+ # "BYPASS_AUTH": os.environ.get('BYPASS_AUTH', 'Not set'),
190
+ # "SECRET_KEY": os.environ.get('SECRET_KEY', 'Not set')[:5] + '...' if os.environ.get('SECRET_KEY') else 'Not set'
191
+ # }
192
+
193
+ # logger.info(f"Health check called. Environment: {env_vars}")
194
+
195
+ # # Get session information for debugging
196
+ # session_info = dict(session) if session else None
197
+ # session_keys = list(session.keys()) if session else []
198
+
199
+ # return jsonify({
200
+ # "status": "healthy",
201
+ # "environment": env_vars,
202
+ # "session_keys": session_keys,
203
+ # "is_hf_space": is_hf_space,
204
+ # "bypass_auth": bypass_auth,
205
+ # "directories": {
206
+ # "videos": os.path.exists(VIDEO_DIR),
207
+ # "annotations": os.path.exists(ANNOTATIONS_DIR),
208
+ # "temp": os.path.exists(TEMP_DIR)
209
+ # }
210
+ # })
211
+
212
+ # @app.route('/auth')
213
+ # def auth():
214
+ # """This route handles HF authentication."""
215
+ # logger.info(f"Auth route called. Headers: {dict(request.headers)}")
216
+
217
+ # # Force bypass auth to be true for debugging
218
+ # bypass_auth = True
219
+
220
+ # # If bypass is enabled, authenticate immediately
221
+ # if bypass_auth:
222
+ # logger.info("Auth bypass enabled, setting default user")
223
+ # session['user'] = {'name': 'Perilon', 'is_hf': True}
224
+ # return redirect(url_for('index'))
225
+
226
+ # # Normal authentication logic
227
+ # username = request.headers.get('X-Spaces-Username')
228
+ # logger.info(f"Username from headers in auth: {username}")
229
+
230
+ # if is_hf_space and username and is_allowed_user(username):
231
+ # logger.info(f"Setting user in session: {username}")
232
+ # session['user'] = {'name': username, 'is_hf': True}
233
+ # return redirect(url_for('index'))
234
+ # elif not is_hf_space:
235
+ # # For local development
236
+ # session['user'] = {'name': 'LocalDeveloper', 'is_mock': True}
237
+ # return redirect(url_for('index'))
238
+ # else:
239
+ # # For HF with no valid username yet
240
+ # return render_template('error.html', message=
241
+ # "Waiting for Hugging Face authentication. If you continue to see this message, "
242
+ # "please make sure you're logged into Hugging Face and your username is allowed.")
243
+
244
+ # @app.before_request
245
+ # def check_auth():
246
+ # """Check authentication before processing requests."""
247
+ # # Skip authentication for certain routes and static files
248
+ # if request.path in ['/login', '/logout', '/auth', '/auth/callback', '/debug', '/health'] or request.path.startswith('/static/'):
249
+ # return
250
+
251
+ # # Force bypass auth to be true for debugging
252
+ # bypass_auth = True
253
+
254
+ # # Log all request paths to help troubleshoot
255
+ # logger.debug(f"Request path: {request.path}, User in session: {'user' in session}")
256
+
257
+ # if bypass_auth:
258
+ # # Set default user for bypass mode if not already set
259
+ # if 'user' not in session:
260
+ # session['user'] = {'name': 'Perilon', 'is_hf': True}
261
+ # return
262
+
263
+ # if is_hf_space:
264
+ # # Check for HF username header
265
+ # username = request.headers.get('X-Spaces-Username')
266
+
267
+ # if 'user' in session:
268
+ # logger.debug(f"User in session: {session['user']}")
269
+ # return
270
+
271
+ # if username and is_allowed_user(username):
272
+ # logger.info(f"Setting user from headers: {username}")
273
+ # session['user'] = {'name': username, 'is_hf': True}
274
+ # return
275
+
276
+ # # No valid user in session or headers
277
+ # logger.info(f"No authenticated user, redirecting to /auth")
278
+ # return redirect('/auth')
279
+ # elif 'user' not in session:
280
+ # return redirect(url_for('login'))
281
+
282
+ # @app.route('/logout')
283
+ # def logout():
284
+ # """Clear session and redirect to login."""
285
+ # session.clear() # Clear the entire session
286
+ # if is_hf_space:
287
+ # return redirect('/auth/logout')
288
+ # return redirect(url_for('login'))
289
+
290
+ # @app.route('/debug')
291
+ # def debug_info():
292
+ # """Return debug information."""
293
+ # cookies = {key: request.cookies.get(key) for key in request.cookies.keys()}
294
+
295
+ # info = {
296
+ # "session": dict(session) if session else None,
297
+ # "headers": dict(request.headers),
298
+ # "cookies": cookies,
299
+ # "is_hf_space": is_hf_space,
300
+ # "allowed_users": os.getenv('ALLOWED_USERS', 'Perilon'),
301
+ # "app_config": {k: str(v) for k, v in app.config.items() if k in
302
+ # ['SESSION_COOKIE_SECURE', 'SESSION_COOKIE_HTTPONLY',
303
+ # 'SESSION_COOKIE_SAMESITE', 'PERMANENT_SESSION_LIFETIME']},
304
+ # }
305
+ # return jsonify(info)
306
+
307
+ # # Main application routes
308
+ # @app.route('/')
309
+ # @login_required
310
+ # def index():
311
+ # """Main entry point, redirects to video selection."""
312
+ # return redirect(url_for('select_video'))
313
+
314
+ # @app.route('/select_video')
315
+ # @login_required
316
+ # def select_video():
317
+ # """Page to select a video for annotation."""
318
+ # if not os.path.exists(VIDEO_DIR):
319
+ # return render_template('error.html', message="Video directory not found.")
320
+ # videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith('.mp4')]
321
+ # video_ids = [os.path.splitext(v)[0] for v in videos]
322
+ # return render_template('select_video.html', video_ids=video_ids, user=session.get('user'))
323
+
324
+ # @app.route('/player/<video_id>')
325
+ # @login_required
326
+ # def player(video_id):
327
+ # """Video player page for annotation."""
328
+ # return render_template('player.html', video_id=video_id, user=session.get('user'))
329
+
330
+ # @app.route('/videos')
331
+ # @login_required
332
+ # def get_videos():
333
+ # """API endpoint to get available videos."""
334
+ # if not os.path.exists(VIDEO_DIR):
335
+ # return jsonify({'error': 'Video directory not found'}), 404
336
+ # videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith(('.mp4', '.avi', '.mov'))]
337
+ # if not videos:
338
+ # return jsonify({'error': 'No videos found'}), 404
339
+ # return jsonify(videos)
340
+
341
+ # @app.route('/video/<path:filename>')
342
+ # @login_required
343
+ # def serve_video(filename):
344
+ # """Serve a video file."""
345
+ # if not os.path.exists(os.path.join(VIDEO_DIR, filename)):
346
+ # return jsonify({'error': 'Video not found'}), 404
347
+ # return send_from_directory(VIDEO_DIR, filename)
348
+
349
+ # @app.route('/save_annotations', methods=['POST'])
350
+ # @login_required
351
+ # def save_annotations():
352
+ # """Save annotation data."""
353
+ # data = request.json
354
+ # if not data or 'video' not in data or 'timestamps' not in data:
355
+ # return jsonify({'success': False, 'message': 'Invalid data'}), 400
356
+
357
+ # annotation_file = os.path.join(ANNOTATIONS_DIR, f"{data['video']}_annotations.json")
358
+ # annotation_data = {
359
+ # "video_name": data['video'] + ".mp4",
360
+ # "timestamps": sorted(data['timestamps']),
361
+ # "annotation_date": datetime.now().isoformat(),
362
+ # "annotated_by": session.get('user', {}).get('name', 'unknown')
363
+ # }
364
+ # with open(annotation_file, 'w') as f:
365
+ # json.dump(annotation_data, f, indent=4)
366
+ # return jsonify({'success': True, 'message': 'Annotations saved successfully'})
367
+
368
+ # @app.route('/get_annotations/<path:video_name>')
369
+ # @login_required
370
+ # def get_annotations(video_name):
371
+ # """Get annotations for a video."""
372
+ # annotation_file = os.path.join(ANNOTATIONS_DIR, f"{video_name}_annotations.json")
373
+ # if not os.path.exists(annotation_file):
374
+ # return jsonify({'error': 'No annotations found'}), 404
375
+ # with open(annotation_file, 'r') as f:
376
+ # annotations = json.load(f)
377
+ # return jsonify(annotations)
378
+
379
+ # @app.route("/alignment/<video_id>")
380
+ # @login_required
381
+ # def alignment_mode(video_id):
382
+ # """Page for aligning sign language with transcribed text."""
383
+ # annotation_file = os.path.join(ANNOTATIONS_DIR, f"{video_id}_annotations.json")
384
+ # if not os.path.exists(annotation_file):
385
+ # return render_template("error.html", message="No annotations found for this video. Please annotate the video first.")
386
+ # with open(annotation_file, 'r') as f:
387
+ # annotations = json.load(f)
388
+ # return render_template(
389
+ # "alignment.html",
390
+ # video_id=video_id,
391
+ # total_clips=len(annotations['timestamps']) - 1,
392
+ # user=session.get('user')
393
+ # )
394
+
395
+ # @app.route("/api/transcript/<video_id>")
396
+ # @login_required
397
+ # def get_transcript(video_id):
398
+ # """Get transcript for a video."""
399
+ # timestamps_file = os.path.join(WORD_TIMESTAMPS_DIR, f"{video_id}_word_timestamps.json")
400
+ # logger.info(f"Attempting to load word timestamps from: {timestamps_file}")
401
+ # if not os.path.exists(timestamps_file):
402
+ # logger.warning(f"Word timestamps file not found: {timestamps_file}")
403
+ # return jsonify({
404
+ # "status": "error",
405
+ # "message": "No word timestamps found for this video"
406
+ # }), 404
407
+ # try:
408
+ # with open(timestamps_file, 'r') as f:
409
+ # word_data = json.load(f)
410
+ # full_text = " ".join(item["punctuated_word"] for item in word_data)
411
+ # words_with_times = [{
412
+ # "word": item["punctuated_word"],
413
+ # "start": float(item["start_time"]),
414
+ # "end": float(item["end_time"])
415
+ # } for item in word_data]
416
+ # logger.info(f"Successfully created transcript ({len(full_text)} characters)")
417
+ # return jsonify({
418
+ # "status": "success",
419
+ # "text": full_text,
420
+ # "words": words_with_times
421
+ # })
422
+ # except Exception as e:
423
+ # logger.error(f"Error processing word timestamps: {str(e)}")
424
+ # return jsonify({
425
+ # "status": "error",
426
+ # "message": f"Error processing word timestamps: {str(e)}"
427
+ # }), 500
428
+
429
+ # @app.route("/api/word_timestamps/<video_id>")
430
+ # @login_required
431
+ # def get_word_timestamps(video_id):
432
+ # """Get word-level timestamps for a video."""
433
+ # timestamps_file = os.path.join(WORD_TIMESTAMPS_DIR, f"{video_id}_word_timestamps.json")
434
+ # logger.info(f"Attempting to load word timestamps from: {timestamps_file}")
435
+ # if not os.path.exists(timestamps_file):
436
+ # logger.warning(f"Word timestamps file not found: {timestamps_file}")
437
+ # return jsonify({
438
+ # "status": "error",
439
+ # "message": "No word timestamps found for this video"
440
+ # }), 404
441
+ # try:
442
+ # with open(timestamps_file, 'r') as f:
443
+ # word_data = json.load(f)
444
+ # logger.info(f"Successfully loaded {len(word_data)} word timestamps")
445
+ # return jsonify({
446
+ # "status": "success",
447
+ # "words": word_data
448
+ # })
449
+ # except Exception as e:
450
+ # logger.error(f"Error processing word timestamps: {str(e)}")
451
+ # return jsonify({
452
+ # "status": "error",
453
+ # "message": f"Error processing word timestamps: {str(e)}"
454
+ # }), 500
455
+
456
+ # @app.route("/api/clips/<video_id>")
457
+ # @login_required
458
+ # def get_video_clips(video_id):
459
+ # """Get clips for a video."""
460
+ # try:
461
+ # annotation_file = os.path.join(ANNOTATIONS_DIR, f"{video_id}_annotations.json")
462
+ # if not os.path.exists(annotation_file):
463
+ # raise FileNotFoundError("Annotations not found")
464
+ # with open(annotation_file, 'r') as f:
465
+ # annotations = json.load(f)
466
+ # timestamps = annotations['timestamps']
467
+ # clips = []
468
+ # for i in range(len(timestamps)-1):
469
+ # clips.append({
470
+ # "index": i,
471
+ # "start": timestamps[i],
472
+ # "end": timestamps[i+1],
473
+ # "path": f"/clip/{video_id}/{i}"
474
+ # })
475
+ # return jsonify({
476
+ # "status": "success",
477
+ # "clips": clips
478
+ # })
479
+ # except Exception as e:
480
+ # logger.error(f"Error getting clips: {str(e)}")
481
+ # return jsonify({
482
+ # "status": "error",
483
+ # "message": str(e)
484
+ # }), 500
485
+
486
+ # @app.route("/clip/<video_id>/<int:clip_index>")
487
+ # @login_required
488
+ # def serve_clip(video_id, clip_index):
489
+ # """Serve a specific clip."""
490
+ # clip_path = os.path.join(
491
+ # TEMP_DIR,
492
+ # f"{video_id}_clip_{clip_index:03d}.mp4"
493
+ # )
494
+ # logger.info(f"Attempting to serve clip: {clip_path}")
495
+ # if not os.path.exists(clip_path):
496
+ # logger.error(f"Clip not found: {clip_path}")
497
+ # return jsonify({
498
+ # "status": "error",
499
+ # "message": "Clip not found"
500
+ # }), 404
501
+ # return send_file(clip_path, mimetype="video/mp4")
502
+
503
+ # @app.route("/api/save_alignments", methods=["POST"])
504
+ # @login_required
505
+ # def save_alignments():
506
+ # """Save alignment data."""
507
+ # try:
508
+ # data = request.json
509
+ # if not data or 'video_id' not in data or 'alignments' not in data:
510
+ # return jsonify({'success': False, 'message': 'Invalid data'}), 400
511
+
512
+ # # Add user information to the alignments
513
+ # for alignment in data['alignments']:
514
+ # if alignment:
515
+ # alignment['aligned_by'] = session.get('user', {}).get('name', 'unknown')
516
+
517
+ # output_path = os.path.join(ALIGNMENTS_DIR, f"{data['video_id']}.json")
518
+ # with open(output_path, "w") as f:
519
+ # json.dump(data['alignments'], f, indent=2)
520
+ # return jsonify({
521
+ # "success": True,
522
+ # "message": "Alignments saved successfully"
523
+ # })
524
+ # except Exception as e:
525
+ # logger.error(f"Error saving alignments: {str(e)}")
526
+ # return jsonify({
527
+ # "success": False,
528
+ # "message": str(e)
529
+ # }), 500
530
+
531
+ # @app.route("/api/extract_clips/<video_id>")
532
+ # @login_required
533
+ # def extract_clips_for_video(video_id):
534
+ # """Extract clips and start transcription for a video."""
535
+ # status = clip_extraction_status.get(video_id, {})
536
+ # if status.get("percent", 0) < 100:
537
+ # thread = threading.Thread(target=run_clip_extraction, args=(video_id,))
538
+ # thread.start()
539
+ # if video_id not in transcription_progress_status or transcription_progress_status.get(video_id, {}).get("percent", 0) < 100:
540
+ # thread_trans = threading.Thread(target=run_transcription, args=(video_id,))
541
+ # thread_trans.start()
542
+ # return jsonify({"status": "started"})
543
+
544
+ # @app.route("/api/clip_progress/<video_id>")
545
+ # @login_required
546
+ # def clip_progress(video_id):
547
+ # """Get clip extraction progress."""
548
+ # progress = clip_extraction_status.get(video_id, {"current": 0, "total": 0, "percent": 0})
549
+ # return jsonify(progress)
550
+
551
+ # @app.route("/api/transcription_progress/<video_id>")
552
+ # @login_required
553
+ # def transcription_progress(video_id):
554
+ # """Get transcription progress."""
555
+ # progress = transcription_progress_status.get(video_id, {"status": "not started", "percent": 0})
556
+ # return jsonify(progress)
557
+
558
+ # if __name__ == '__main__':
559
+ # try:
560
+ # # Print diagnostic information
561
+ # print("=" * 50)
562
+ # print(f"Starting app with configuration:")
563
+ # print(f"- Running in HF Space: {is_hf_space}")
564
+ # print(f"- Auth bypass: {bypass_auth}")
565
+ # print(f"- Port: {os.getenv('PORT', 5000)}")
566
+ # print(f"- Available videos: {os.listdir(VIDEO_DIR) if os.path.exists(VIDEO_DIR) else 'None'}")
567
+ # print("=" * 50)
568
+
569
+ # port = int(os.getenv('PORT', 5000))
570
+ # app.run(host='0.0.0.0', port=port, debug=True)
571
+ # except Exception as e:
572
+ # print(f"Error starting the application: {e}")
573
+ # import traceback
574
+ # traceback.print_exc()
575
+
576
+
577
  from flask import Flask, render_template, jsonify, request, send_from_directory, send_file, redirect, url_for, session
578
  import os, json, threading, time, signal, sys
579
  from datetime import datetime
580
  from extract_signed_segments_from_annotations import ClipExtractor, VideoClip
581
  import logging
582
  from dotenv import load_dotenv
583
+ import boto3
584
+ from botocore.exceptions import ClientError
585
+ import tempfile
586
+ import uuid
587
+ import requests
588
+ from urllib.parse import urlparse
589
 
590
  # Load environment variables
591
  load_dotenv()
 
629
  ALIGNMENTS_DIR = os.path.abspath("data/alignments")
630
  TRANSCRIPTS_DIR = os.path.abspath("data/transcripts")
631
 
632
+ # S3 configuration
633
# Each S3 setting can be overridden through an environment variable of the
# same name; the literals below are the fallbacks used when it is unset.
USE_S3_FOR_VIDEOS = os.environ.get('USE_S3_FOR_VIDEOS', 'true').lower() == 'true'
S3_BUCKET = os.environ.get('S3_BUCKET', "sorenson-ai-sb-scratch")
S3_VIDEO_PREFIX = os.environ.get(
    'S3_VIDEO_PREFIX',
    "awilkinson/kylie_dataset_videos_for_alignment_webapp/",
)
636
+
637
  # Ensure all required directories exist
638
  for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALIGNMENTS_DIR, TRANSCRIPTS_DIR]:
639
  os.makedirs(directory, exist_ok=True)
 
642
  clip_extraction_status = {}
643
  transcription_progress_status = {}
644
 
645
+ # S3 helper functions
646
def get_s3_client():
    """Build a boto3 S3 client from the AWS_* environment variables.

    The region falls back to 'us-west-2' when AWS_DEFAULT_REGION is unset.
    The access key and secret are passed through exactly as read from the
    environment (``None`` when unset).
    """
    env = os.environ
    client_kwargs = {
        'region_name': env.get('AWS_DEFAULT_REGION', 'us-west-2'),
        'aws_access_key_id': env.get('AWS_ACCESS_KEY_ID'),
        'aws_secret_access_key': env.get('AWS_SECRET_ACCESS_KEY'),
    }
    return boto3.client('s3', **client_kwargs)
654
+
655
def list_s3_videos():
    """List the IDs of every .mp4 video stored under the configured S3 prefix.

    Returns:
        list: Video IDs, i.e. object basenames without the ``.mp4`` extension.
        An empty list is returned when no objects are found or when S3
        reports a client error (the error is logged, not raised).
    """
    try:
        s3_client = get_s3_client()
        # A single list_objects_v2 call returns at most 1000 keys, so walk
        # every page instead of reading only the first response.
        paginator = s3_client.get_paginator('list_objects_v2')
        pages = paginator.paginate(Bucket=S3_BUCKET, Prefix=S3_VIDEO_PREFIX)

        videos = []
        found_objects = False
        for page in pages:
            for item in page.get('Contents', []):
                found_objects = True
                key = item['Key']
                if key.endswith('.mp4'):
                    # Keep just the file name without its extension.
                    videos.append(os.path.splitext(os.path.basename(key))[0])

        if not found_objects:
            logger.warning(f"No videos found in S3 bucket {S3_BUCKET} with prefix {S3_VIDEO_PREFIX}")

        return videos
    except ClientError as e:
        logger.error(f"Error listing videos from S3: {str(e)}")
        return []
682
+
683
def download_video_from_s3(video_id):
    """Download a video from S3 into the local videos directory.

    Args:
        video_id: Video file name without the ``.mp4`` extension.

    Returns:
        The local path of the video (already present or freshly downloaded),
        or ``None`` when the ID is not a plain file name or S3 reports a
        client error.
    """
    # video_id reaches this function from URL routes, so refuse anything that
    # is not a plain file name; otherwise the join below could point outside
    # VIDEO_DIR (e.g. "../../etc/x").
    is_plain_name = (
        bool(video_id)
        and video_id not in ('.', '..')
        and '\\' not in video_id
        and os.path.basename(video_id) == video_id
    )
    if not is_plain_name:
        logger.error(f"Rejected invalid video id: {video_id!r}")
        return None

    video_filename = f"{video_id}.mp4"
    s3_key = f"{S3_VIDEO_PREFIX}{video_filename}"
    local_path = os.path.join(VIDEO_DIR, video_filename)

    # Reuse the local copy if an earlier call already fetched it.
    if os.path.exists(local_path):
        logger.info(f"Video {video_id} already exists locally.")
        return local_path

    try:
        logger.info(f"Downloading video {video_id} from S3...")
        s3_client = get_s3_client()
        s3_client.download_file(S3_BUCKET, s3_key, local_path)
        logger.info(f"Video {video_id} downloaded successfully to {local_path}")
        return local_path
    except ClientError as e:
        logger.error(f"Error downloading video from S3: {str(e)}")
        return None
703
+
704
def generate_presigned_url(video_id, expiration=3600):
    """Return a presigned GET URL for ``<video_id>.mp4`` under the S3 prefix.

    Args:
        video_id: Video file name without the ``.mp4`` extension.
        expiration: Value passed to boto3 as ``ExpiresIn`` (default 3600).

    Returns:
        The presigned URL, or ``None`` when boto3 raises a ``ClientError``
        (the error is logged).
    """
    object_key = f"{S3_VIDEO_PREFIX}{video_id}.mp4"
    request_params = {'Bucket': S3_BUCKET, 'Key': object_key}

    try:
        return get_s3_client().generate_presigned_url(
            'get_object',
            Params=request_params,
            ExpiresIn=expiration,
        )
    except ClientError as e:
        logger.error(f"Error generating presigned URL: {str(e)}")
        return None
720
+
721
  # Graceful shutdown handler
722
  def graceful_shutdown(signum, frame):
723
  """Handle graceful shutdown on signals."""
 
779
  transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
780
  return
781
 
782
+ # Download video from S3 if needed
783
+ if USE_S3_FOR_VIDEOS:
784
+ video_path = download_video_from_s3(video_id)
785
+ if not video_path:
786
+ transcription_progress_status[video_id] = {
787
+ "status": "error",
788
+ "percent": 0,
789
+ "message": f"Failed to download video {video_id} from S3"
790
+ }
791
+ return
792
+ else:
793
+ video_path = os.path.join(base_dir, "data", "videos", f"{video_id}.mp4")
794
+
795
  transcription_progress_status[video_id] = {"status": "started", "percent": 10}
796
 
797
  # Check if AWS credentials are available
 
852
  return render_template('error.html', message="Authentication failed. No username provided.")
853
  return redirect(url_for('login'))
854
 
 
855
  @app.route('/health')
856
  def health_check():
857
  """Health check endpoint for container verification."""
 
861
  "DEBUG": os.environ.get('DEBUG', 'Not set'),
862
  "SPACE_ID": os.environ.get('SPACE_ID', 'Not set'),
863
  "BYPASS_AUTH": os.environ.get('BYPASS_AUTH', 'Not set'),
864
+ "SECRET_KEY": os.environ.get('SECRET_KEY', 'Not set')[:5] + '...' if os.environ.get('SECRET_KEY') else 'Not set',
865
+ "S3_BUCKET": os.environ.get('S3_BUCKET', 'Not set'),
866
+ "S3_VIDEO_PREFIX": os.environ.get('S3_VIDEO_PREFIX', 'Not set'),
867
+ "USE_S3_FOR_VIDEOS": os.environ.get('USE_S3_FOR_VIDEOS', 'Not set')
868
  }
869
 
870
  logger.info(f"Health check called. Environment: {env_vars}")
 
978
  "app_config": {k: str(v) for k, v in app.config.items() if k in
979
  ['SESSION_COOKIE_SECURE', 'SESSION_COOKIE_HTTPONLY',
980
  'SESSION_COOKIE_SAMESITE', 'PERMANENT_SESSION_LIFETIME']},
981
+ "s3_config": {
982
+ "S3_BUCKET": S3_BUCKET,
983
+ "S3_VIDEO_PREFIX": S3_VIDEO_PREFIX,
984
+ "USE_S3_FOR_VIDEOS": USE_S3_FOR_VIDEOS
985
+ }
986
  }
987
  return jsonify(info)
988
 
 
997
  @login_required
998
  def select_video():
999
  """Page to select a video for annotation."""
1000
+ if USE_S3_FOR_VIDEOS:
1001
+ video_ids = list_s3_videos()
1002
+ else:
1003
+ if not os.path.exists(VIDEO_DIR):
1004
+ return render_template('error.html', message="Video directory not found.")
1005
+ videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith('.mp4')]
1006
+ video_ids = [os.path.splitext(v)[0] for v in videos]
1007
+
1008
  return render_template('select_video.html', video_ids=video_ids, user=session.get('user'))
1009
 
1010
  @app.route('/player/<video_id>')
 
1017
  @login_required
1018
  def get_videos():
1019
  """API endpoint to get available videos."""
1020
+ if USE_S3_FOR_VIDEOS:
1021
+ videos = list_s3_videos()
1022
+ if not videos:
1023
+ return jsonify({'error': 'No videos found in S3'}), 404
1024
+ # Return just the filenames with .mp4 extension for compatibility
1025
+ return jsonify([f"{vid}.mp4" for vid in videos])
1026
+ else:
1027
+ # Original local file behavior
1028
+ if not os.path.exists(VIDEO_DIR):
1029
+ return jsonify({'error': 'Video directory not found'}), 404
1030
+ videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith(('.mp4', '.avi', '.mov'))]
1031
+ if not videos:
1032
+ return jsonify({'error': 'No videos found'}), 404
1033
+ return jsonify(videos)
1034
 
1035
  @app.route('/video/<path:filename>')
1036
  @login_required
1037
  def serve_video(filename):
1038
+ """Serve a video file from S3 or local storage."""
1039
+ video_id = os.path.splitext(filename)[0] # Remove extension
1040
+
1041
+ if USE_S3_FOR_VIDEOS:
1042
+ # Option 1: Generate a presigned URL and redirect
1043
+ presigned_url = generate_presigned_url(video_id)
1044
+ if presigned_url:
1045
+ return redirect(presigned_url)
1046
+
1047
+ # Option 2 (fallback): Download from S3 to local temporary storage and serve
1048
+ local_path = download_video_from_s3(video_id)
1049
+ if local_path and os.path.exists(local_path):
1050
+ return send_from_directory(VIDEO_DIR, filename)
1051
+
1052
+ return jsonify({'error': 'Video not found in S3'}), 404
1053
+ else:
1054
+ # Original local file behavior
1055
+ if not os.path.exists(os.path.join(VIDEO_DIR, filename)):
1056
+ return jsonify({'error': 'Video not found'}), 404
1057
+ return send_from_directory(VIDEO_DIR, filename)
1058
 
1059
  @app.route('/save_annotations', methods=['POST'])
1060
  @login_required
 
1242
  @login_required
1243
  def extract_clips_for_video(video_id):
1244
  """Extract clips and start transcription for a video."""
1245
+ # If using S3, ensure the video is downloaded first
1246
+ if USE_S3_FOR_VIDEOS:
1247
+ video_path = download_video_from_s3(video_id)
1248
+ if not video_path:
1249
+ return jsonify({
1250
+ "status": "error",
1251
+ "message": f"Failed to download video {video_id} from S3"
1252
+ }), 404
1253
+
1254
  status = clip_extraction_status.get(video_id, {})
1255
  if status.get("percent", 0) < 100:
1256
  thread = threading.Thread(target=run_clip_extraction, args=(video_id,))
 
1282
  print(f"- Running in HF Space: {is_hf_space}")
1283
  print(f"- Auth bypass: {bypass_auth}")
1284
  print(f"- Port: {os.getenv('PORT', 5000)}")
1285
+ print(f"- S3 for videos: {USE_S3_FOR_VIDEOS}")
1286
+ print(f"- S3 bucket: {S3_BUCKET}")
1287
+ print(f"- S3 prefix: {S3_VIDEO_PREFIX}")
1288
  print(f"- Available videos: {os.listdir(VIDEO_DIR) if os.path.exists(VIDEO_DIR) else 'None'}")
1289
+ if USE_S3_FOR_VIDEOS:
1290
+ try:
1291
+ s3_videos = list_s3_videos()
1292
+ print(f"- Available S3 videos: {s3_videos if s3_videos else 'None'}")
1293
+ except Exception as e:
1294
+ print(f"- Error listing S3 videos: {str(e)}")
1295
  print("=" * 50)
1296
 
1297
  port = int(os.getenv('PORT', 5000))
get_transcription_with_amazon.py CHANGED
@@ -8,6 +8,41 @@ import requests
8
  import time
9
  from decimal import Decimal
10
  from typing import Any, Dict, List
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def extract_audio(video_path: str) -> str:
13
  """Extract audio from video file using ffmpeg.
@@ -113,17 +148,21 @@ def main() -> None:
113
  )
114
  video_filename = args.video_id + ".mp4" # Source video file (with .mp4)
115
  video_path = os.path.join(base_dir, "data", "videos", video_filename)
 
 
 
 
 
 
 
 
 
 
 
116
  word_timestamps = get_word_timestamps(video_path)
117
  output_dir = os.path.join(base_dir, "data", "word_timestamps")
118
  os.makedirs(output_dir, exist_ok=True)
119
  output_path = os.path.join(output_dir, args.video_id + "_word_timestamps.json")
120
  with open(output_path, "w") as f:
121
  json.dump(word_timestamps, f, indent=4)
122
- print(f"Word timestamps saved to: {output_path}")
123
-
124
- if __name__ == "__main__":
125
- import argparse
126
- parser = argparse.ArgumentParser(description="Get word timestamps for a given video file ID.")
127
- parser.add_argument("video_id", help="Video file ID (without extension)")
128
- args = parser.parse_args()
129
- main()
 
8
  import time
9
  from decimal import Decimal
10
  from typing import Any, Dict, List
11
+ from botocore.exceptions import ClientError
12
+
13
# S3 settings are read from the environment so this script follows the same
# S3_BUCKET / S3_VIDEO_PREFIX / USE_S3_FOR_VIDEOS variables that the container
# configuration sets; the defaults are the previously hard-coded values.
S3_BUCKET = os.getenv('S3_BUCKET', "sorenson-ai-sb-scratch")
S3_VIDEO_PREFIX = os.getenv('S3_VIDEO_PREFIX', "awilkinson/kylie_dataset_videos_for_alignment_webapp/")
# Any value other than "true" (case-insensitive) switches to local files only.
USE_S3_FOR_VIDEOS = os.getenv('USE_S3_FOR_VIDEOS', 'true').lower() == 'true'
16
+
17
def get_s3_client():
    """Create a boto3 S3 client configured from the AWS_* environment variables.

    AWS_DEFAULT_REGION defaults to 'us-west-2'; the key ID and secret are
    forwarded as read from the environment (``None`` when unset).
    """
    region = os.environ.get('AWS_DEFAULT_REGION', 'us-west-2')
    key_id = os.environ.get('AWS_ACCESS_KEY_ID')
    secret = os.environ.get('AWS_SECRET_ACCESS_KEY')
    return boto3.client(
        's3',
        region_name=region,
        aws_access_key_id=key_id,
        aws_secret_access_key=secret,
    )
25
+
26
def download_video_from_s3(video_id, output_dir):
    """Fetch ``<video_id>.mp4`` from S3 into ``output_dir``.

    Args:
        video_id: Video file name without the ``.mp4`` extension.
        output_dir: Directory that receives the downloaded file.

    Returns:
        The local file path (existing or newly downloaded), or ``None`` when
        S3 reports a client error. Progress and errors are printed.
    """
    file_name = f"{video_id}.mp4"
    destination = os.path.join(output_dir, file_name)

    # Nothing to do when the file is already on disk.
    if os.path.exists(destination):
        print(f"Video {video_id} already exists locally.")
        return destination

    object_key = f"{S3_VIDEO_PREFIX}{file_name}"
    try:
        print(f"Downloading video {video_id} from S3...")
        get_s3_client().download_file(S3_BUCKET, object_key, destination)
    except ClientError as e:
        print(f"Error downloading video from S3: {str(e)}")
        return None

    print(f"Video {video_id} downloaded successfully to {destination}")
    return destination
46
 
47
  def extract_audio(video_path: str) -> str:
48
  """Extract audio from video file using ffmpeg.
 
148
  )
149
  video_filename = args.video_id + ".mp4" # Source video file (with .mp4)
150
  video_path = os.path.join(base_dir, "data", "videos", video_filename)
151
+
152
+ # Check if we need to download from S3
153
+ if USE_S3_FOR_VIDEOS and not os.path.exists(video_path):
154
+ videos_dir = os.path.join(base_dir, "data", "videos")
155
+ os.makedirs(videos_dir, exist_ok=True)
156
+ download_video_from_s3(args.video_id, videos_dir)
157
+
158
+ if not os.path.exists(video_path):
159
+ print(f"Error: Video file not found: {video_path}")
160
+ return
161
+
162
  word_timestamps = get_word_timestamps(video_path)
163
  output_dir = os.path.join(base_dir, "data", "word_timestamps")
164
  os.makedirs(output_dir, exist_ok=True)
165
  output_path = os.path.join(output_dir, args.video_id + "_word_timestamps.json")
166
  with open(output_path, "w") as f:
167
  json.dump(word_timestamps, f, indent=4)
168
+ print(f"Word timestamps saved to: {output_path}")
 
 
 
 
 
 
 
templates/player.html CHANGED
@@ -137,8 +137,10 @@
137
  // Use the provided template video_id if available; it should be the base ID (without .mp4)
138
  const templateVideoId = "{{ video_id|default('') }}";
139
  let currentVideo = "";
 
140
  if (templateVideoId) {
141
  currentVideo = templateVideoId;
 
142
  } else {
143
  // Fallback: use /videos API and remove the .mp4 extension
144
  fetch('/videos')
@@ -150,8 +152,7 @@
150
  }
151
  if (videos.length > 0) {
152
  currentVideo = videos[0].replace(/\.mp4$/, "");
153
- document.getElementById('video-source').src = `/video/${videos[0]}`;
154
- document.getElementById('video').load();
155
  }
156
  })
157
  .catch(error => {
@@ -311,6 +312,28 @@
311
  div.textContent = timestamps.map(t => t.toFixed(2)).join(', ');
312
  }
313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  video.addEventListener('timeupdate', () => {
315
  currentTimeDisplay.textContent = `Current Time: ${video.currentTime.toFixed(2)}`;
316
  });
 
137
  // Use the provided template video_id if available; it should be the base ID (without .mp4)
138
  const templateVideoId = "{{ video_id|default('') }}";
139
  let currentVideo = "";
140
+
141
  if (templateVideoId) {
142
  currentVideo = templateVideoId;
143
+ loadVideoSource(currentVideo);
144
  } else {
145
  // Fallback: use /videos API and remove the .mp4 extension
146
  fetch('/videos')
 
152
  }
153
  if (videos.length > 0) {
154
  currentVideo = videos[0].replace(/\.mp4$/, "");
155
+ loadVideoSource(currentVideo);
 
156
  }
157
  })
158
  .catch(error => {
 
312
  div.textContent = timestamps.map(t => t.toFixed(2)).join(', ');
313
  }
314
 
315
/**
 * Point the player at the server's /video/<id>.mp4 endpoint and start loading.
 *
 * The endpoint may answer with the file itself or with a redirect to a
 * presigned S3 URL; the media element follows either one on its own.
 * Probing the URL with fetch() first is avoided on purpose: fetch() follows
 * the redirect as a cross-origin request, which fails unless the bucket is
 * configured for CORS, and it starts a second download of the video.
 *
 * @param {string} videoId - Video ID without the .mp4 extension.
 */
function loadVideoSource(videoId) {
    const videoElement = document.getElementById('video');
    const sourceElement = document.getElementById('video-source');
    const errorElement = document.getElementById('error-message');

    // Assigning onerror (rather than adding a listener) keeps a single
    // handler in place however many times a video is loaded.
    sourceElement.onerror = () => {
        errorElement.textContent = 'Error loading video: Video not found';
    };

    // Encode the ID so spaces, '#', '?' and similar characters stay in the path.
    sourceElement.src = `/video/${encodeURIComponent(videoId)}.mp4`;
    videoElement.load();
}
336
+
337
  video.addEventListener('timeupdate', () => {
338
  currentTimeDisplay.textContent = `Current Time: ${video.currentTime.toFixed(2)}`;
339
  });