Spaces:

VSL-Data-Collection
/

VSL_Boundary_Annotation_and_Alignment_Tool

Sleeping

App Files Community

Perilon committed 3 days ago

Commit

2daffd5

1 Parent(s): c23fdff

Bug fixes

Browse files

Files changed (7) hide show

.hf-space +4 -1
Dockerfile +3 -1
docker-compose.yml +3 -0
extract_signed_segments_from_annotations.py +37 -214
flask_app.py +745 -17
get_transcription_with_amazon.py +47 -8
templates/player.html +25 -2

.hf-space CHANGED Viewed

@@ -7,8 +7,11 @@ config:
     - PORT=7860
     - SPACE_ID=true
     - BYPASS_AUTH=true
   resources:
     cpu: 1
     memory: 1
     gpu: null
-  restarts: true

     - PORT=7860
     - SPACE_ID=true
     - BYPASS_AUTH=true
+    - S3_BUCKET=sorenson-ai-sb-scratch
+    - S3_VIDEO_PREFIX=awilkinson/kylie_dataset_videos_for_alignment_webapp/
+    - USE_S3_FOR_VIDEOS=true
   resources:
     cpu: 1
     memory: 1
     gpu: null
+  restarts: true

Dockerfile CHANGED Viewed

@@ -16,10 +16,12 @@ COPY . .
 ENV PYTHONUNBUFFERED=1
 ENV PORT=7860
 ENV SPACE_ID="true"
-# Add explicit environment variable to enable authentication bypass for troubleshooting
 ENV BYPASS_AUTH="true"
 ENV SECRET_KEY="f7290fc27f11dbf14be6cd348638ad62"
 ENV DEBUG="True"
 # Create necessary directories
 RUN mkdir -p data/videos data/annotations data/temp data/word_timestamps data/alignments data/transcripts

 ENV PYTHONUNBUFFERED=1
 ENV PORT=7860
 ENV SPACE_ID="true"
 ENV BYPASS_AUTH="true"
 ENV SECRET_KEY="f7290fc27f11dbf14be6cd348638ad62"
 ENV DEBUG="True"
+ENV S3_BUCKET="sorenson-ai-sb-scratch"
+ENV S3_VIDEO_PREFIX="awilkinson/kylie_dataset_videos_for_alignment_webapp/"
+ENV USE_S3_FOR_VIDEOS="true"
 # Create necessary directories
 RUN mkdir -p data/videos data/annotations data/temp data/word_timestamps data/alignments data/transcripts

docker-compose.yml CHANGED Viewed

@@ -13,5 +13,8 @@ services:
       - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
       - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
       - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION:-us-west-2}
     volumes:
       - ./data:/app/data

       - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
       - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
       - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION:-us-west-2}
+      - S3_BUCKET=sorenson-ai-sb-scratch
+      - S3_VIDEO_PREFIX=awilkinson/kylie_dataset_videos_for_alignment_webapp/
+      - USE_S3_FOR_VIDEOS=true
     volumes:
       - ./data:/app/data

extract_signed_segments_from_annotations.py CHANGED Viewed

@@ -1,212 +1,3 @@
-# #!/usr/bin/env python3
-# import json
-# import os
-# import pathlib
-# import subprocess
-# from dataclasses import dataclass
-# from decimal import Decimal
-# from typing import List, Optional, Tuple
-# import argparse
-# try:
-#     from tqdm import tqdm
-# except ImportError:
-#     def tqdm(iterable, **kwargs):
-#         return iterable
-#     def write(msg):
-#         print(msg)
-# @dataclass
-# class VideoClip:
-#     """Represents a video clip with timing information."""
-#     start_time: float
-#     end_time: float
-#     clip_path: str
-#     original_video: str
-#     index: int
-#     @property
-#     def duration(self) -> float:
-#         return self.end_time - self.start_time
-# class ClipExtractor:
-#     """Handles extraction of video clips based on annotation timestamps."""
-#     def __init__(self, base_dir: str) -> None:
-#         """Initialize with project base directory."""
-#         self.base_dir = base_dir
-#         self.temp_dir = os.path.join(base_dir, "data", "temp")
-#         self.videos_dir = os.path.join(base_dir, "data", "videos")
-#         self.annotations_dir = os.path.join(base_dir, "data", "annotations")
-#         self.metadata_dir = os.path.join(base_dir, "data", "segment_metadata")
-#         os.makedirs(self.temp_dir, exist_ok=True)
-#         os.makedirs(self.metadata_dir, exist_ok=True)
-#     def validate_timestamps(self, timestamps: List[float]) -> Tuple[bool, Optional[str]]:
-#         if not timestamps:
-#             return False, "No timestamps found in annotation file."
-#         for i in range(len(timestamps) - 1):
-#             if timestamps[i] >= timestamps[i + 1]:
-#                 return False, (f"Invalid timestamp order: {str(round(Decimal(timestamps[i]), 3))} seconds "
-#                                 f"followed by {str(round(Decimal(timestamps[i + 1]), 3))} seconds")
-#         return True, None
-#     def get_video_duration(self, video_path: str) -> float:
-#         try:
-#             cmd = [
-#                 "ffprobe",
-#                 "-v", "error",
-#                 "-show_entries", "format=duration",
-#                 "-of", "default=noprint_wrappers=1:nokey=1",
-#                 video_path
-#             ]
-#             output = subprocess.check_output(cmd).decode().strip()
-#             return float(output)
-#         except subprocess.CalledProcessError as e:
-#             raise RuntimeError(f"Failed to get video duration. Error: {str(e)}")
-#     def extract_clip(self, video_path: str, start_time: float, end_time: float, output_path: str) -> bool:
-#         try:
-#             cmd = [
-#                 "ffmpeg",
-#                 "-i", video_path,
-#                 "-ss", str(start_time),
-#                 "-t", str(end_time - start_time),
-#                 "-c:v", "libx264",
-#                 "-c:a", "aac",
-#                 "-y",
-#                 output_path
-#             ]
-#             subprocess.run(cmd, check=True, capture_output=True)
-#             if not os.path.exists(output_path):
-#                 print(f"Warning: ffmpeg completed but file not found: {output_path}")
-#                 return False
-#             file_size = os.path.getsize(output_path)
-#             print(f"Created clip: {output_path} ({file_size} bytes)")
-#             return True
-#         except subprocess.CalledProcessError as e:
-#             print(f"Error extracting clip. Details: {str(e)}")
-#             return False
-#     def extract_clips_from_annotations(self, video_id: str, progress_callback=None) -> List[VideoClip]:
-#         # Determine paths
-#         video_path = os.path.join(self.videos_dir, f"{video_id}.mp4")
-#         annotation_path = os.path.join(self.annotations_dir, f"{video_id}_annotations.json")
-#         if not os.path.exists(video_path):
-#             raise FileNotFoundError(f"Video file not found: {video_path}")
-#         if not os.path.exists(annotation_path):
-#             raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
-#         with open(annotation_path, "r") as f:
-#             annotations = json.load(f)
-#         timestamps = sorted(annotations["timestamps"])
-#         is_valid, error_message = self.validate_timestamps(timestamps)
-#         if not is_valid:
-#             raise ValueError(f"Invalid timestamps in annotation file. {error_message}")
-#         video_duration = self.get_video_duration(video_path)
-#         if timestamps[-1] > video_duration:
-#             raise ValueError(
-#                 f"Final timestamp ({str(round(Decimal(timestamps[-1]), 3))} seconds) " +
-#                 f"exceeds video duration ({str(round(Decimal(video_duration), 3))} seconds)"
-#             )
-#         # Create segments using only consecutive pairs from the annotated boundaries.
-#         segments = [(timestamps[i], timestamps[i+1]) for i in range(len(timestamps)-1)]
-#         total_clips = len(segments)
-#         # Check metadata to see if segmentation is up-to-date.
-#         metadata_file = os.path.join(self.metadata_dir, f"{video_id}_metadata.json")
-#         use_cached = False
-#         if os.path.exists(metadata_file):
-#             with open(metadata_file, "r") as meta_f:
-#                 try:
-#                     meta_data = json.load(meta_f)
-#                     if meta_data.get("segments") == segments:
-#                         use_cached = True
-#                 except Exception as ex:
-#                     use_cached = False
-#         if use_cached:
-#             clips = []
-#             for i, (start, end) in enumerate(segments):
-#                 clip_path = os.path.join(self.temp_dir, f"{video_id}_clip_{i:03d}.mp4")
-#                 if not os.path.exists(clip_path):
-#                     use_cached = False
-#                     break
-#                 clips.append(VideoClip(start, end, clip_path, video_id, i))
-#             if use_cached:
-#                 if progress_callback:
-#                     progress_callback(total_clips, total_clips)
-#                 else:
-#                     print("Using cached segmentation as boundaries haven't changed.")
-#                 return clips
-#         # If metadata is missing, boundaries have changed, or some clip file is missing, re-run segmentation.
-#         clips = []
-#         current_clip = 0
-#         use_tqdm = progress_callback is None
-#         if use_tqdm:
-#             pbar = tqdm(total=total_clips, desc="Extracting clips")
-#         for segment in segments:
-#             start, end = segment
-#             clip_path = os.path.join(self.temp_dir, f"{video_id}_clip_{current_clip:03d}.mp4")
-#             if self.extract_clip(video_path, start, end, clip_path):
-#                 clips.append(VideoClip(start, end, clip_path, video_id, current_clip))
-#             current_clip += 1
-#             if progress_callback:
-#                 progress_callback(current_clip, total_clips)
-#             elif use_tqdm:
-#                 pbar.update(1)
-#         if use_tqdm:
-#             pbar.close()
-#         # Save segmentation metadata for future use.
-#         meta_data = {"segments": segments}
-#         with open(metadata_file, "w") as meta_f:
-#             json.dump(meta_data, meta_f, indent=4)
-#         return clips
-#     def cleanup_clips(self, clips: List[VideoClip]) -> None:
-#         if not clips:
-#             return
-#         print("\nCleaning up temporary files...")
-#         for clip in clips:
-#             if os.path.exists(clip.clip_path):
-#                 try:
-#                     print(f"Removing: {clip.clip_path}")
-#                     os.remove(clip.clip_path)
-#                 except OSError as e:
-#                     print(f"Warning: Failed to remove clip {clip.clip_path}. Error: {str(e)}")
-#             else:
-#                 print(f"Warning: File not found for cleanup: {clip.clip_path}")
-# def main() -> None:
-#     parser = argparse.ArgumentParser(description="Extract video clips based on annotations for a given video file ID.")
-#     parser.add_argument("video_id", help="Video file ID (without extension)")
-#     args = parser.parse_args()
-#     base_dir = os.path.join(
-#         str(pathlib.Path.home()),
-#         "andrew_messaround",
-#         "vsl_speech_to_signing_alignment",
-#         "boundary_annotation_webapp"
-#     )
-#     extractor = ClipExtractor(base_dir)
-#     try:
-#         clips = extractor.extract_clips_from_annotations(args.video_id)
-#         print(f"\nSuccessfully extracted {len(clips)} clips:")
-#         for clip in clips:
-#             print(f"Clip {clip.index}: {round(clip.start_time, 2)}s → {round(clip.end_time, 2)}s")
-#             print(f"Duration: {round(clip.duration, 2)}s")
-#             print(f"Path: {clip.clip_path}\n")
-#     except Exception as e:
-#         print(f"Error: {str(e)}")
-# if __name__ == "__main__":
-#     main()
-#!/usr/bin/env python3
 import json
 import os
 import pathlib
@@ -215,6 +6,8 @@ from dataclasses import dataclass
 from decimal import Decimal
 from typing import List, Optional, Tuple
 import argparse
 try:
     from tqdm import tqdm
@@ -226,6 +19,20 @@ except ImportError:
 from concurrent.futures import ThreadPoolExecutor, as_completed
 @dataclass
 class VideoClip:
     """Represents a video clip with timing information."""
@@ -300,15 +107,31 @@ class ClipExtractor:
             print(f"Error extracting clip. Details: {str(e)}")
             return False
-    def extract_clips_from_annotations(self, video_id: str, progress_callback=None) -> List[VideoClip]:
         # Determine paths
-        video_path = os.path.join(self.videos_dir, f"{video_id}.mp4")
         annotation_path = os.path.join(self.annotations_dir, f"{video_id}_annotations.json")
-        if not os.path.exists(video_path):
-            raise FileNotFoundError(f"Video file not found: {video_path}")
         if not os.path.exists(annotation_path):
             raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
         with open(annotation_path, "r") as f:
             annotations = json.load(f)
         timestamps = sorted(annotations["timestamps"])

 import json
 import os
 import pathlib
 from decimal import Decimal
 from typing import List, Optional, Tuple
 import argparse
+import boto3
+from botocore.exceptions import ClientError
 try:
     from tqdm import tqdm
 from concurrent.futures import ThreadPoolExecutor, as_completed
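+# S3 settings for the source videos. These are hard-coded here; the same values are also
+# set as the S3_BUCKET, S3_VIDEO_PREFIX and USE_S3_FOR_VIDEOS environment variables in the
+# Dockerfile and docker-compose.yml.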
+S3_BUCKET = "sorenson-ai-sb-scratch"
+S3_VIDEO_PREFIX = "awilkinson/kylie_dataset_videos_for_alignment_webapp/"
+USE_S3_FOR_VIDEOS = True  # Set to True to use S3, False to use local files
+def get_s3_client():
+    """Get a boto3 S3 client."""
+    return boto3.client(
+        's3',
+        region_name=os.environ.get('AWS_DEFAULT_REGION', 'us-west-2'),
+        aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
+        aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY')
+    )
 @dataclass
 class VideoClip:
     """Represents a video clip with timing information."""
             print(f"Error extracting clip. Details: {str(e)}")
             return False
+    def extract_clips_from_annotations(self, video_id, progress_callback=None) -> List[VideoClip]:
+        """Extract clips based on annotation timestamps, handling S3 videos."""
         # Determine paths
+        video_filename = f"{video_id}.mp4"
+        video_path = os.path.join(self.videos_dir, video_filename)
         annotation_path = os.path.join(self.annotations_dir, f"{video_id}_annotations.json")
         if not os.path.exists(annotation_path):
             raise FileNotFoundError(f"Annotation file not found: {annotation_path}")
+        # Check if we need to download the video from S3
+        if USE_S3_FOR_VIDEOS and not os.path.exists(video_path):
+            print(f"Video not found locally. Downloading from S3: {video_id}")
+            s3_client = get_s3_client()
+            try:
+                s3_key = f"{S3_VIDEO_PREFIX}{video_filename}"
+                s3_client.download_file(S3_BUCKET, s3_key, video_path)
+                print(f"Video downloaded successfully: {video_path}")
+            except ClientError as e:
+                raise FileNotFoundError(f"Video file not found in S3: {s3_key}. Error: {str(e)}")
+        if not os.path.exists(video_path):
+            raise FileNotFoundError(f"Video file not found: {video_path}")
+        # Now continue with the original extraction process
         with open(annotation_path, "r") as f:
             annotations = json.load(f)
         timestamps = sorted(annotations["timestamps"])

flask_app.py CHANGED Viewed

@@ -1,9 +1,591 @@
 from flask import Flask, render_template, jsonify, request, send_from_directory, send_file, redirect, url_for, session
 import os, json, threading, time, signal, sys
 from datetime import datetime
 from extract_signed_segments_from_annotations import ClipExtractor, VideoClip
 import logging
 from dotenv import load_dotenv
 # Load environment variables
 load_dotenv()
@@ -47,6 +629,11 @@ WORD_TIMESTAMPS_DIR = os.path.abspath("data/word_timestamps")
 ALIGNMENTS_DIR = os.path.abspath("data/alignments")
 TRANSCRIPTS_DIR = os.path.abspath("data/transcripts")
 # Ensure all required directories exist
 for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALIGNMENTS_DIR, TRANSCRIPTS_DIR]:
     os.makedirs(directory, exist_ok=True)
@@ -55,6 +642,82 @@ for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALI
 clip_extraction_status = {}
 transcription_progress_status = {}
 # Graceful shutdown handler
 def graceful_shutdown(signum, frame):
     """Handle graceful shutdown on signals."""
@@ -116,7 +779,19 @@ def run_transcription(video_id):
             transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
             return
-        video_path = os.path.join(base_dir, "data", "videos", f"{video_id}.mp4")
         transcription_progress_status[video_id] = {"status": "started", "percent": 10}
         # Check if AWS credentials are available
@@ -177,7 +852,6 @@ def auth_callback():
             return render_template('error.html', message="Authentication failed. No username provided.")
     return redirect(url_for('login'))
-# Replace the health check route with this improved version
 @app.route('/health')
 def health_check():
     """Health check endpoint for container verification."""
@@ -187,7 +861,10 @@ def health_check():
         "DEBUG": os.environ.get('DEBUG', 'Not set'),
         "SPACE_ID": os.environ.get('SPACE_ID', 'Not set'),
         "BYPASS_AUTH": os.environ.get('BYPASS_AUTH', 'Not set'),
-        "SECRET_KEY": os.environ.get('SECRET_KEY', 'Not set')[:5] + '...' if os.environ.get('SECRET_KEY') else 'Not set'
     }
     logger.info(f"Health check called. Environment: {env_vars}")
@@ -301,6 +978,11 @@ def debug_info():
         "app_config": {k: str(v) for k, v in app.config.items() if k in
                       ['SESSION_COOKIE_SECURE', 'SESSION_COOKIE_HTTPONLY',
                        'SESSION_COOKIE_SAMESITE', 'PERMANENT_SESSION_LIFETIME']},
     }
     return jsonify(info)
@@ -315,10 +997,14 @@ def index():
 @login_required
 def select_video():
     """Page to select a video for annotation."""
-    if not os.path.exists(VIDEO_DIR):
-        return render_template('error.html', message="Video directory not found.")
-    videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith('.mp4')]
-    video_ids = [os.path.splitext(v)[0] for v in videos]
     return render_template('select_video.html', video_ids=video_ids, user=session.get('user'))
 @app.route('/player/<video_id>')
@@ -331,20 +1017,44 @@ def player(video_id):
 @login_required
 def get_videos():
     """API endpoint to get available videos."""
-    if not os.path.exists(VIDEO_DIR):
-        return jsonify({'error': 'Video directory not found'}), 404
-    videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith(('.mp4', '.avi', '.mov'))]
-    if not videos:
-        return jsonify({'error': 'No videos found'}), 404
-    return jsonify(videos)
 @app.route('/video/<path:filename>')
 @login_required
 def serve_video(filename):
-    """Serve a video file."""
-    if not os.path.exists(os.path.join(VIDEO_DIR, filename)):
-        return jsonify({'error': 'Video not found'}), 404
-    return send_from_directory(VIDEO_DIR, filename)
 @app.route('/save_annotations', methods=['POST'])
 @login_required
@@ -532,6 +1242,15 @@ def save_alignments():
 @login_required
 def extract_clips_for_video(video_id):
     """Extract clips and start transcription for a video."""
     status = clip_extraction_status.get(video_id, {})
     if status.get("percent", 0) < 100:
         thread = threading.Thread(target=run_clip_extraction, args=(video_id,))
@@ -563,7 +1282,16 @@ if __name__ == '__main__':
         print(f"- Running in HF Space: {is_hf_space}")
         print(f"- Auth bypass: {bypass_auth}")
         print(f"- Port: {os.getenv('PORT', 5000)}")
         print(f"- Available videos: {os.listdir(VIDEO_DIR) if os.path.exists(VIDEO_DIR) else 'None'}")
         print("=" * 50)
         port = int(os.getenv('PORT', 5000))

+# from flask import Flask, render_template, jsonify, request, send_from_directory, send_file, redirect, url_for, session
+# import os, json, threading, time, signal, sys
+# from datetime import datetime
+# from extract_signed_segments_from_annotations import ClipExtractor, VideoClip
+# import logging
+# from dotenv import load_dotenv
+# # Load environment variables
+# load_dotenv()
+# # Add this near the top with other environment variables
+# bypass_auth = os.getenv('BYPASS_AUTH', 'false').lower() == 'true'
+# # Configure logging first
+# logging.basicConfig(
+#     level=logging.INFO,
+#     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+# )
+# logger = logging.getLogger(__name__)
+# # Hugging Face specific configuration
+# is_hf_space = os.getenv('SPACE_ID') is not None
+# if is_hf_space:
+#     logger.info("Running in Hugging Face Spaces environment")
+#     # Allow insecure transport for development in HF
+#     os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
+#     # Ensure port is set correctly
+#     os.environ['PORT'] = '7860'
+# app = Flask(__name__)
+# app.secret_key = os.getenv('SECRET_KEY', 'dev_key_for_testing')
+# # Configure session for HF
+# if is_hf_space:
+#     app.config['SESSION_COOKIE_SECURE'] = False
+#     app.config['SESSION_COOKIE_HTTPONLY'] = True
+#     app.config['SESSION_COOKIE_SAMESITE'] = None  # Add this line
+#     app.config['PERMANENT_SESSION_LIFETIME'] = 86400  # 24 hours
+# # Directory paths
+# VIDEO_DIR = os.path.abspath("data/videos")
+# ANNOTATIONS_DIR = os.path.abspath("data/annotations")
+# TEMP_DIR = os.path.abspath("data/temp")
+# WORD_TIMESTAMPS_DIR = os.path.abspath("data/word_timestamps")
+# ALIGNMENTS_DIR = os.path.abspath("data/alignments")
+# TRANSCRIPTS_DIR = os.path.abspath("data/transcripts")
+# # Ensure all required directories exist
+# for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALIGNMENTS_DIR, TRANSCRIPTS_DIR]:
+#     os.makedirs(directory, exist_ok=True)
+# # Global dictionaries for progress tracking
+# clip_extraction_status = {}
+# transcription_progress_status = {}
+# # Graceful shutdown handler
+# def graceful_shutdown(signum, frame):
+#     """Handle graceful shutdown on signals."""
+#     logger.info(f"Received signal {signum}, shutting down gracefully...")
+#     # Clean up as needed here
+#     sys.exit(0)
+# # Register signal handlers
+# signal.signal(signal.SIGTERM, graceful_shutdown)
+# signal.signal(signal.SIGINT, graceful_shutdown)
+# # Login required decorator
+# def login_required(f):
+#     from functools import wraps
+#     @wraps(f)
+#     def decorated_function(*args, **kwargs):
+#         if 'user' not in session:
+#             logger.info(f"User not in session, redirecting to login")
+#             return redirect(url_for('login'))
+#         return f(*args, **kwargs)
+#     return decorated_function
+# # Allow specific users (for testing)
+# def is_allowed_user(username):
+#     allowed_users_env = os.getenv('ALLOWED_USERS', 'Perilon')  # Default to your username
+#     allowed_users = [user.strip() for user in allowed_users_env.split(',')]
+#     return username in allowed_users or not is_hf_space  # Allow all users in local dev
+# def update_extraction_progress(video_id, current, total):
+#     percent = int((current / total) * 100)
+#     clip_extraction_status[video_id] = {"current": current, "total": total, "percent": percent}
+# def run_clip_extraction(video_id):
+#     try:
+#         base_dir = app.root_path
+#         extractor = ClipExtractor(base_dir)
+#         extractor.extract_clips_from_annotations(
+#             video_id,
+#             progress_callback=lambda current, total: update_extraction_progress(video_id, current, total)
+#         )
+#         if video_id in clip_extraction_status:
+#             status = clip_extraction_status[video_id]
+#             if status.get("percent", 0) < 100:
+#                 update_extraction_progress(video_id, status["total"], status["total"])
+#         else:
+#             update_extraction_progress(video_id, 1, 1)
+#     except Exception as e:
+#         logger.error(f"Error during clip extraction for {video_id}: {str(e)}")
+#         clip_extraction_status[video_id] = {"error": str(e)}
+# def run_transcription(video_id):
+#     try:
+#         base_dir = app.root_path
+#         output_path = os.path.join(WORD_TIMESTAMPS_DIR, f"{video_id}_word_timestamps.json")
+#         # Check if transcription already exists and is valid.
+#         if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+#             logger.info(f"Using cached transcription for video {video_id}.")
+#             transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
+#             return
+#         video_path = os.path.join(base_dir, "data", "videos", f"{video_id}.mp4")
+#         transcription_progress_status[video_id] = {"status": "started", "percent": 10}
+#         # Check if AWS credentials are available
+#         if not os.environ.get('AWS_ACCESS_KEY_ID') or not os.environ.get('AWS_SECRET_ACCESS_KEY'):
+#             logger.warning("AWS credentials not found. Transcription will not work properly.")
+#             transcription_progress_status[video_id] = {
+#                 "status": "error",
+#                 "percent": 0,
+#                 "message": "AWS credentials missing"
+#             }
+#             return
+#         # Run transcription via the imported function from get_transcription_with_amazon.py
+#         from get_transcription_with_amazon import get_word_timestamps
+#         word_timestamps = get_word_timestamps(video_path)
+#         with open(output_path, "w") as f:
+#             json.dump(word_timestamps, f, indent=4)
+#         transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
+#     except Exception as e:
+#         logger.error(f"Error during transcription for {video_id}: {str(e)}")
+#         transcription_progress_status[video_id] = {"status": "error", "percent": 0, "message": str(e)}
+# # Authentication routes
+# @app.route('/login')
+# def login():
+#     """Handle login for both local and HF environments."""
+#     logger.info(f"Login route called. Headers: {dict(request.headers)}")
+#     if is_hf_space:
+#         username = request.headers.get('X-Spaces-Username')
+#         logger.info(f"Username from headers in login: {username}")
+#         if username and is_allowed_user(username):
+#             session['user'] = {'name': username, 'is_hf': True}
+#             return redirect(url_for('index'))
+#         else:
+#             # Redirect to the HF auth endpoint
+#             return redirect('/auth')
+#     else:
+#         # For local development
+#         session['user'] = {'name': 'LocalDeveloper', 'is_mock': True}
+#         return redirect(url_for('index'))
+# @app.route('/auth/callback')
+# def auth_callback():
+#     """This route will be called by Hugging Face after successful authentication."""
+#     logger.info(f"Auth callback called. Headers: {dict(request.headers)}")
+#     if is_hf_space:
+#         # In Hugging Face Spaces, the user info is available in the request headers
+#         username = request.headers.get('X-Spaces-Username')
+#         if username:
+#             session['user'] = {'name': username, 'is_hf': True}
+#             return redirect(url_for('index'))
+#         else:
+#             return render_template('error.html', message="Authentication failed. No username provided.")
+#     return redirect(url_for('login'))
+# # Replace the health check route with this improved version
+# @app.route('/health')
+# def health_check():
+#     """Health check endpoint for container verification."""
+#     # Log environment variables for debugging
+#     env_vars = {
+#         "FLASK_ENV": os.environ.get('FLASK_ENV', 'production'),
+#         "DEBUG": os.environ.get('DEBUG', 'Not set'),
+#         "SPACE_ID": os.environ.get('SPACE_ID', 'Not set'),
+#         "BYPASS_AUTH": os.environ.get('BYPASS_AUTH', 'Not set'),
+#         "SECRET_KEY": os.environ.get('SECRET_KEY', 'Not set')[:5] + '...' if os.environ.get('SECRET_KEY') else 'Not set'
+#     }
+#     logger.info(f"Health check called. Environment: {env_vars}")
+#     # Get session information for debugging
+#     session_info = dict(session) if session else None
+#     session_keys = list(session.keys()) if session else []
+#     return jsonify({
+#         "status": "healthy",
+#         "environment": env_vars,
+#         "session_keys": session_keys,
+#         "is_hf_space": is_hf_space,
+#         "bypass_auth": bypass_auth,
+#         "directories": {
+#             "videos": os.path.exists(VIDEO_DIR),
+#             "annotations": os.path.exists(ANNOTATIONS_DIR),
+#             "temp": os.path.exists(TEMP_DIR)
+#         }
+#     })
+# @app.route('/auth')
+# def auth():
+#     """This route handles HF authentication."""
+#     logger.info(f"Auth route called. Headers: {dict(request.headers)}")
+#     # Force bypass auth to be true for debugging
+#     bypass_auth = True
+#     # If bypass is enabled, authenticate immediately
+#     if bypass_auth:
+#         logger.info("Auth bypass enabled, setting default user")
+#         session['user'] = {'name': 'Perilon', 'is_hf': True}
+#         return redirect(url_for('index'))
+#     # Normal authentication logic
+#     username = request.headers.get('X-Spaces-Username')
+#     logger.info(f"Username from headers in auth: {username}")
+#     if is_hf_space and username and is_allowed_user(username):
+#         logger.info(f"Setting user in session: {username}")
+#         session['user'] = {'name': username, 'is_hf': True}
+#         return redirect(url_for('index'))
+#     elif not is_hf_space:
+#         # For local development
+#         session['user'] = {'name': 'LocalDeveloper', 'is_mock': True}
+#         return redirect(url_for('index'))
+#     else:
+#         # For HF with no valid username yet
+#         return render_template('error.html', message=
+#             "Waiting for Hugging Face authentication. If you continue to see this message, "
+#             "please make sure you're logged into Hugging Face and your username is allowed.")
+# @app.before_request
+# def check_auth():
+#     """Check authentication before processing requests."""
+#     # Skip authentication for certain routes and static files
+#     if request.path in ['/login', '/logout', '/auth', '/auth/callback', '/debug', '/health'] or request.path.startswith('/static/'):
+#         return
+#     # Force bypass auth to be true for debugging
+#     bypass_auth = True
+#     # Log all request paths to help troubleshoot
+#     logger.debug(f"Request path: {request.path}, User in session: {'user' in session}")
+#     if bypass_auth:
+#         # Set default user for bypass mode if not already set
+#         if 'user' not in session:
+#             session['user'] = {'name': 'Perilon', 'is_hf': True}
+#         return
+#     if is_hf_space:
+#         # Check for HF username header
+#         username = request.headers.get('X-Spaces-Username')
+#         if 'user' in session:
+#             logger.debug(f"User in session: {session['user']}")
+#             return
+#         if username and is_allowed_user(username):
+#             logger.info(f"Setting user from headers: {username}")
+#             session['user'] = {'name': username, 'is_hf': True}
+#             return
+#         # No valid user in session or headers
+#         logger.info(f"No authenticated user, redirecting to /auth")
+#         return redirect('/auth')
+#     elif 'user' not in session:
+#         return redirect(url_for('login'))
+# @app.route('/logout')
+# def logout():
+#     """Clear session and redirect to login."""
+#     session.clear()  # Clear the entire session
+#     if is_hf_space:
+#         return redirect('/auth/logout')
+#     return redirect(url_for('login'))
+# @app.route('/debug')
+# def debug_info():
+#     """Return debug information."""
+#     cookies = {key: request.cookies.get(key) for key in request.cookies.keys()}
+#     info = {
+#         "session": dict(session) if session else None,
+#         "headers": dict(request.headers),
+#         "cookies": cookies,
+#         "is_hf_space": is_hf_space,
+#         "allowed_users": os.getenv('ALLOWED_USERS', 'Perilon'),
+#         "app_config": {k: str(v) for k, v in app.config.items() if k in
+#                       ['SESSION_COOKIE_SECURE', 'SESSION_COOKIE_HTTPONLY',
+#                        'SESSION_COOKIE_SAMESITE', 'PERMANENT_SESSION_LIFETIME']},
+#     }
+#     return jsonify(info)
+# # Main application routes
+# @app.route('/')
+# @login_required
+# def index():
+#     """Main entry point, redirects to video selection."""
+#     return redirect(url_for('select_video'))
+# @app.route('/select_video')
+# @login_required
+# def select_video():
+#     """Page to select a video for annotation."""
+#     if not os.path.exists(VIDEO_DIR):
+#         return render_template('error.html', message="Video directory not found.")
+#     videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith('.mp4')]
+#     video_ids = [os.path.splitext(v)[0] for v in videos]
+#     return render_template('select_video.html', video_ids=video_ids, user=session.get('user'))
+# @app.route('/player/<video_id>')
+# @login_required
+# def player(video_id):
+#     """Video player page for annotation."""
+#     return render_template('player.html', video_id=video_id, user=session.get('user'))
+# @app.route('/videos')
+# @login_required
+# def get_videos():
+#     """API endpoint to get available videos."""
+#     if not os.path.exists(VIDEO_DIR):
+#         return jsonify({'error': 'Video directory not found'}), 404
+#     videos = [f for f in os.listdir(VIDEO_DIR) if f.endswith(('.mp4', '.avi', '.mov'))]
+#     if not videos:
+#         return jsonify({'error': 'No videos found'}), 404
+#     return jsonify(videos)
+# @app.route('/video/<path:filename>')
+# @login_required
+# def serve_video(filename):
+#     """Serve a video file."""
+#     if not os.path.exists(os.path.join(VIDEO_DIR, filename)):
+#         return jsonify({'error': 'Video not found'}), 404
+#     return send_from_directory(VIDEO_DIR, filename)
+# @app.route('/save_annotations', methods=['POST'])
+# @login_required
+# def save_annotations():
+#     """Save annotation data."""
+#     data = request.json
+#     if not data or 'video' not in data or 'timestamps' not in data:
+#         return jsonify({'success': False, 'message': 'Invalid data'}), 400
+#     annotation_file = os.path.join(ANNOTATIONS_DIR, f"{data['video']}_annotations.json")
+#     annotation_data = {
+#         "video_name": data['video'] + ".mp4",
+#         "timestamps": sorted(data['timestamps']),
+#         "annotation_date": datetime.now().isoformat(),
+#         "annotated_by": session.get('user', {}).get('name', 'unknown')
+#     }
+#     with open(annotation_file, 'w') as f:
+#         json.dump(annotation_data, f, indent=4)
+#     return jsonify({'success': True, 'message': 'Annotations saved successfully'})
+# @app.route('/get_annotations/<path:video_name>')
+# @login_required
+# def get_annotations(video_name):
+#     """Get annotations for a video."""
+#     annotation_file = os.path.join(ANNOTATIONS_DIR, f"{video_name}_annotations.json")
+#     if not os.path.exists(annotation_file):
+#         return jsonify({'error': 'No annotations found'}), 404
+#     with open(annotation_file, 'r') as f:
+#         annotations = json.load(f)
+#     return jsonify(annotations)
+# @app.route("/alignment/<video_id>")
+# @login_required
+# def alignment_mode(video_id):
+#     """Page for aligning sign language with transcribed text."""
+#     annotation_file = os.path.join(ANNOTATIONS_DIR, f"{video_id}_annotations.json")
+#     if not os.path.exists(annotation_file):
+#         return render_template("error.html", message="No annotations found for this video. Please annotate the video first.")
+#     with open(annotation_file, 'r') as f:
+#         annotations = json.load(f)
+#     return render_template(
+#         "alignment.html",
+#         video_id=video_id,
+#         total_clips=len(annotations['timestamps']) - 1,
+#         user=session.get('user')
+#     )
+# @app.route("/api/transcript/<video_id>")
+# @login_required
+# def get_transcript(video_id):
+#     """Get transcript for a video."""
+#     timestamps_file = os.path.join(WORD_TIMESTAMPS_DIR, f"{video_id}_word_timestamps.json")
+#     logger.info(f"Attempting to load word timestamps from: {timestamps_file}")
+#     if not os.path.exists(timestamps_file):
+#         logger.warning(f"Word timestamps file not found: {timestamps_file}")
+#         return jsonify({
+#             "status": "error",
+#             "message": "No word timestamps found for this video"
+#         }), 404
+#     try:
+#         with open(timestamps_file, 'r') as f:
+#             word_data = json.load(f)
+#         full_text = " ".join(item["punctuated_word"] for item in word_data)
+#         words_with_times = [{
+#             "word": item["punctuated_word"],
+#             "start": float(item["start_time"]),
+#             "end": float(item["end_time"])
+#         } for item in word_data]
+#         logger.info(f"Successfully created transcript ({len(full_text)} characters)")
+#         return jsonify({
+#             "status": "success",
+#             "text": full_text,
+#             "words": words_with_times
+#         })
+#     except Exception as e:
+#         logger.error(f"Error processing word timestamps: {str(e)}")
+#         return jsonify({
+#             "status": "error",
+#             "message": f"Error processing word timestamps: {str(e)}"
+#         }), 500
+# @app.route("/api/word_timestamps/<video_id>")
+# @login_required
+# def get_word_timestamps(video_id):
+#     """Get word-level timestamps for a video."""
+#     timestamps_file = os.path.join(WORD_TIMESTAMPS_DIR, f"{video_id}_word_timestamps.json")
+#     logger.info(f"Attempting to load word timestamps from: {timestamps_file}")
+#     if not os.path.exists(timestamps_file):
+#         logger.warning(f"Word timestamps file not found: {timestamps_file}")
+#         return jsonify({
+#             "status": "error",
+#             "message": "No word timestamps found for this video"
+#         }), 404
+#     try:
+#         with open(timestamps_file, 'r') as f:
+#             word_data = json.load(f)
+#         logger.info(f"Successfully loaded {len(word_data)} word timestamps")
+#         return jsonify({
+#             "status": "success",
+#             "words": word_data
+#         })
+#     except Exception as e:
+#         logger.error(f"Error processing word timestamps: {str(e)}")
+#         return jsonify({
+#             "status": "error",
+#             "message": f"Error processing word timestamps: {str(e)}"
+#         }), 500
+# @app.route("/api/clips/<video_id>")
+# @login_required
+# def get_video_clips(video_id):
+#     """Get clips for a video."""
+#     try:
+#         annotation_file = os.path.join(ANNOTATIONS_DIR, f"{video_id}_annotations.json")
+#         if not os.path.exists(annotation_file):
+#             raise FileNotFoundError("Annotations not found")
+#         with open(annotation_file, 'r') as f:
+#             annotations = json.load(f)
+#         timestamps = annotations['timestamps']
+#         clips = []
+#         for i in range(len(timestamps)-1):
+#             clips.append({
+#                 "index": i,
+#                 "start": timestamps[i],
+#                 "end": timestamps[i+1],
+#                 "path": f"/clip/{video_id}/{i}"
+#             })
+#         return jsonify({
+#             "status": "success",
+#             "clips": clips
+#         })
+#     except Exception as e:
+#         logger.error(f"Error getting clips: {str(e)}")
+#         return jsonify({
+#             "status": "error",
+#             "message": str(e)
+#         }), 500
+# @app.route("/clip/<video_id>/<int:clip_index>")
+# @login_required
+# def serve_clip(video_id, clip_index):
+#     """Serve a specific clip."""
+#     clip_path = os.path.join(
+#         TEMP_DIR,
+#         f"{video_id}_clip_{clip_index:03d}.mp4"
+#     )
+#     logger.info(f"Attempting to serve clip: {clip_path}")
+#     if not os.path.exists(clip_path):
+#         logger.error(f"Clip not found: {clip_path}")
+#         return jsonify({
+#             "status": "error",
+#             "message": "Clip not found"
+#         }), 404
+#     return send_file(clip_path, mimetype="video/mp4")
+# @app.route("/api/save_alignments", methods=["POST"])
+# @login_required
+# def save_alignments():
+#     """Save alignment data."""
+#     try:
+#         data = request.json
+#         if not data or 'video_id' not in data or 'alignments' not in data:
+#             return jsonify({'success': False, 'message': 'Invalid data'}), 400
+#         # Add user information to the alignments
+#         for alignment in data['alignments']:
+#             if alignment:
+#                 alignment['aligned_by'] = session.get('user', {}).get('name', 'unknown')
+#         output_path = os.path.join(ALIGNMENTS_DIR, f"{data['video_id']}.json")
+#         with open(output_path, "w") as f:
+#             json.dump(data['alignments'], f, indent=2)
+#         return jsonify({
+#             "success": True,
+#             "message": "Alignments saved successfully"
+#         })
+#     except Exception as e:
+#         logger.error(f"Error saving alignments: {str(e)}")
+#         return jsonify({
+#             "success": False,
+#             "message": str(e)
+#         }), 500
+# @app.route("/api/extract_clips/<video_id>")
+# @login_required
+# def extract_clips_for_video(video_id):
+#     """Extract clips and start transcription for a video."""
+#     status = clip_extraction_status.get(video_id, {})
+#     if status.get("percent", 0) < 100:
+#         thread = threading.Thread(target=run_clip_extraction, args=(video_id,))
+#         thread.start()
+#     if video_id not in transcription_progress_status or transcription_progress_status.get(video_id, {}).get("percent", 0) < 100:
+#         thread_trans = threading.Thread(target=run_transcription, args=(video_id,))
+#         thread_trans.start()
+#     return jsonify({"status": "started"})
+# @app.route("/api/clip_progress/<video_id>")
+# @login_required
+# def clip_progress(video_id):
+#     """Get clip extraction progress."""
+#     progress = clip_extraction_status.get(video_id, {"current": 0, "total": 0, "percent": 0})
+#     return jsonify(progress)
+# @app.route("/api/transcription_progress/<video_id>")
+# @login_required
+# def transcription_progress(video_id):
+#     """Get transcription progress."""
+#     progress = transcription_progress_status.get(video_id, {"status": "not started", "percent": 0})
+#     return jsonify(progress)
+# if __name__ == '__main__':
+#     try:
+#         # Print diagnostic information
+#         print("=" * 50)
+#         print(f"Starting app with configuration:")
+#         print(f"- Running in HF Space: {is_hf_space}")
+#         print(f"- Auth bypass: {bypass_auth}")
+#         print(f"- Port: {os.getenv('PORT', 5000)}")
+#         print(f"- Available videos: {os.listdir(VIDEO_DIR) if os.path.exists(VIDEO_DIR) else 'None'}")
+#         print("=" * 50)
+#         port = int(os.getenv('PORT', 5000))
+#         app.run(host='0.0.0.0', port=port, debug=True)
+#     except Exception as e:
+#         print(f"Error starting the application: {e}")
+#         import traceback
+#         traceback.print_exc()
 from flask import Flask, render_template, jsonify, request, send_from_directory, send_file, redirect, url_for, session
 import os, json, threading, time, signal, sys
 from datetime import datetime
 from extract_signed_segments_from_annotations import ClipExtractor, VideoClip
 import logging
 from dotenv import load_dotenv
+import boto3
+from botocore.exceptions import ClientError
+import tempfile
+import uuid
+import requests
+from urllib.parse import urlparse
 # Load environment variables
 load_dotenv()
 ALIGNMENTS_DIR = os.path.abspath("data/alignments")
 TRANSCRIPTS_DIR = os.path.abspath("data/transcripts")
+# S3 configuration
+# Each value is read from the environment; the second argument is the
+# fallback used when the variable is unset.
+S3_BUCKET = os.getenv('S3_BUCKET', "sorenson-ai-sb-scratch")
+# The prefix is concatenated directly with "<video_id>.mp4" when S3 keys are
+# built, so it has to end with "/".
+S3_VIDEO_PREFIX = os.getenv('S3_VIDEO_PREFIX', "awilkinson/kylie_dataset_videos_for_alignment_webapp/")
+# True only when the variable (default 'true') equals "true", ignoring case.
+USE_S3_FOR_VIDEOS = os.getenv('USE_S3_FOR_VIDEOS', 'true').lower() == 'true'
 # Ensure all required directories exist
 for directory in [VIDEO_DIR, ANNOTATIONS_DIR, TEMP_DIR, WORD_TIMESTAMPS_DIR, ALIGNMENTS_DIR, TRANSCRIPTS_DIR]:
     os.makedirs(directory, exist_ok=True)
 clip_extraction_status = {}
 transcription_progress_status = {}
+# S3 helper functions
+def get_s3_client():
+    """Build a boto3 S3 client from the AWS_* environment variables.
+
+    The region falls back to 'us-west-2' when AWS_DEFAULT_REGION is unset;
+    the access key and secret are passed as None when their variables are
+    missing.
+    """
+    env = os.environ
+    client_kwargs = {
+        'region_name': env.get('AWS_DEFAULT_REGION', 'us-west-2'),
+        'aws_access_key_id': env.get('AWS_ACCESS_KEY_ID'),
+        'aws_secret_access_key': env.get('AWS_SECRET_ACCESS_KEY'),
+    }
+    return boto3.client('s3', **client_kwargs)
+def list_s3_videos():
+    """List the IDs of all .mp4 videos stored under the configured S3 prefix.
+
+    A single ListObjectsV2 response holds at most 1000 keys, so every page
+    is walked through a paginator to avoid silently truncating the list.
+
+    Returns:
+        A list of video IDs (object basenames without the ``.mp4``
+        extension). The list is empty when the prefix holds no objects, or
+        when S3 raises a ClientError (which is logged, not re-raised).
+    """
+    try:
+        paginator = get_s3_client().get_paginator('list_objects_v2')
+        pages = paginator.paginate(Bucket=S3_BUCKET, Prefix=S3_VIDEO_PREFIX)
+        videos = []
+        found_objects = False
+        for page in pages:
+            # Pages without matching objects carry no 'Contents' entry.
+            for item in page.get('Contents', []):
+                found_objects = True
+                key = item['Key']
+                if key.endswith('.mp4'):
+                    # Keep just the filename without prefix or extension
+                    videos.append(os.path.splitext(os.path.basename(key))[0])
+        if not found_objects:
+            logger.warning(f"No videos found in S3 bucket {S3_BUCKET} with prefix {S3_VIDEO_PREFIX}")
+        return videos
+    except ClientError as e:
+        logger.error(f"Error listing videos from S3: {str(e)}")
+        return []
+def download_video_from_s3(video_id):
+    """Fetch ``<video_id>.mp4`` from S3 into VIDEO_DIR unless already present.
+
+    Returns the local file path on success, or when the file already exists
+    locally; returns None when S3 raises a ClientError (logged).
+    """
+    video_filename = f"{video_id}.mp4"
+    local_path = os.path.join(VIDEO_DIR, video_filename)
+    # A copy that is already on disk is reused as-is.
+    if os.path.exists(local_path):
+        logger.info(f"Video {video_id} already exists locally.")
+        return local_path
+    try:
+        logger.info(f"Downloading video {video_id} from S3...")
+        get_s3_client().download_file(
+            S3_BUCKET,
+            f"{S3_VIDEO_PREFIX}{video_filename}",
+            local_path,
+        )
+    except ClientError as e:
+        logger.error(f"Error downloading video from S3: {str(e)}")
+        return None
+    else:
+        logger.info(f"Video {video_id} downloaded successfully to {local_path}")
+        return local_path
+def generate_presigned_url(video_id, expiration=3600):
+    """Create a time-limited GET URL for ``<video_id>.mp4`` in S3.
+
+    Args:
+        video_id: Video ID without the ``.mp4`` extension.
+        expiration: Value passed to boto3 as ``ExpiresIn``.
+
+    Returns:
+        The presigned URL, or None when boto3 raises a ClientError (logged).
+    """
+    request_params = {
+        'Bucket': S3_BUCKET,
+        'Key': f"{S3_VIDEO_PREFIX}{video_id}.mp4",
+    }
+    try:
+        return get_s3_client().generate_presigned_url(
+            'get_object',
+            Params=request_params,
+            ExpiresIn=expiration,
+        )
+    except ClientError as e:
+        logger.error(f"Error generating presigned URL: {str(e)}")
+        return None
 # Graceful shutdown handler
 def graceful_shutdown(signum, frame):
     """Handle graceful shutdown on signals."""
             transcription_progress_status[video_id] = {"status": "completed", "percent": 100}
             return
+        # Download video from S3 if needed
+        if USE_S3_FOR_VIDEOS:
+            video_path = download_video_from_s3(video_id)
+            if not video_path:
+                transcription_progress_status[video_id] = {
+                    "status": "error",
+                    "percent": 0,
+                    "message": f"Failed to download video {video_id} from S3"
+                }
+                return
+        else:
+            video_path = os.path.join(base_dir, "data", "videos", f"{video_id}.mp4")
         transcription_progress_status[video_id] = {"status": "started", "percent": 10}
         # Check if AWS credentials are available
             return render_template('error.html', message="Authentication failed. No username provided.")
     return redirect(url_for('login'))
 @app.route('/health')
 def health_check():
     """Health check endpoint for container verification."""
         "DEBUG": os.environ.get('DEBUG', 'Not set'),
         "SPACE_ID": os.environ.get('SPACE_ID', 'Not set'),
         "BYPASS_AUTH": os.environ.get('BYPASS_AUTH', 'Not set'),
+        "SECRET_KEY": os.environ.get('SECRET_KEY', 'Not set')[:5] + '...' if os.environ.get('SECRET_KEY') else 'Not set',
+        "S3_BUCKET": os.environ.get('S3_BUCKET', 'Not set'),
+        "S3_VIDEO_PREFIX": os.environ.get('S3_VIDEO_PREFIX', 'Not set'),
+        "USE_S3_FOR_VIDEOS": os.environ.get('USE_S3_FOR_VIDEOS', 'Not set')
     }
     logger.info(f"Health check called. Environment: {env_vars}")
         "app_config": {k: str(v) for k, v in app.config.items() if k in
                       ['SESSION_COOKIE_SECURE', 'SESSION_COOKIE_HTTPONLY',
                        'SESSION_COOKIE_SAMESITE', 'PERMANENT_SESSION_LIFETIME']},
+        "s3_config": {
+            "S3_BUCKET": S3_BUCKET,
+            "S3_VIDEO_PREFIX": S3_VIDEO_PREFIX,
+            "USE_S3_FOR_VIDEOS": USE_S3_FOR_VIDEOS
+        }
     }
     return jsonify(info)
 @login_required
 def select_video():
     """Page to select a video for annotation."""
+    # Video IDs come from S3 when enabled, otherwise from the .mp4 files
+    # found in VIDEO_DIR (extension stripped).
+    if USE_S3_FOR_VIDEOS:
+        video_ids = list_s3_videos()
+    elif os.path.exists(VIDEO_DIR):
+        video_ids = [
+            os.path.splitext(name)[0]
+            for name in os.listdir(VIDEO_DIR)
+            if name.endswith('.mp4')
+        ]
+    else:
+        return render_template('error.html', message="Video directory not found.")
     return render_template('select_video.html', video_ids=video_ids, user=session.get('user'))
 @app.route('/player/<video_id>')
 @login_required
 def get_videos():
     """API endpoint to get available videos."""
+    if not USE_S3_FOR_VIDEOS:
+        # Local mode: report the video files present in VIDEO_DIR.
+        if not os.path.exists(VIDEO_DIR):
+            return jsonify({'error': 'Video directory not found'}), 404
+        local_files = [
+            entry for entry in os.listdir(VIDEO_DIR)
+            if entry.endswith(('.mp4', '.avi', '.mov'))
+        ]
+        if local_files:
+            return jsonify(local_files)
+        return jsonify({'error': 'No videos found'}), 404
+    # S3 mode: IDs come back without an extension, so ".mp4" is appended
+    # again to keep the response shaped like the local listing.
+    s3_ids = list_s3_videos()
+    if not s3_ids:
+        return jsonify({'error': 'No videos found in S3'}), 404
+    return jsonify([f"{vid}.mp4" for vid in s3_ids])
 @app.route('/video/<path:filename>')
 @login_required
 def serve_video(filename):
+    """Serve a video file from S3 or local storage.
+
+    In S3 mode the request is redirected to a presigned URL for
+    ``<video_id>.mp4``. If no URL can be generated, the video is downloaded
+    into VIDEO_DIR and served from there. In local mode the requested
+    filename is served straight from VIDEO_DIR.
+
+    Returns:
+        A redirect or file response, or a JSON error with status 404 when
+        the video cannot be found.
+    """
+    if not USE_S3_FOR_VIDEOS:
+        # Original local file behavior
+        if not os.path.exists(os.path.join(VIDEO_DIR, filename)):
+            return jsonify({'error': 'Video not found'}), 404
+        return send_from_directory(VIDEO_DIR, filename)
+    video_id = os.path.splitext(filename)[0]  # Remove extension
+    # Option 1: Generate a presigned URL and redirect
+    presigned_url = generate_presigned_url(video_id)
+    if presigned_url:
+        return redirect(presigned_url)
+    # Option 2 (fallback): Download from S3 to local storage and serve
+    local_path = download_video_from_s3(video_id)
+    if local_path and os.path.exists(local_path):
+        # The download is always stored as "<video_id>.mp4", so serve that
+        # name rather than the raw request filename, whose extension may
+        # differ (the presigned-URL path resolves to the same .mp4 object).
+        return send_from_directory(VIDEO_DIR, f"{video_id}.mp4")
+    return jsonify({'error': 'Video not found in S3'}), 404
 @app.route('/save_annotations', methods=['POST'])
 @login_required
 @login_required
 def extract_clips_for_video(video_id):
     """Extract clips and start transcription for a video."""
+    # If using S3, ensure the video is downloaded first
+    if USE_S3_FOR_VIDEOS:
+        video_path = download_video_from_s3(video_id)
+        if not video_path:
+            return jsonify({
+                "status": "error",
+                "message": f"Failed to download video {video_id} from S3"
+            }), 404
     status = clip_extraction_status.get(video_id, {})
     if status.get("percent", 0) < 100:
         thread = threading.Thread(target=run_clip_extraction, args=(video_id,))
         print(f"- Running in HF Space: {is_hf_space}")
         print(f"- Auth bypass: {bypass_auth}")
         print(f"- Port: {os.getenv('PORT', 5000)}")
+        print(f"- S3 for videos: {USE_S3_FOR_VIDEOS}")
+        print(f"- S3 bucket: {S3_BUCKET}")
+        print(f"- S3 prefix: {S3_VIDEO_PREFIX}")
         print(f"- Available videos: {os.listdir(VIDEO_DIR) if os.path.exists(VIDEO_DIR) else 'None'}")
+        if USE_S3_FOR_VIDEOS:
+            try:
+                s3_videos = list_s3_videos()
+                print(f"- Available S3 videos: {s3_videos if s3_videos else 'None'}")
+            except Exception as e:
+                print(f"- Error listing S3 videos: {str(e)}")
         print("=" * 50)
         port = int(os.getenv('PORT', 5000))

get_transcription_with_amazon.py CHANGED Viewed

@@ -8,6 +8,41 @@ import requests
 import time
 from decimal import Decimal
 from typing import Any, Dict, List
 def extract_audio(video_path: str) -> str:
     """Extract audio from video file using ffmpeg.
@@ -113,17 +148,21 @@ def main() -> None:
     )
     video_filename = args.video_id + ".mp4"  # Source video file (with .mp4)
     video_path = os.path.join(base_dir, "data", "videos", video_filename)
     word_timestamps = get_word_timestamps(video_path)
     output_dir = os.path.join(base_dir, "data", "word_timestamps")
     os.makedirs(output_dir, exist_ok=True)
     output_path = os.path.join(output_dir, args.video_id + "_word_timestamps.json")
     with open(output_path, "w") as f:
         json.dump(word_timestamps, f, indent=4)
-    print(f"Word timestamps saved to: {output_path}")
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description="Get word timestamps for a given video file ID.")
-    parser.add_argument("video_id", help="Video file ID (without extension)")
-    args = parser.parse_args()
-    main()

 import time
 from decimal import Decimal
 from typing import Any, Dict, List
+from botocore.exceptions import ClientError
+# S3 settings are read from the same environment variables that
+# flask_app.py, the Dockerfile and .hf-space use, so the script and the web
+# app cannot drift apart. The previous hard-coded values remain the defaults.
+S3_BUCKET = os.environ.get('S3_BUCKET', "sorenson-ai-sb-scratch")
+S3_VIDEO_PREFIX = os.environ.get('S3_VIDEO_PREFIX', "awilkinson/kylie_dataset_videos_for_alignment_webapp/")
+# "true" (any case) uses S3; any other value uses local files only.
+USE_S3_FOR_VIDEOS = os.environ.get('USE_S3_FOR_VIDEOS', 'true').lower() == 'true'
+def get_s3_client():
+    """Return a boto3 S3 client configured from the AWS_* environment variables.
+
+    AWS_DEFAULT_REGION defaults to 'us-west-2'; missing credentials are
+    passed through as None.
+    """
+    region = os.getenv('AWS_DEFAULT_REGION', 'us-west-2')
+    access_key = os.getenv('AWS_ACCESS_KEY_ID')
+    secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')
+    return boto3.client(
+        's3',
+        region_name=region,
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+    )
+def download_video_from_s3(video_id, output_dir):
+    """Download ``<video_id>.mp4`` from S3 into ``output_dir``.
+
+    Returns the local path on success, or when the file is already there;
+    returns None when S3 raises a ClientError (the error is printed).
+    """
+    video_filename = f"{video_id}.mp4"
+    local_path = os.path.join(output_dir, video_filename)
+    # Skip the transfer when a local copy is already in place.
+    if os.path.exists(local_path):
+        print(f"Video {video_id} already exists locally.")
+        return local_path
+    try:
+        print(f"Downloading video {video_id} from S3...")
+        get_s3_client().download_file(
+            S3_BUCKET,
+            f"{S3_VIDEO_PREFIX}{video_filename}",
+            local_path,
+        )
+    except ClientError as e:
+        print(f"Error downloading video from S3: {str(e)}")
+        return None
+    else:
+        print(f"Video {video_id} downloaded successfully to {local_path}")
+        return local_path
 def extract_audio(video_path: str) -> str:
     """Extract audio from video file using ffmpeg.
     )
     video_filename = args.video_id + ".mp4"  # Source video file (with .mp4)
     video_path = os.path.join(base_dir, "data", "videos", video_filename)
+    # Check if we need to download from S3
+    if USE_S3_FOR_VIDEOS and not os.path.exists(video_path):
+        videos_dir = os.path.join(base_dir, "data", "videos")
+        os.makedirs(videos_dir, exist_ok=True)
+        download_video_from_s3(args.video_id, videos_dir)
+    if not os.path.exists(video_path):
+        print(f"Error: Video file not found: {video_path}")
+        return
     word_timestamps = get_word_timestamps(video_path)
     output_dir = os.path.join(base_dir, "data", "word_timestamps")
     os.makedirs(output_dir, exist_ok=True)
     output_path = os.path.join(output_dir, args.video_id + "_word_timestamps.json")
     with open(output_path, "w") as f:
         json.dump(word_timestamps, f, indent=4)
+    print(f"Word timestamps saved to: {output_path}")

templates/player.html CHANGED Viewed

@@ -137,8 +137,10 @@
         // Use the provided template video_id if available; it should be the base ID (without .mp4)
         const templateVideoId = "{{ video_id|default('') }}";
         let currentVideo = "";
         if (templateVideoId) {
             currentVideo = templateVideoId;
         } else {
             // Fallback: use /videos API and remove the .mp4 extension
             fetch('/videos')
@@ -150,8 +152,7 @@
                     }
                     if (videos.length > 0) {
                         currentVideo = videos[0].replace(/\.mp4$/, "");
-                        document.getElementById('video-source').src = `/video/${videos[0]}`;
-                        document.getElementById('video').load();
                     }
                 })
                 .catch(error => {
@@ -311,6 +312,28 @@
             div.textContent = timestamps.map(t => t.toFixed(2)).join(', ');
         }
         video.addEventListener('timeupdate', () => {
             currentTimeDisplay.textContent = `Current Time: ${video.currentTime.toFixed(2)}`;
         });

         // Use the provided template video_id if available; it should be the base ID (without .mp4)
         const templateVideoId = "{{ video_id|default('') }}";
         let currentVideo = "";
         if (templateVideoId) {
             currentVideo = templateVideoId;
+            loadVideoSource(currentVideo);
         } else {
             // Fallback: use /videos API and remove the .mp4 extension
             fetch('/videos')
                     }
                     if (videos.length > 0) {
                         currentVideo = videos[0].replace(/\.mp4$/, "");
+                        loadVideoSource(currentVideo);
                     }
                 })
                 .catch(error => {
             div.textContent = timestamps.map(t => t.toFixed(2)).join(', ');
         }
+        // Point the <video> element at the given video ID.
+        // The Flask route is probed with fetch() first: a redirect means the
+        // server handed back a presigned S3 URL, which is then used directly
+        // as the source; otherwise the route URL itself is used.
+        function loadVideoSource(videoId) {
+            const videoUrl = `/video/${videoId}.mp4`;
+            fetch(videoUrl)
+                .then(response => {
+                    const playable = response.redirected || response.ok;
+                    const sourceUrl = response.redirected ? response.url : videoUrl;
+                    // Only the status and final URL are needed. Cancel the
+                    // body so the probe does not keep a second download of
+                    // the video open next to the <video> element's request.
+                    if (response.body) {
+                        response.body.cancel();
+                    }
+                    if (!playable) {
+                        throw new Error('Video not found');
+                    }
+                    document.getElementById('video-source').src = sourceUrl;
+                    document.getElementById('video').load();
+                })
+                .catch(error => {
+                    document.getElementById('error-message').textContent = 'Error loading video: ' + error;
+                });
+        }
         video.addEventListener('timeupdate', () => {
             currentTimeDisplay.textContent = `Current Time: ${video.currentTime.toFixed(2)}`;
         });