# Source: Hugging Face Space "garyuzair/VideoTranscriber" (status: Running).
# File size: 3,470 bytes.

import gradio as gr
from transformers import pipeline
import tempfile
import os
import time
import ffmpeg
import numpy as np

# Build the CPU speech-recognition pipeline (called once at startup)
def load_model():
    """Create the Whisper Tiny speech-recognition pipeline on the CPU.

    ``chunk_length_s=30`` enables the pipeline's chunked long-form mode.
    Whisper works on 30-second windows, so without chunking recordings
    longer than that are not transcribed in full, while this app is meant
    for videos that are several minutes long.

    Returns:
        The ``transformers`` pipeline object. Calling it with an audio file
        path yields a dict whose ``"text"`` entry is the transcript.
    """
    return pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",
        chunk_length_s=30,  # split long audio into 30 s chunks
        device="cpu",  # Force CPU usage
    )

# Load the model once at import time; transcribe_video() reuses this
# module-level pipeline for every request instead of rebuilding it.
model = load_model()

def extract_audio(video_path):
    """Extract the audio track of a video into a temporary WAV file.

    The audio is written as mono, 16 kHz, 16-bit PCM.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the newly created ``.wav`` file. The caller owns the file
        and is responsible for deleting it.

    Raises:
        Any exception raised by the ffmpeg run is re-raised unchanged,
        after the partially written output file has been removed.
    """
    # mkstemp creates the file atomically; the deprecated tempfile.mktemp
    # only returns a name that another process could claim first.
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # ffmpeg reopens the path itself

    try:
        (
            ffmpeg
            .input(video_path)
            .output(audio_path, ac=1, ar=16000, acodec='pcm_s16le')
            .overwrite_output()  # required: the placeholder file already exists
            .run(quiet=True)
        )
    except Exception:
        # Do not leave an empty or truncated WAV behind on failure.
        if os.path.exists(audio_path):
            os.unlink(audio_path)
        raise

    return audio_path

def transcribe_video(video_file):
    """Transcribe the speech in a video and report how long it took.

    The raw bytes are written to a temporary ``.mp4`` file, the audio is
    extracted with ``extract_audio`` and passed to the module-level
    ``model``. Both temporary files are removed even when extraction or
    transcription fails.

    Args:
        video_file: The video content as a bytes-like object.

    Returns:
        A ``(transcript, status)`` tuple: the recognised text and a
        human-readable summary containing the file size and elapsed time.

    Raises:
        Whatever ``extract_audio`` or ``model`` raise; the temporary files
        are cleaned up before the exception propagates.
    """
    start_time = time.time()

    video_path = None
    audio_path = None
    try:
        # Save video to temp file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
            video_path = tmp_video.name
            tmp_video.write(video_file)

        # Get file size
        file_size = os.path.getsize(video_path) / (1024 * 1024)  # in MB

        # Extract audio
        audio_path = extract_audio(video_path)

        # The video is no longer needed; free the disk space before the
        # (slow) transcription step.
        os.unlink(video_path)
        video_path = None

        # Transcribe
        result = model(audio_path)
        transcript = result["text"]
    finally:
        # Remove whatever temp files still exist, on success and on failure.
        for leftover in (video_path, audio_path):
            if leftover is not None and os.path.exists(leftover):
                os.unlink(leftover)

    process_time = time.time() - start_time

    return transcript, f"✅ Processed {file_size:.1f}MB video in {process_time:.1f} seconds"

# Gradio interface: upload + button on the left, transcript, status and
# download button on the right.
with gr.Blocks(title="Free Video Transcriber", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎥 Free Video Transcriber")
    gr.Markdown("Upload any video to transcribe using Whisper Tiny (optimized for CPU)")
    
    with gr.Row():
        with gr.Column():
            video_input = gr.File(label="Upload Video", file_types=["video"])
            transcribe_btn = gr.Button("Transcribe Video", variant="primary")
            
        with gr.Column():
            transcript_output = gr.Textbox(label="Transcript", lines=10, interactive=True)
            status_output = gr.Textbox(label="Status", interactive=False)
            download_btn = gr.DownloadButton(label="Download Transcript")
    
    # Processing function
    def process_video(video_file):
        """Handle a click on the transcribe button.

        Args:
            video_file: Value of the ``gr.File`` input, or ``None`` when
                nothing has been uploaded.

        Returns:
            A ``(transcript, status, download_path)`` tuple matching the
            three output components. ``download_path`` is the path of a
            ``.txt`` file holding the transcript, or ``None`` when there
            is nothing to download.

        Raises:
            gr.Error: If transcription fails, so the UI shows the reason.
        """
        if video_file is None:
            return "", "Please upload a video file first", None
        
        # The upload may arrive as a plain path string or as an object
        # exposing the path through ``.name``; accept both.
        upload_path = getattr(video_file, "name", video_file)

        # Read file content
        with open(upload_path, "rb") as f:
            video_bytes = f.read()
        
        try:
            transcript, status = transcribe_video(video_bytes)
        except Exception as exc:
            # UI boundary: turn the failure into a visible Gradio error
            # while keeping the original exception chained for the logs.
            raise gr.Error(f"Transcription failed: {exc}") from exc

        # DownloadButton expects a file path, not the text itself, so the
        # transcript is written to a temp file that Gradio can serve.
        with tempfile.NamedTemporaryFile(
            "w",
            suffix=".txt",
            prefix="transcript_",
            delete=False,
            encoding="utf-8",
        ) as transcript_file:
            transcript_file.write(transcript)

        return transcript, status, transcript_file.name
    
    # Set up button actions
    transcribe_btn.click(
        fn=process_video,
        inputs=video_input,
        outputs=[transcript_output, status_output, download_btn]
    )
    
    # Info section
    with gr.Accordion("ℹ️ About this app", open=False):
        gr.Markdown("""
        **How it works:**
        - Uses OpenAI's Whisper Tiny model optimized for CPU
        - Extracts audio from video using FFmpeg
        - Transcribes audio to text
        - Works with MP4, MOV, AVI, MKV, WEBM formats
        
        **Performance notes:**
        - 1 min video: ~10-20 seconds
        - 5 min video: ~1-2 minutes
        - 10 min video: ~2-4 minutes
        
        **Optimized for:** Hugging Face Spaces free tier (CPU only)
        """)

# Launch the app: start the Gradio server only when this file is run as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()