"""Gradio app: transcribe uploaded videos with Whisper Tiny, pinned to CPU."""

import os
import tempfile
import time

import ffmpeg
import gradio as gr
import numpy as np
from transformers import pipeline


def load_model():
    """Build the Whisper Tiny ASR pipeline on CPU (free-tier friendly)."""
    return pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",
        device="cpu",  # Force CPU usage
    )


# Load model once at startup so every request reuses the same pipeline.
model = load_model()


def extract_audio(video_path):
    """Extract mono 16 kHz PCM WAV audio from *video_path*; return the WAV path.

    Whisper expects 16 kHz mono input, so downmix/resample happens here.
    The caller is responsible for deleting the returned file.
    """
    # NamedTemporaryFile replaces the deprecated, race-prone tempfile.mktemp:
    # the file is created atomically, then handed to ffmpeg to overwrite.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        audio_path = tmp.name
    (
        ffmpeg
        .input(video_path)
        .output(audio_path, ac=1, ar=16000, acodec='pcm_s16le')
        .overwrite_output()
        .run(quiet=True)
    )
    return audio_path


def transcribe_video(video_file):
    """Transcribe raw video bytes.

    Args:
        video_file: the uploaded video's raw bytes.

    Returns:
        (transcript, status_message) tuple.
    """
    start_time = time.time()

    # Persist the upload so ffmpeg can read it from disk.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video:
        tmp_video.write(video_file)
        video_path = tmp_video.name

    audio_path = None
    try:
        file_size = os.path.getsize(video_path) / (1024 * 1024)  # in MB
        audio_path = extract_audio(video_path)
        # chunk_length_s enables long-form transcription: without it the
        # Whisper pipeline truncates audio to its 30-second window.
        result = model(audio_path, chunk_length_s=30)
        transcript = result["text"]
    finally:
        # Always remove the temp files, even if extraction/transcription fails.
        os.unlink(video_path)
        if audio_path is not None and os.path.exists(audio_path):
            os.unlink(audio_path)

    process_time = time.time() - start_time
    return transcript, f"✅ Processed {file_size:.1f}MB video in {process_time:.1f} seconds"


# Gradio interface
with gr.Blocks(title="Free Video Transcriber", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎥 Free Video Transcriber")
    gr.Markdown("Upload any video to transcribe using Whisper Tiny (optimized for CPU)")

    with gr.Row():
        with gr.Column():
            video_input = gr.File(label="Upload Video", file_types=["video"])
            transcribe_btn = gr.Button("Transcribe Video", variant="primary")
        with gr.Column():
            transcript_output = gr.Textbox(label="Transcript", lines=10, interactive=True)
            status_output = gr.Textbox(label="Status", interactive=False)
            download_btn = gr.DownloadButton(label="Download Transcript")

    def process_video(video_file):
        """Button handler: read the upload, transcribe, and prepare the download."""
        if video_file is None:
            return "", "Please upload a video file first", None

        # Read file content from the path Gradio saved the upload to.
        with open(video_file.name, "rb") as f:
            video_bytes = f.read()

        transcript, status = transcribe_video(video_bytes)

        # DownloadButton expects a file path, not raw text: write the
        # transcript to a temp .txt file and return its path.
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".txt", encoding="utf-8"
        ) as tmp_txt:
            tmp_txt.write(transcript)
            transcript_path = tmp_txt.name

        return transcript, status, transcript_path

    # Set up button actions
    transcribe_btn.click(
        fn=process_video,
        inputs=video_input,
        outputs=[transcript_output, status_output, download_btn],
    )

    # Info section
    with gr.Accordion("ℹ️ About this app", open=False):
        gr.Markdown("""
        **How it works:**
        - Uses OpenAI's Whisper Tiny model optimized for CPU
        - Extracts audio from video using FFmpeg
        - Transcribes audio to text
        - Works with MP4, MOV, AVI, MKV, WEBM formats

        **Performance notes:**
        - 1 min video: ~10-20 seconds
        - 5 min video: ~1-2 minutes
        - 10 min video: ~2-4 minutes

        **Optimized for:** Hugging Face Spaces free tier (CPU only)
        """)

# Launch the app
if __name__ == "__main__":
    demo.launch()