from fastapi import FastAPI, UploadFile, File
from fastapi.responses import HTMLResponse
from transformers import pipeline
import torch
import tempfile
import os
import time

# Define FastAPI app
app = FastAPI()

# Load the Whisper model once during startup
device = 0 if torch.cuda.is_available() else -1  # Use GPU if available, otherwise CPU
asr_pipeline = pipeline(model="openai/whisper-small", device=device)  # Initialize Whisper model
# To force a specific language (e.g. Portuguese) instead of auto-detection,
# pass it through generate_kwargs; `language` is not a valid pipeline() argument:
# asr_pipeline = pipeline(
#     model="openai/whisper-small", device=device, generate_kwargs={"language": "pt"}
# )


# Basic GET endpoint
@app.get("/")
def read_root():
    return {"message": "Welcome to the FastAPI app on Hugging Face Spaces!"}


# POST endpoint to transcribe audio
@app.post("/transcribe/")
async def transcribe_audio(file: UploadFile = File(...)):
    start_time = time.time()

    # Save the uploaded file using a temporary file manager
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        temp_audio_file.write(await file.read())
        temp_file_path = temp_audio_file.name

    # Transcribe the audio; return_timestamps=True enables long-form generation
    # for audio longer than Whisper's 30-second window
    transcription_start = time.time()
    try:
        transcription = asr_pipeline(temp_file_path, return_timestamps=True)
    finally:
        # Clean up temporary file even if transcription fails
        os.remove(temp_file_path)
    transcription_end = time.time()

    # Log time durations
    end_time = time.time()
    print(f"Time to transcribe audio: {transcription_end - transcription_start:.4f} seconds")
    print(f"Total execution time: {end_time - start_time:.4f} seconds")

    return {"transcription": transcription["text"]}


@app.get("/playground/", response_class=HTMLResponse)
def playground():
    html_content = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Transcription Playground</title>
    </head>
    <body>
        <h1>Transcription Playground</h1>
        <p id="status">Press start to record your voice...</p>
        <button id="start">Start</button>
        <button id="stop" disabled>Stop</button>
        <p id="result"></p>
""" return HTMLResponse(content=html_content) # If running as the main module, start Uvicorn if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=7860)