from fastapi import FastAPI, UploadFile, File
from fastapi.responses import HTMLResponse
from transformers import pipeline
import torch
import tempfile
import os
import time

# Define FastAPI app
app = FastAPI()

# Load the Whisper model once during startup
device = 0 if torch.cuda.is_available() else -1  # Use GPU if available, otherwise CPU
asr_pipeline = pipeline(model="openai/whisper-small", device=device)  # Initialize Whisper model
# To pin the transcription language (e.g. Portuguese), pass generate_kwargs rather than a bare language argument:
# asr_pipeline = pipeline(model="openai/whisper-small", device=device, generate_kwargs={"language": "portuguese", "task": "transcribe"})


# Basic GET endpoint
@app.get("/")
def read_root():
    return {"message": "Welcome to the FastAPI app on Hugging Face Spaces!"}

# POST endpoint to transcribe audio
@app.post("/transcribe/")
async def transcribe_audio(file: UploadFile = File(...)):
    start_time = time.time()

    # Save the uploaded file using a temporary file manager
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        temp_audio_file.write(await file.read())
        temp_file_path = temp_audio_file.name

    # Transcribe the audio with long-form generation enabled
    transcription_start = time.time()
    transcription = asr_pipeline(temp_file_path, return_timestamps=True)  # Enable timestamp return for long audio files
    transcription_end = time.time()

    # Clean up temporary file after use
    os.remove(temp_file_path)

    # Log time durations
    end_time = time.time()
    print(f"Time to transcribe audio: {transcription_end - transcription_start:.4f} seconds")
    print(f"Total execution time: {end_time - start_time:.4f} seconds")

    return {"transcription": transcription['text']}

@app.get("/playground/", response_class=HTMLResponse)
def playground():
    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Voice Recorder</title>
    </head>
    <body>
        <h1>Record your voice</h1>
        <button id="startBtn">Start Recording</button>
        <button id="stopBtn" disabled>Stop Recording</button>
        <p id="status">Press start to record your voice...</p>

        <audio id="audioPlayback" controls style="display:none;"></audio>
        <script>
            let mediaRecorder;
            let audioChunks = [];

            const startBtn = document.getElementById('startBtn');
            const stopBtn = document.getElementById('stopBtn');
            const status = document.getElementById('status');
            const audioPlayback = document.getElementById('audioPlayback');

            // Start Recording
            startBtn.addEventListener('click', async () => {
                const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
                mediaRecorder = new MediaRecorder(stream);
                mediaRecorder.start();

                status.textContent = 'Recording...';
                startBtn.disabled = true;
                stopBtn.disabled = false;

                mediaRecorder.ondataavailable = event => {
                    audioChunks.push(event.data);
                };
            });

            // Stop Recording
            stopBtn.addEventListener('click', () => {
                // Register the handler before stopping so it runs once the recorder has flushed its data
                mediaRecorder.onstop = async () => {
                    status.textContent = 'Recording stopped. Preparing to send...';
                    // MediaRecorder typically records webm/ogg; the "wav" name and type here are only labels,
                    // the server decodes the uploaded bytes by content rather than by extension
                    const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
                    const audioUrl = URL.createObjectURL(audioBlob);
                    audioPlayback.src = audioUrl;
                    audioPlayback.style.display = 'block';
                    audioChunks = [];

                    // Send audio blob to FastAPI endpoint
                    const formData = new FormData();
                    formData.append('file', audioBlob, 'recording.wav');

                    const response = await fetch('/transcribe/', {
                        method: 'POST',
                        body: formData,
                    });

                    const result = await response.json();
                    status.textContent = 'Transcription: ' + result.transcription;
                };
                mediaRecorder.stop();

                startBtn.disabled = false;
                stopBtn.disabled = true;
            });
        </script>
    </body>
    </html>
    """
    return HTMLResponse(content=html_content)

# If running as the main module, start Uvicorn
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
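
# Example client call (a minimal sketch, assuming the server is running locally on port 7860
# and that a file named "sample.wav" exists; the host, port, and filename are illustrative):
#
#   import requests
#
#   with open("sample.wav", "rb") as f:
#       resp = requests.post(
#           "http://localhost:7860/transcribe/",
#           files={"file": ("sample.wav", f, "audio/wav")},
#       )
#   print(resp.json()["transcription"])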