import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import numpy as np
import librosa

# Load a pre-trained model and processor.
# Wav2Vec2ForCTC requires a CTC-trained checkpoint; "facebook/wav2vec2-base-960h"
# is a standard English ASR model compatible with this class.
model_name = "facebook/wav2vec2-base-960h"
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Function to transcribe audio using the model
def transcribe(audio):
    # With type="numpy", Gradio passes a (sample_rate, data) tuple
    sample_rate, data = audio

    # Convert stereo (samples x channels) to mono
    if data.ndim > 1:
        data = data.mean(axis=1)

    # Gradio delivers integer PCM; convert to float32 in [-1, 1]
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    else:
        data = data.astype(np.float32)

    # Resample to the 16 kHz rate the model expects, if necessary
    if sample_rate != 16000:
        data = librosa.resample(data, orig_sr=sample_rate, target_sr=16000)

    # Process the audio to match the model's input format
    inputs = processor(data, return_tensors="pt", sampling_rate=16000)

    # Get the model's predictions
    with torch.no_grad():
        logits = model(input_values=inputs.input_values).logits

    # Decode the predicted token IDs into text
    predicted_ids = logits.argmax(dim=-1)
    transcription = processor.decode(predicted_ids[0])

    return transcription

# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="numpy"),  # Audio arrives as a (sample_rate, numpy array) tuple
    outputs="text",                 # Output the transcribed text
    live=True                       # Optional: transcribe continuously as you speak
)

# Launch the interface
interface.launch()
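
# Optional sanity check -- a minimal sketch, assuming a local WAV file at
# "sample.wav" (hypothetical path). Uncomment and run it BEFORE
# interface.launch(), since launch() blocks until the server stops:
#
# waveform, sr = librosa.load("sample.wav", sr=None, mono=True)
# print(transcribe((sr, waveform)))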