import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Load pre-trained model and processor
MODEL_ID = "itsbib/wav2vec2-large-xls-r-300m-nepali-english-asr"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
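# Note: Wav2Vec2Processor bundles the feature extractor (raw audio -> input_values)
# and the CTC tokenizer (token ids -> text); from_pretrained() downloads and caches
# the checkpoint from the Hugging Face Hub on first use.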
# Function to transcribe audio
def transcribe(audio):
    if audio is None:  # Nothing was recorded or uploaded
        return ""
    # With type="filepath", Gradio passes the recorded/uploaded audio as a file path string

    # Load and resample audio to the 16 kHz rate the model expects
    audio_input, _ = librosa.load(audio, sr=16000)
    inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True)

    # Generate predictions
    with torch.no_grad():
        logits = model(inputs.input_values).logits

    # Decode predicted text (greedy CTC decoding)
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
# Create Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),  # Allows both recording & file upload
    outputs="text",
    title="Nepali to English ASR",
    description="🎤 Record live or upload a Nepali audio file, and this model will transcribe it into English text."
)
iface.launch() | |
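# Optional quick check without the web UI (a minimal sketch, not part of the Space):
# "sample.wav" below is a hypothetical local 16 kHz audio file used only for illustration.
# Uncomment to run the transcription function directly:
# print(transcribe("sample.wav"))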