import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
# Load pre-trained model and processor
MODEL_ID = "itsbib/wav2vec2-large-xls-r-300m-nepali-english-asr"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
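# Optional (an assumption, not part of the original Space): move the model to a GPU when one
# is available; the tensors built in transcribe() would then also need to be moved with
# inputs.input_values.to(model.device) before the forward pass.
#   if torch.cuda.is_available():
#       model = model.to("cuda")
model.eval()  # explicit inference mode (from_pretrained already loads models in eval mode)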
# Function to transcribe audio
def transcribe(audio):
    # With type="filepath", Gradio passes the recording or upload as a file path string
    if audio is None:
        return "No audio provided."
    # Load and resample the audio to the 16 kHz rate expected by wav2vec2
    audio_input, _ = librosa.load(audio, sr=16000)
    inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True)
    # Generate predictions (no gradients needed for inference)
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    # Greedy CTC decoding: pick the most likely token per frame, then collapse repeats and blanks
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
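# Quick local sanity check (hypothetical file name, not shipped with the Space):
#   print(transcribe("sample_nepali.wav"))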
# Create Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),  # Allows both recording & file upload
    outputs="text",
    title="Nepali to English ASR",
    description="🎀 Record live or upload a Nepali audio file, and this model will transcribe it into English text.",
)
iface.launch()
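# launch() with defaults is enough on Hugging Face Spaces; when running locally, Gradio can
# also create a temporary public URL with iface.launch(share=True).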