# Macedonian-ASR / app.py
# cigol123's picture
# Create app.py
# 9ece3ee verified
# raw
# history blame
# 1.99 kB
import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import soundfile as sf
import numpy as np
from scipy import signal
# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Running on device: {device}")

# Load the processor and the Whisper checkpoint once at import time, then
# move the model onto the selected device so every request reuses it.
print("Loading Whisper model for Macedonian transcription...")
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
model = model.to(device)
print("✓ Model loaded successfully!")
def _load_mono_16k(audio_path):
    """Read an audio file and return a mono waveform sampled at 16 kHz."""
    target_sr = 16000
    waveform, sr = sf.read(audio_path)
    if waveform.ndim > 1:
        # Multi-channel input: average the channels down to mono.
        waveform = waveform.mean(axis=1)
    if sr != target_sr:
        # Polyphase resampling with the ratio reduced by the gcd. Unlike the
        # FFT-based signal.resample it does not treat the clip as periodic
        # and its cost does not depend on how the clip length factors.
        common = int(np.gcd(target_sr, sr))
        waveform = signal.resample_poly(waveform, target_sr // common, sr // common)
    return waveform


def process_audio(audio_path):
    """Transcribe Macedonian speech from an audio file.

    The audio is converted to mono 16 kHz and transcribed in consecutive
    30-second windows, because the Whisper processor pads or truncates its
    input to a single 30-second window; without the split, anything past
    the first window would be dropped silently.

    Args:
        audio_path: Path of the audio file to transcribe. May be None or
            empty when the request was submitted without a recording.

    Returns:
        The transcription text, or a message starting with
        "Error during transcription: " when the audio is missing, empty,
        or any processing step fails.
    """
    if not audio_path:
        return "Error during transcription: no audio provided"
    try:
        waveform = _load_mono_16k(audio_path)
        if waveform.size == 0:
            return "Error during transcription: audio file contains no samples"

        window = 30 * 16000  # samples in one 30-second window at 16 kHz
        pieces = []
        print("Transcribing...")
        for start in range(0, len(waveform), window):
            chunk = waveform[start:start + window]
            inputs = processor(chunk, sampling_rate=16000, return_tensors="pt").to(device)
            predicted_ids = model.generate(**inputs, language="mk")
            pieces.append(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])

        if len(pieces) == 1:
            # Clips that fit in one window keep exactly the decoded text.
            return pieces[0]
        return " ".join(piece.strip() for piece in pieces if piece.strip())
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error during transcription: {str(e)}"
# Web UI: one audio input (microphone recording or uploaded file, handed to
# process_audio as a file path) and one plain-text output.
demo = gr.Interface(
    title="Македонско препознавање на говор / Macedonian Speech Recognition",
    description="Качете аудио или користете микрофон за транскрипција на македонски говор / Upload audio or use microphone to transcribe Macedonian speech",
    fn=process_audio,
    inputs=gr.Audio(type="filepath", sources=["microphone", "upload"]),
    outputs="text",
)


if __name__ == "__main__":
    # Start the app only when the file is run as a script; share=True is
    # passed through to Gradio's launch unchanged.
    demo.launch(share=True)