Spaces:

cigol123
/

Macedonian-ASR

Running

File size: 2,520 Bytes

9ece3ee
 
 
 
 
 
4360654
 
 
58aeb8f
 
 
ea16add
58aeb8f
20a10e3
58aeb8f
9ece3ee
54026e9
9b83916
54026e9
 
 
 
 
 
 
 
 
 
 
9b83916
 
9ece3ee
9b83916
9ece3ee
 
9b83916
 
 
 
 
 
 
 
 
 
 
 
 
9ece3ee
 
 
 
 
 
 
 
 
 
 
9b83916

import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import soundfile as sf
import numpy as np
from scipy import signal
import os

# Single directory shared by every framework cache used in this app.
cache_dir = "/tmp/.cache"

# Create the directory up front (no error if it is already there).
os.makedirs(cache_dir, exist_ok=True)

# Point the Transformers, Datasets and PyTorch cache variables at it.
# NOTE(review): these are assigned after the library imports at the top of
# the file; any library that reads its variable at import time will not
# pick the value up -- confirm.
os.environ.update(
    {
        "TRANSFORMERS_CACHE": cache_dir,
        "HF_DATASETS_CACHE": cache_dir,
        "TORCH_HOME": cache_dir,
    }
)

# Load the base Whisper model and processor
def load_model():
    """Build the Whisper processor and model and apply the fine-tuned weights.

    The processor and the model architecture are created from the
    ``openai/whisper-large-v3`` checkpoint; the model parameters are then
    overwritten with the state dict downloaded from the Macedonian-ASR
    repository.

    Returns:
        tuple: ``(processor, model)`` -- the ``WhisperProcessor`` and the
        ``WhisperForConditionalGeneration`` carrying the fine-tuned weights.
    """
    base_checkpoint = "openai/whisper-large-v3"
    finetuned_weights_url = (
        "https://huggingface.co/Macedonian-ASR/whisper-large-v3-macedonian-asr"
        "/resolve/main/pytorch_model.bin"
    )

    print("Loading base Whisper model and processor...")
    # Pass the cache location explicitly. The cache environment variables in
    # this file are assigned only after `transformers` has been imported, so
    # relying on them alone can leave downloads going to the library's
    # default cache instead of the writable directory.
    processor = WhisperProcessor.from_pretrained(
        base_checkpoint, cache_dir=cache_dir
    )
    model = WhisperForConditionalGeneration.from_pretrained(
        base_checkpoint, cache_dir=cache_dir
    )

    # Load the fine-tuned weights from the Macedonian-ASR repository
    print("Loading fine-tuned weights from Macedonian-ASR...")
    # NOTE(review): this deserialises a checkpoint fetched from a remote URL;
    # consider `weights_only=True` if the installed torch version supports it.
    state_dict = torch.hub.load_state_dict_from_url(
        finetuned_weights_url,
        map_location="cpu",
        model_dir=cache_dir,  # Save downloaded weights to the writable cache directory
    )
    model.load_state_dict(state_dict)
    print("✓ Model and processor loaded successfully!")
    return processor, model

# Instantiate once at import time; process_audio reads these module-level
# objects on every call instead of reloading the model per request.
processor, model = load_model()

def process_audio(audio_path):
    """Transcribe an audio file into Macedonian text.

    The file is read with ``soundfile``, mixed down to mono, resampled to
    16 kHz when needed, and passed through the module-level ``processor``
    and ``model``.

    Args:
        audio_path: Path of the audio file to transcribe. May be ``None`` or
            empty when no audio was supplied.

    Returns:
        str: The decoded transcription, or ``""`` when there is no audio to
        transcribe (missing path or a file containing zero samples).
    """
    target_sr = 16000

    # The UI can invoke this callback without a recording or upload, in which
    # case there is no path to read; return an empty result instead of
    # crashing inside sf.read.
    if not audio_path:
        return ""

    waveform, sr = sf.read(audio_path)
    if waveform.ndim > 1:  # Convert multi-channel audio to mono
        waveform = waveform.mean(axis=1)
    if waveform.size == 0:  # Nothing to resample or transcribe
        return ""

    if sr != target_sr:
        # Polyphase resampling by the reduced ratio target_sr / sr. Unlike the
        # FFT-based signal.resample it does not treat the clip as periodic,
        # and its cost does not depend on the factorisation of the length.
        divisor = int(np.gcd(target_sr, sr))
        waveform = signal.resample_poly(
            waveform, target_sr // divisor, sr // divisor
        )

    # Extract features, generate token ids, and decode the first (only) result
    inputs = processor(waveform, sampling_rate=target_sr, return_tensors="pt")
    predicted_ids = model.generate(**inputs, language="mk")
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

# Gradio interface: bilingual (Macedonian / English) UI text
APP_TITLE = "Македонско препознавање на говор / Macedonian Speech Recognition"
APP_DESCRIPTION = "Качете аудио или користете микрофон за транскрипција на македонски говор / Upload audio or use microphone to transcribe Macedonian speech"

# Audio can come from the microphone or an upload; the callback receives a file path.
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")

# Wire the audio input to process_audio and show its return value as text.
demo = gr.Interface(
    fn=process_audio,
    inputs=audio_input,
    outputs="text",
    title=APP_TITLE,
    description=APP_DESCRIPTION,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()