Spaces:

cigol123
/

Macedonian-ASR

Running

App Files Files Community

Macedonian-ASR / app.py

cigol123

Update app.py

54026e9 verified 18 days ago

raw

history blame

2.52 kB

	import gradio as gr
	import torch
	from transformers import WhisperProcessor, WhisperForConditionalGeneration
	import soundfile as sf
	import numpy as np
	from scipy import signal
	import os

	# All download caches are pointed at a single scratch directory that is
	# expected to be writable in the hosting environment.
	# NOTE(review): these variables are assigned after `transformers` has
	# already been imported at the top of the file; confirm the library still
	# honours them when set this late.
	cache_dir = "/tmp/.cache"
	os.environ.update(
	    dict.fromkeys(
	        ("TRANSFORMERS_CACHE", "HF_DATASETS_CACHE", "TORCH_HOME"),
	        cache_dir,
	    )
	)

	# Create the directory up front; an already existing one is not an error.
	os.makedirs(cache_dir, exist_ok=True)

	def load_model():
	    """Load the Whisper processor and model with the Macedonian fine-tune applied.

	    The base ``openai/whisper-large-v3`` processor and model are loaded first,
	    then the model's parameters are overwritten with the fine-tuned state dict
	    downloaded from the Macedonian-ASR repository.

	    Returns:
	        tuple: ``(processor, model)`` - the ``WhisperProcessor`` and the
	        ``WhisperForConditionalGeneration`` carrying the fine-tuned weights.
	    """
	    base_checkpoint = "openai/whisper-large-v3"
	    finetuned_weights_url = (
	        "https://huggingface.co/Macedonian-ASR/whisper-large-v3-macedonian-asr/resolve/main/pytorch_model.bin"
	    )

	    print("Loading base Whisper model and processor...")
	    # Pass cache_dir explicitly: the cache environment variables in this file
	    # are assigned after `transformers` is imported, so relying on them alone
	    # may leave downloads pointed at the default (possibly read-only) cache.
	    processor = WhisperProcessor.from_pretrained(base_checkpoint, cache_dir=cache_dir)
	    model = WhisperForConditionalGeneration.from_pretrained(base_checkpoint, cache_dir=cache_dir)

	    # Load the fine-tuned weights from the Macedonian-ASR repository
	    print("Loading fine-tuned weights from Macedonian-ASR...")
	    # NOTE(review): this deserialises a remote .bin file via torch's pickle-based
	    # loader; consider weights_only=True if the installed torch version supports
	    # it for load_state_dict_from_url.
	    state_dict = torch.hub.load_state_dict_from_url(
	        finetuned_weights_url,
	        map_location="cpu",
	        model_dir=cache_dir,  # Save downloaded weights to the writable cache directory
	    )
	    model.load_state_dict(state_dict)
	    print("✓ Model and processor loaded successfully!")
	    return processor, model

	# Load once at module level so that every call to process_audio reuses the
	# same processor and model objects instead of loading them again.
	processor, model = load_model()

	def process_audio(audio_path):
	    """Transcribe an audio file to Macedonian text.

	    The file is read with soundfile, mixed down to mono, resampled to 16 kHz
	    when needed, and passed through the module-level ``processor`` and
	    ``model``.

	    Args:
	        audio_path: Path of the audio file to transcribe. ``None`` or an empty
	            string (no recording/upload supplied) is accepted.

	    Returns:
	        str: The decoded transcription, or ``""`` when there is no audio to
	        transcribe (missing path or a file with zero samples).
	    """
	    from math import gcd

	    target_sr = 16000

	    # No recording or upload was supplied: return an empty result instead of
	    # failing inside sf.read.
	    if not audio_path:
	        return ""

	    waveform, sr = sf.read(audio_path)
	    if waveform.ndim > 1:  # Convert multi-channel audio to mono
	        waveform = waveform.mean(axis=1)
	    if waveform.size == 0:
	        # A zero-length clip has nothing to resample or decode.
	        return ""
	    if sr != target_sr:
	        # Polyphase resampling: unlike the FFT-based signal.resample it does
	        # not treat the clip as periodic and does not slow down badly for
	        # awkward sample counts.
	        divisor = gcd(target_sr, sr)
	        waveform = signal.resample_poly(waveform, target_sr // divisor, sr // divisor)

	    # Extract features, generate token ids with the language fixed to
	    # Macedonian, and decode the first (only) sequence.
	    inputs = processor(waveform, sampling_rate=target_sr, return_tensors="pt")
	    predicted_ids = model.generate(**inputs, language="mk")
	    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

	# Gradio interface: a single audio input (microphone or upload, handed to
	# the callback as a file path) is routed through process_audio and the
	# returned string is shown as text.
	_audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
	_title = "Македонско препознавање на говор / Macedonian Speech Recognition"
	_description = "Качете аудио или користете микрофон за транскрипција на македонски говор / Upload audio or use microphone to transcribe Macedonian speech"

	demo = gr.Interface(
	    fn=process_audio,
	    inputs=_audio_input,
	    outputs="text",
	    title=_title,
	    description=_description,
	)

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()