Spaces:

cigol123
/

Macedonian-ASR

Running

App Files Community

Macedonian-ASR / app.py

cigol123

Create app.py

9ece3ee verified 18 days ago

raw

history blame

1.99 kB

	import gradio as gr
	import torch
	from transformers import WhisperProcessor, WhisperForConditionalGeneration
	import soundfile as sf
	import numpy as np
	from scipy import signal

	# Pick the compute device: CUDA when torch reports a usable GPU, CPU otherwise.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"
	print(f"Running on device: {device}")

	# Load the Whisper processor and model once at import time, then move the
	# model onto the selected device so every request reuses the same instance.
	print("Loading Whisper model for Macedonian transcription...")
	processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
	model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
	model = model.to(device)
	print("✓ Model loaded successfully!")

	def process_audio(audio_path):
	    """Transcribe an audio file to Macedonian text with the module-level Whisper model.

	    Args:
	        audio_path: Path to an audio file readable by ``soundfile``. May be
	            ``None`` or empty, e.g. if the caller submits without a recording.

	    Returns:
	        The transcription string on success. On any failure, a message that
	        starts with ``"Error during transcription:"`` is returned instead of
	        raising, so the error text ends up in the caller's text output.
	    """
	    target_sr = 16000  # sampling rate passed to the processor below

	    # Guard before touching the filesystem: a missing path would otherwise
	    # surface as an opaque library error from sf.read.
	    if not audio_path:
	        return "Error during transcription: no audio provided"

	    try:
	        waveform, sr = sf.read(audio_path)
	        if waveform.ndim > 1:  # multi-channel: average the channels down to mono
	            waveform = waveform.mean(axis=1)
	        if waveform.size == 0:
	            return "Error during transcription: audio is empty"
	        if sr != target_sr:
	            # Polyphase FIR resampling: unlike the FFT-based signal.resample it
	            # does not treat the clip as periodic and does not degrade badly
	            # when the sample count is a large prime.
	            waveform = signal.resample_poly(waveform, target_sr, sr)

	        # NOTE(review): Whisper's feature extractor presumably works on fixed
	        # 30 s windows, so longer clips may be truncated — confirm if long
	        # recordings must be supported.
	        inputs = processor(waveform, sampling_rate=target_sr, return_tensors="pt").to(device)
	        print("Transcribing...")
	        predicted_ids = model.generate(**inputs, language="mk")
	        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
	        return transcription
	    except Exception as e:
	        # Deliberate catch-all at the UI boundary: report the failure as text.
	        return f"Error during transcription: {str(e)}"

	# Gradio interface: one audio input (microphone or upload) mapped to a text output.
	_TITLE = "Македонско препознавање на говор / Macedonian Speech Recognition"
	_DESCRIPTION = "Качете аудио или користете микрофон за транскрипција на македонски говор / Upload audio or use microphone to transcribe Macedonian speech"

	# type="filepath" makes the component hand process_audio a path on disk.
	_audio_input = gr.Audio(type="filepath", sources=["microphone", "upload"])

	demo = gr.Interface(
	    fn=process_audio,
	    inputs=_audio_input,
	    outputs="text",
	    title=_TITLE,
	    description=_DESCRIPTION,
	)

	# Launch only when the file is executed as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch(share=True)