|
import torch |
|
import torchaudio |
|
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline |
|
import gradio as gr |
|
|
|
|
|
# Pick the compute device and the matching tensor precision together:
# half precision on the first CUDA GPU, full precision on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
    torch_dtype = torch.float16
else:
    DEVICE = "cpu"
    torch_dtype = torch.float32

# Sample rate (Hz) that audio is resampled to before feature extraction.
WHISPER_SAMPLE_RATE = 16000
|
|
|
|
|
# Hugging Face Hub identifier of the Whisper checkpoint used for transcription.
model_id = "ixxan/whisper-small-thugy20"

# Load the processor from the same identifier as the model so the two cannot
# drift apart if the checkpoint is ever changed.
processor = AutoProcessor.from_pretrained(model_id)

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(DEVICE)

# Ready-made ASR pipeline built from the objects loaded above.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=DEVICE,
)
|
|
|
def preprocess_audio(audio_path: str) -> torch.Tensor:
    """Load an audio file as a mono waveform at ``WHISPER_SAMPLE_RATE``.

    Args:
        audio_path: Path of the audio file, passed to ``torchaudio.load``.

    Returns:
        A 1-D tensor of samples, resampled to ``WHISPER_SAMPLE_RATE`` when the
        file uses a different rate and averaged across channels when the file
        has more than one.

    Raises:
        RuntimeError: If any step fails. The underlying exception is kept as
            ``__cause__`` so the original traceback is not lost.
    """
    try:
        waveform, source_rate = torchaudio.load(audio_path)

        if source_rate != WHISPER_SAMPLE_RATE:
            resample = torchaudio.transforms.Resample(
                orig_freq=source_rate, new_freq=WHISPER_SAMPLE_RATE
            )
            waveform = resample(waveform)

        if waveform.shape[0] > 1:
            # Downmix multi-channel audio by averaging over the channel axis.
            waveform = waveform.mean(dim=0)
        else:
            # Remove only the channel axis. A bare squeeze() would also drop
            # the time axis of a one-sample clip and return a 0-d tensor.
            waveform = waveform.squeeze(0)
        return waveform
    except Exception as e:
        raise RuntimeError(f"Error processing audio file: {str(e)}") from e
|
|
|
def transcribe(audio_path):
    """Transcribe the audio file at ``audio_path`` and return the text.

    Args:
        audio_path: Path of the recorded or uploaded audio file, or ``None``
            when no audio was supplied.

    Returns:
        The decoded transcription. When no audio is given, or when any step
        fails, a human-readable message is returned instead of raising, so
        the caller always receives a string.
    """
    if audio_path is None:
        return "No audio provided. Please record or upload an audio file."

    try:
        waveform = preprocess_audio(audio_path)
        features = processor(
            waveform,
            sampling_rate=WHISPER_SAMPLE_RATE,
            return_tensors="pt",
        ).input_features

        # The processor emits float32 features, but the model is loaded in
        # `torch_dtype` (float16 on CUDA). Cast while moving to the device so
        # generation does not fail with a dtype mismatch on GPU.
        features = features.to(DEVICE, dtype=torch_dtype)

        # NOTE(review): a single feature window presumably covers only the
        # first ~30 s of audio; longer recordings may need chunked inference
        # (the module-level `pipe` looks intended for that) -- confirm.
        predicted_ids = model.generate(features)
        return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # UI boundary: report the failure as text rather than propagating it.
        return f"Error transcribing audio: {e}"
|
|
|
|
|
# Gradio front end: a single audio input (delivered to `transcribe` as a file
# path) mapped to a plain-text output.
iface = gr.Interface(
    title="Uyghur Speech Recognition",
    description="Upload or record audio in Uyghur to get its transcription.",
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    examples=[],
)


# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()