import gradio as gr
import soundfile as sf
import numpy as np
import torch
from scipy import signal
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

MODEL_IS = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
MODEL_FO = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"

torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)


def recc(audio_file, model, processor):
    """Transcribe one audio file with the given wav2vec2 model and processor."""
    wav, sr = sf.read(audio_file, dtype=np.float32)
    # Downmix stereo to mono.
    if len(wav.shape) == 2:
        wav = wav.mean(1)
    # Resample to the 16 kHz rate the models were trained on.
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        # batch_decode returns a list of strings; there is only one item here.
        xcp = processor.batch_decode(pred_ids)
    return xcp[0]


# Gradio click handlers only accept Gradio components as inputs, so wrap
# recc() per language instead of passing models and processors as inputs.
def recc_is(audio_file):
    return recc(audio_file, model_is, processor_is)


def recc_fo(audio_file):
    return recc(audio_file, model_fo, processor_fo)


bl = gr.Blocks()
with bl:
    gr.Markdown(
        """
# W2V2 speech recognition

Upload a file for recognition with
https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h
or
https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h

- For some reason, the Hugging Face 'Hosted inference API' on the model pages does not work, but this demo does.
- There is no language model (yet), so the output can contain non-words.
"""
    )

    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            with gr.Row():
                audio_file_is = gr.Audio(type="filepath")
                text_output_is = gr.Textbox()
            text_button_is = gr.Button("Recognise")
            text_button_is.click(recc_is, inputs=audio_file_is, outputs=text_output_is)

        with gr.TabItem("Faroese"):
            with gr.Row():
                audio_file_fo = gr.Audio(type="filepath")
                text_output_fo = gr.Textbox()
            text_button_fo = gr.Button("Recognise")
            text_button_fo.click(recc_fo, inputs=audio_file_fo, outputs=text_output_fo)

bl.launch()
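
# Illustrative sanity check of the recogniser without the UI: a minimal sketch,
# assuming a local 16 kHz-or-other WAV file named "sample.wav" exists (the
# filename is hypothetical). Run it before bl.launch(), or in a separate
# session, since launch() blocks until the server is stopped.
#
#     print(recc_is("sample.wav"))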