import gradio as gr
import soundfile as sf
from scipy import signal
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
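
# CTC acoustic models from the Hugging Face Hub: Icelandic (trained on roughly
# 1,000 hours) and Faroese (roughly 100 hours), as the checkpoint names indicate.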
MODEL_IS="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)
pipe_is = pipeline(model=MODEL_IS)
pipe_fo = pipeline(model=MODEL_FO)
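
# Two decoding paths are prepared per language: the raw Wav2Vec2ForCTC model and
# its processor for a single pass over the whole file, and a transformers ASR
# pipeline, which can split long recordings into chunks (see chunk_length_s below).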
def readwav(a_f):
    # Load audio and normalise it to what the models expect: 16 kHz mono float32.
    wav, sr = sf.read(a_f, dtype=np.float32)
    if len(wav.shape) == 2:
        wav = wav.mean(1)  # downmix stereo to mono
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)  # resample to 16 kHz
    return wav
def recc(audio_file, model, processor):
    # Whole-file recognition: one forward pass over the entire waveform,
    # decoded greedily (argmax over the CTC logits, no language model).
    wav = readwav(audio_file)
    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        xcp = processor.batch_decode(pred_ids)
    return xcp[0]
def recis(audio_file):
    # Icelandic: whole-file transcript plus a chunked transcript for long audio.
    single_output = recc(audio_file, model_is, processor_is)
    chunk_output = pipe_is(audio_file, chunk_length_s=4)['text']
    return (single_output, chunk_output)

def recfo(audio_file):
    # Faroese: the same two decoding paths as recis.
    single_output = recc(audio_file, model_fo, processor_fo)
    chunk_output = pipe_fo(audio_file, chunk_length_s=4)['text']
    return (single_output, chunk_output)
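
# Each helper returns a (whole-file, chunked) pair, wired below to the two
# output Textboxes of the corresponding tab.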
bl = gr.Blocks()
with bl:
    gr.Markdown(
        """
        # W2V2 speech recognition
        Upload a file for recognition with
        https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h
        or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h
        - The Hugging Face 'Hosted inference API' on the model pages does not currently work, but this Space does.
        - There is no language model (yet), so the output can contain non-words.
        - Send errors/bugs to [email protected]
        """
    )
    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            with gr.Row():
                audio_file = gr.Audio(type="filepath")
                with gr.Column():
                    whole_output = gr.Textbox(label="whole-file recognition")
                    chunk_output = gr.Textbox(label="recognition with chunking")
            text_button = gr.Button("Recognise Icelandic")
            text_button.click(recis, inputs=audio_file, outputs=[whole_output, chunk_output])
        with gr.TabItem("Faroese"):
            with gr.Row():
                audio_file = gr.Audio(type="filepath")
                with gr.Column():
                    whole_output = gr.Textbox(label="whole-file recognition")
                    chunk_output = gr.Textbox(label="recognition with chunking")
            text_button = gr.Button("Recognise Faroese")
            text_button.click(recfo, inputs=audio_file, outputs=[whole_output, chunk_output])

bl.launch()
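# Note: on a Space, bl.launch() is all that is needed, as Hugging Face handles
# hosting; when running elsewhere, Gradio's launch(share=True) can create a
# temporary public URL instead.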