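# Gradio demo: speech recognition for Icelandic (Wav2Vec2 and Whisper) and
# Faroese (Wav2Vec2), running the models locally rather than through the
# hosted inference API.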
import gradio as gr
import soundfile as sf
from scipy import signal
import numpy as np
import torch, torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, WhisperForConditionalGeneration, WhisperProcessor, pipeline
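# Checkpoints from the Hugging Face Hub: Icelandic and Faroese Wav2Vec2-XLSR
# CTC models, plus an Icelandic fine-tune of Whisper-large.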
MODEL_IS="language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
MODEL_WHIS= "language-and-voice-lab/whisper-large-icelandic-62640-steps-967h"
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)
pipe_is = pipeline(model=MODEL_IS)
pipe_fo = pipeline(model=MODEL_FO)
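# The pipeline() wrappers load each Wav2Vec2 checkpoint a second time (on CPU
# by default). They are kept alongside the explicit model/processor pairs
# because pipelines support chunked decoding of long files via chunk_length_s.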
whp_is = WhisperProcessor.from_pretrained(MODEL_WHIS)
whm_is = WhisperForConditionalGeneration.from_pretrained(MODEL_WHIS)
def readwav(a_f):
    """Read an audio file as mono float32 resampled to 16 kHz."""
    wav, sr = sf.read(a_f, dtype=np.float32)
    if len(wav.shape) == 2:
        wav = wav.mean(1)  # downmix stereo to mono
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav
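# A hypothetical equivalent using the already-imported torchaudio instead of
# scipy's FFT-based resampler (not what this Space runs, just a sketch):
#   wav = torchaudio.functional.resample(torch.from_numpy(wav), sr, 16000).numpy()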
def recc(audio_file, model, processor):
    """Whole-file recognition with a Wav2Vec2 CTC model."""
    wav = readwav(audio_file)
    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
    xcp = processor.batch_decode(pred_ids)
    return xcp[0]
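# recc decodes greedily (frame-wise argmax over the CTC logits) with no
# language model on top, which is why the UI text below warns that the
# Wav2Vec2 models can produce non-words.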
def whrecc(audio_file, wmodel, wprocessor):
    """Whole-file recognition with the Icelandic Whisper fine-tune."""
    wav = readwav(audio_file)
    input_features = wprocessor(wav, sampling_rate=16000, return_tensors="pt").input_features
    with torch.inference_mode():
        predicted_ids = wmodel.generate(input_features)
    # skip_special_tokens strips Whisper's task/language tokens; the checkpoint
    # is Icelandic-only, so the language does not need to be forced here.
    dec = wprocessor.batch_decode(predicted_ids, skip_special_tokens=True)
    return dec[0]
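# If generation ever needed to be pinned to Icelandic explicitly, recent
# transformers versions accept language/task arguments (an assumption,
# depending on the installed version):
#   predicted_ids = wmodel.generate(input_features, language="is", task="transcribe")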
def recis(audio_file):
    #single_output = recc(audio_file, model_is, processor_is)
    chunk_output = pipe_is(audio_file, chunk_length_s=4)['text']
    #return (single_output, chunk_output)
    return chunk_output

def recfo(audio_file):
    #single_output = recc(audio_file, model_fo, processor_fo)
    chunk_output = pipe_fo(audio_file, chunk_length_s=4)['text']
    #return (single_output, chunk_output)
    return chunk_output

def recwhis(audio_file):
    return whrecc(audio_file, whm_is, whp_is)
def pick_asrc(au_src):
    # Limit the Audio component to the chosen input source and clear any loaded audio.
    return gr.update(sources=[au_src], value=None)
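# gr.update() returns a property patch for the target component; it is wired
# to asrc.change() below so switching the radio button swaps the Audio widget
# between file upload and microphone input.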
bl = gr.Blocks()
with bl:
    gr.Markdown(
        """
# Speech recognition
### Users logged in to a Hugging Face account can use each model's normal hosted inference API instead.
## * * * * * * * *
Upload a file for recognition with
https://huggingface.co/language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h
or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h
- The Wav2Vec2 models have no language model (yet), so they can generate non-words.
- Whisper can hallucinate.
- Send errors/bugs to [email protected]
"""
    )
    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            with gr.Row():
                with gr.Column():
                    asrc = gr.Radio(["upload", "microphone"], value="upload", label="Audio input")
                    audio_file = gr.Audio(sources=["upload", "microphone"], type="filepath")
                with gr.Column():
                    #whole_output = gr.Textbox(label="whole-file recognition")
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")  # with chunking
                    whisper_output = gr.Textbox(label="Whisper recognition")
            w2v_button = gr.Button("Recognise Icelandic with Wav2Vec2")
            whi_button = gr.Button("Recognise Icelandic with Whisper")
            #text_button.click(recis, inputs=audio_file, outputs=[whole_output,chunk_output])
            w2v_button.click(recis, inputs=audio_file, outputs=[chunk_output])
            whi_button.click(recwhis, inputs=audio_file, outputs=[whisper_output])
            asrc.change(pick_asrc, asrc, audio_file)
        with gr.TabItem("Faroese"):
            with gr.Row():
                with gr.Column():
                    asrc = gr.Radio(["upload", "microphone"], value="upload", label="Audio input")
                    audio_file = gr.Audio(sources=["upload", "microphone"], type="filepath")
                with gr.Column():
                    #whole_output = gr.Textbox(label="whole-file recognition")
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")  # with chunking
            text_button = gr.Button("Recognise Faroese")
            #text_button.click(recfo, inputs=audio_file, outputs=[whole_output,chunk_output])
            text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
            asrc.change(pick_asrc, asrc, audio_file)
bl.launch()
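# When running outside a hosted Space, bl.launch(share=True) would expose a
# temporary public URL (a standard Gradio option, not used here).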