File size: 2,107 Bytes
dd2b02c 56f1ec9 a64c958 56f1ec9 88ec444 56f1ec9 6b73084 72390b0 149c35c 72390b0 56f1ec9 43047d5 7d31b0d 04ae345 56f1ec9 d4d3d57 04ae345 56f1ec9 d4d3d57 56f1ec9 99453c8 509f052 a16e474 e462854 c793959 f9a5e8b d4d3d57 04ae345 509f052 f9a5e8b d4d3d57 df9a8d7 79be1a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import gradio as gr
import soundfile as sf
import numpy as np
import torch, torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset, Audio
import matplotlib.pyplot as plt
# Checkpoint identifier handed to both `from_pretrained` calls below.
MODEL_NAME = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"

# Fix the global torch RNG seed so repeated runs start from the same state.
torch.random.manual_seed(0)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# CTC acoustic model, moved onto the selected device.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
model = model.to(device)

# The processor is needed: recc() uses it both to turn the raw waveform into
# model inputs and to decode the predicted token ids back into text.
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)

# Dataset loading is currently disabled; kept for reference.
# ds = load_dataset("language-and-voice-lab/samromur_asr", split='train', streaming=True)
# ds = load_dataset("language-and-voice-lab/samromur_asr", split='test')
# ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
def show_ex(exnum):
    """Return the example number unchanged.

    Placeholder: the dataset-backed lookup below is commented out, so the
    function simply echoes its argument.

    Args:
        exnum: Example number (index) requested by the caller.

    Returns:
        The same ``exnum`` value that was passed in.
    """
    # Disabled dataset lookup, kept for reference:
    # return ds['audio_id'][exnum]
    return exnum
def recc(a_f):
    """Transcribe an audio file with the module-level wav2vec2 CTC model.

    The audio is read as float32, down-mixed to mono if it has two
    dimensions, resampled to 16 kHz when its sample rate differs, and then
    greedily decoded (arg-max over the logits followed by
    ``processor.batch_decode``).

    Args:
        a_f: Path to the audio file (as delivered by ``gr.Audio`` with
            ``type="filepath"``), or ``None`` when no audio was provided.

    Returns:
        The decoded transcription as a single string; an empty string when
        ``a_f`` is ``None``.
    """
    # Gradio passes None when the button is clicked without any audio.
    if a_f is None:
        return ""

    target_sr = 16000

    wav, sr = sf.read(a_f, dtype=np.float32)

    # 2-D output means several channels; average over axis 1 to get mono.
    if wav.ndim == 2:
        wav = wav.mean(1)

    # The original called an un-imported `signal.resample` here, which raised
    # NameError for every non-16 kHz input; use the already-imported
    # torchaudio instead.
    if sr != target_sr:
        wav = torchaudio.functional.resample(
            torch.from_numpy(wav), sr, target_sr
        ).numpy()

    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=target_sr).input_values[0]
        # Add the batch dimension and place the input on the model's device.
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        xcp = processor.batch_decode(pred_ids)

    # batch_decode returns one string per batch item; the batch has exactly
    # one item, so hand the text box a plain string rather than a list.
    return xcp[0]
# Demo page: an audio input, a button, and a text box for the transcription.
# Components are created in display order inside the Blocks context.
with gr.Blocks() as bl:
    audio_file = gr.Audio(type="filepath")
    text_button = gr.Button("Recognise")
    text_output = gr.Textbox()

    # Clicking the button runs recc() on the audio path and shows its result.
    text_button.click(fn=recc, inputs=audio_file, outputs=text_output)

# Start the web app.
bl.launch()
#https://mercury-docs.readthedocs.io/en/latest/deploy/hugging-face-spaces/
#https://huggingface.co/spaces/pplonski/deploy-mercury
#https://discuss.huggingface.co/t/deploy-interactive-jupyter-notebook-on-spaces-with-mercury/17000
#https://huggingface.co/docs/transformers/notebooks
|