|
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import torch, torchaudio
from datasets import load_dataset, Audio
from scipy import signal
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
|
|
|
|
|
# Hub identifier of the checkpoint that both the model and the processor load.
MODEL_NAME = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"

# Fix torch's RNG seed so repeated launches behave the same.
torch.random.manual_seed(0)

# The device must be resolved BEFORE the model is built: the model is moved
# onto it right below, and referencing it earlier raises NameError.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME).to(device)
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def show_ex(exnum):
    """Return *exnum* unchanged."""
    return exnum
|
|
|
|
|
def recc(a_f):
    """Transcribe the audio file at path *a_f* and return the recognized text.

    The file is read as float32, down-mixed to mono when it has more than
    one channel, resampled to 16 kHz when its sample rate differs, and then
    run through the module-level ``processor`` and CTC ``model``. The
    greedy (argmax) decoding of the logits is returned.

    Args:
        a_f: Path to an audio file readable by ``soundfile``, or ``None``.

    Returns:
        The decoded transcription as a string. An empty string is returned
        when no file is given or the file contains no samples.
    """
    if a_f is None:
        # Nothing to transcribe (e.g. the button was pressed with no audio).
        return ""

    wav, sr = sf.read(a_f, dtype=np.float32)

    # soundfile returns (frames, channels) for multi-channel audio;
    # average the channels to get a mono signal.
    if wav.ndim == 2:
        wav = wav.mean(1)

    if wav.shape[0] == 0:
        return ""

    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        # Keep float32 after resampling so the processor gets a uniform dtype.
        wav = signal.resample(wav, wlen).astype(np.float32)

    with torch.inference_mode():
        # The processor takes the raw CPU waveform; only the tensor it
        # produces is moved to wherever the model lives.
        inputs = processor(wav, sampling_rate=16000, return_tensors="pt")
        logits = model(inputs.input_values.to(model.device)).logits
        pred_ids = torch.argmax(logits, dim=-1)

    return processor.batch_decode(pred_ids)[0]
|
|
|
|
|
# Build the demo page. Components are created in the same order as before,
# since creation order inside the Blocks context determines the layout.
with gr.Blocks() as bl:
    # NOTE: text_input is rendered but not wired to any event in this section.
    text_input = gr.Textbox()
    text_output = gr.Textbox()
    text_button = gr.Button("Run")

    # The audio widget hands the handler a path on disk rather than raw samples.
    audio_file = gr.Audio(type="filepath")

    # Pressing "Run" feeds the audio path to recc and shows its result.
    text_button.click(fn=recc, inputs=audio_file, outputs=text_output)

bl.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|