import gradio as gr
import soundfile as sf
from scipy import signal
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
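
# CTC acoustic models from the Hugging Face Hub: Icelandic (trained on roughly
# 1,000 hours) and Faroese (roughly 100 hours), as the checkpoint names indicate.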
MODEL_IS="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)
pipe_is = pipeline(model=MODEL_IS)
pipe_fo = pipeline(model=MODEL_FO)
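
# Two decoding paths are prepared per language: the raw Wav2Vec2ForCTC model and
# its processor for a single pass over the whole file, and a transformers ASR
# pipeline, which can split long recordings into chunks (see chunk_length_s below).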
def readwav(a_f):
    # Load audio and normalise it to what the models expect: 16 kHz mono float32.
    wav, sr = sf.read(a_f, dtype=np.float32)
    if len(wav.shape) == 2:
        wav = wav.mean(1)  # downmix stereo to mono
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)  # resample to 16 kHz
    return wav
def recc(audio_file, model, processor):
    # Whole-file recognition: one forward pass over the entire waveform,
    # decoded greedily (argmax over the CTC logits, no language model).
    wav = readwav(audio_file)
    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        xcp = processor.batch_decode(pred_ids)
    return xcp[0]
def recis(audio_file):
    # Icelandic: whole-file transcript plus a chunked transcript for long audio.
    single_output = recc(audio_file, model_is, processor_is)
    chunk_output = pipe_is(audio_file, chunk_length_s=4)['text']
    return (single_output, chunk_output)

def recfo(audio_file):
    # Faroese: the same two decoding paths as recis.
    single_output = recc(audio_file, model_fo, processor_fo)
    chunk_output = pipe_fo(audio_file, chunk_length_s=4)['text']
    return (single_output, chunk_output)
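
# Each helper returns a (whole-file, chunked) pair, wired below to the two
# output Textboxes of the corresponding tab.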
bl = gr.Blocks()
with bl:
    gr.Markdown(
        """
        # W2V2 speech recognition
        Upload a file for recognition with
        https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h
        or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h
        - The Hugging Face 'Hosted inference API' on the model pages does not currently work, but this Space does.
        - There is no language model (yet), so the output can contain non-words.
        - Send errors/bugs to [email protected]
        """
    )
    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            with gr.Row():
                audio_file = gr.Audio(type="filepath")
                with gr.Column():
                    whole_output = gr.Textbox(label="whole-file recognition")
                    chunk_output = gr.Textbox(label="recognition with chunking")
            text_button = gr.Button("Recognise Icelandic")
            text_button.click(recis, inputs=audio_file, outputs=[whole_output, chunk_output])
        with gr.TabItem("Faroese"):
            with gr.Row():
                audio_file = gr.Audio(type="filepath")
                with gr.Column():
                    whole_output = gr.Textbox(label="whole-file recognition")
                    chunk_output = gr.Textbox(label="recognition with chunking")
            text_button = gr.Button("Recognise Faroese")
            text_button.click(recfo, inputs=audio_file, outputs=[whole_output, chunk_output])

bl.launch()
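# Note: on a Space, bl.launch() is all that is needed, as Hugging Face handles
# hosting; when running elsewhere, Gradio's launch(share=True) can create a
# temporary public URL instead.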