import gradio as gr
import soundfile as sf
from scipy import signal
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
from transformers import WhisperForConditionalGeneration, WhisperProcessor

MODEL_IS = "language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
MODEL_FO = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
MODEL_WHIS = "language-and-voice-lab/whisper-large-icelandic-62640-steps-967h"

torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wav2Vec2 CTC models and processors, used for whole-file recognition
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)

# ASR pipelines, used for chunked recognition of longer recordings
pipe_is = pipeline(model=MODEL_IS)
pipe_fo = pipeline(model=MODEL_FO)


# Fine-tuned Icelandic Whisper model (seq2seq)
whp_is = WhisperProcessor.from_pretrained(MODEL_WHIS)
whm_is = WhisperForConditionalGeneration.from_pretrained(MODEL_WHIS).to(device)



def readwav(a_f):
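    """Read audio as float32, downmix stereo to mono, resample to 16 kHz."""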
    wav, sr = sf.read(a_f, dtype=np.float32)
    if len(wav.shape) == 2:
        wav = wav.mean(1)
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav

def recc(audio_file,model,processor):
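    """Whole-file recognition with greedy (argmax) CTC decoding; no language model."""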
    wav = readwav(audio_file)
    with torch.inference_mode():
        input_values = processor(wav,sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        xcp = processor.batch_decode(pred_ids)
        return xcp[0]
        
def whrecc(audio_file,wmodel,wprocessor):
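    """Whole-file recognition with the Icelandic Whisper model."""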
    wav = readwav(audio_file)
    input_features = wprocessor(wav, sampling_rate=16000, return_tensors="pt").input_features
    input_features = input_features.to(wmodel.device)
    predicted_ids = wmodel.generate(input_features)
    dec = wprocessor.batch_decode(predicted_ids, skip_special_tokens=True)
    xcp = dec[0]
    return xcp

    
def recis(audio_file):
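    """Icelandic Wav2Vec2 recognition, chunked into 4-second windows."""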
    #single_output = recc(audio_file,model_is,processor_is)
    chunk_output = pipe_is(audio_file, chunk_length_s=4)['text']
    #return (single_output, chunk_output)
    return chunk_output

def recfo(audio_file):
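    """Faroese Wav2Vec2 recognition, chunked into 4-second windows."""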
    #single_output = recc(audio_file,model_fo,processor_fo)
    chunk_output = pipe_fo(audio_file, chunk_length_s=4)['text']
    #return (single_output, chunk_output)
    return chunk_output


def recwhis(audio_file):
    wh_output = whrecc(audio_file,whm_is,whp_is)
    return wh_output

def pick_asrc(au_src):
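    """Switch the Audio component between upload and microphone input, clearing any loaded audio."""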
    return gr.update(sources=[au_src], value=None)

bl = gr.Blocks()
with bl:

    gr.Markdown(
        """
    # Speech recognition

    ### Users logged in to a Hugging Face account can use each model's hosted inference API instead.
    ## * * * * * * * *
    
    Upload a file, or record from the microphone, for recognition with
    https://huggingface.co/language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h
    or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h  

    - Wav2Vec2 models have no language model (yet), so they can generate non-words.
    - Whisper can hallucinate.
    - Send errors/bugs to [email protected]
    """
    )

    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            with gr.Row():
                with gr.Column():
                    asrc = gr.Radio(["upload", "microphone"],value="upload",label="Audio input")
                    audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
                with gr.Column():
                    #whole_output = gr.Textbox(label="whole-file recognition")
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")  # with chunking
                    whisper_output = gr.Textbox(label="Whisper recognition")
            w2v_button = gr.Button("Recognise Icelandic with Wav2Vec2")
            whi_button = gr.Button("Recognise Icelandic with Whisper")
            #text_button.click(recis, inputs=audio_file, outputs=[whole_output,chunk_output])
            w2v_button.click(recis, inputs=audio_file, outputs=[chunk_output])
            whi_button.click(recwhis, inputs=audio_file, outputs=[whisper_output])

            asrc.change(pick_asrc,asrc,audio_file)


        with gr.TabItem("Faroese"):
            with gr.Row():
                with gr.Column():
                    asrc = gr.Radio(["upload", "microphone"],value="upload",label="Audio input")
                    audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
                with gr.Column():
                    #whole_output = gr.Textbox(label="whole-file recognition")
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")  # with chunking
            text_button = gr.Button("Recognise Faroese")
            #text_button.click(recfo, inputs=audio_file, outputs=[whole_output,chunk_output])
            text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
            
            asrc.change(pick_asrc,asrc,audio_file)


bl.launch()