Spaces:

clr
/

w2v2asr

Sleeping

App Files Community

clr committed on May 23, 2024

Commit

3f1d354

verified ·

1 Parent(s): 5af0b29

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -8

app.py CHANGED Viewed

@@ -4,9 +4,11 @@ from scipy import signal
 import numpy as np
 import torch, torchaudio
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
 MODEL_IS="language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
 MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
 torch.random.manual_seed(0)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -20,6 +22,10 @@ pipe_is = pipeline(model=MODEL_IS)
 pipe_fo = pipeline(model=MODEL_FO)
 def readwav(a_f):
     wav, sr = sf.read(a_f, dtype=np.float32)
@@ -39,17 +45,32 @@ def recc(audio_file,model,processor):
         pred_ids = torch.argmax(logits, dim=-1)
         xcp = processor.batch_decode(pred_ids)
         return xcp[0]
 def recis(audio_file):
-    single_output = recc(audio_file,model_is,processor_is)
     chunk_output = pipe_is(audio_file, chunk_length_s=4)['text']
-    return (single_output, chunk_output)
 def recfo(audio_file):
-    single_output = recc(audio_file,model_fo,processor_fo)
     chunk_output = pipe_fo(audio_file, chunk_length_s=4)['text']
-    return (single_output, chunk_output)
 def pick_asrc(au_src):
     return gr.update(source=au_src)
@@ -77,8 +98,9 @@ with bl:
     with gr.Tabs():
         with gr.TabItem("Icelandic"):
             with gr.Row():
-                asrc = gr.Radio(["upload", "microphone"],value="upload",label="Audio input")
-                audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
                 with gr.Column():
                     #whole_output = gr.Textbox(label="whole-file recognition")
                     chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
@@ -87,20 +109,25 @@ with bl:
             whi_button = gr.Button("Recognise Icelandic with Whisper")
             #text_button.click(recis, inputs=audio_file, outputs=[whole_output,chunk_output])
             w2v_button.click(recis, inputs=audio_file, outputs=[chunk_output])
-            #whi_button.click(recwhis, inputs=audio_file, outputs=[whisper_output])
             asrc.change(pick_asrc,asrc,audio_file)
         with gr.TabItem("Faroese"):
             with gr.Row():
-                audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
                 with gr.Column():
                     #whole_output = gr.Textbox(label="whole-file recognition")
                     chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
             text_button = gr.Button("Recognise Faroese")
             #text_button.click(recfo, inputs=audio_file, outputs=[whole_output,chunk_output])
             text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
 bl.launch()

 import numpy as np
 import torch, torchaudio
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
+from faster_whisper import WhisperModel
 MODEL_IS="language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
 MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
+MODEL_WHIS= "language-and-voice-lab/whisper-large-icelandic-62640-steps-967h"
 torch.random.manual_seed(0)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 pipe_fo = pipeline(model=MODEL_FO)
+whp_is = WhisperProcessor.from_pretrained(MODEL_WHIS)
+whm_is = WhisperForConditionalGeneration.from_pretrained(MODEL_WHIS)
 def readwav(a_f):
     wav, sr = sf.read(a_f, dtype=np.float32)
         pred_ids = torch.argmax(logits, dim=-1)
         xcp = processor.batch_decode(pred_ids)
         return xcp[0]
+def whrecc(audio_file,wmodel,wprocessor):
+    wav = readwav(audio_file)
+    input_features = wprocessor(wav, sampling_rate=16000, return_tensors="pt").input_features
+    predicted_ids = wmodel.generate(input_features)
+    dec = wprocessor.batch_decode(predicted_ids, skip_special_tokens=True,language_id='is')
+    xcp = dec[0]
+    return xcp
 def recis(audio_file):
+    #single_output = recc(audio_file,model_is,processor_is)
     chunk_output = pipe_is(audio_file, chunk_length_s=4)['text']
+    #return (single_output, chunk_output)
+    return chunk_output
 def recfo(audio_file):
+    #single_output = recc(audio_file,model_fo,processor_fo)
     chunk_output = pipe_fo(audio_file, chunk_length_s=4)['text']
+    #return (single_output, chunk_output)
+    return chunk_output
+def recwhis(audio_file):
+    wh_output = whrecc(audio_file,whm_is,whp_is)
+    return(wh_output)
 def pick_asrc(au_src):
     return gr.update(source=au_src)
     with gr.Tabs():
         with gr.TabItem("Icelandic"):
             with gr.Row():
+                with gr.Column():
+                    asrc = gr.Radio(["upload", "microphone"],value="upload",label="Audio input")
+                    audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
                 with gr.Column():
                     #whole_output = gr.Textbox(label="whole-file recognition")
                     chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
             whi_button = gr.Button("Recognise Icelandic with Whisper")
             #text_button.click(recis, inputs=audio_file, outputs=[whole_output,chunk_output])
             w2v_button.click(recis, inputs=audio_file, outputs=[chunk_output])
+            whi_button.click(recwhis, inputs=audio_file, outputs=[whisper_output])
             asrc.change(pick_asrc,asrc,audio_file)
         with gr.TabItem("Faroese"):
             with gr.Row():
+                with gr.Column():
+                    asrc = gr.Radio(["upload", "microphone"],value="upload",label="Audio input")
+                    audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
                 with gr.Column():
                     #whole_output = gr.Textbox(label="whole-file recognition")
                     chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
             text_button = gr.Button("Recognise Faroese")
             #text_button.click(recfo, inputs=audio_file, outputs=[whole_output,chunk_output])
             text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
+            asrc.change(pick_asrc,asrc,audio_file)
 bl.launch()