Update app.py
Browse files
app.py
CHANGED
@@ -4,9 +4,11 @@ from scipy import signal
|
|
4 |
import numpy as np
|
5 |
import torch, torchaudio
|
6 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
|
|
|
7 |
|
8 |
MODEL_IS="language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
|
9 |
MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
|
|
|
10 |
|
11 |
torch.random.manual_seed(0)
|
12 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
@@ -20,6 +22,10 @@ pipe_is = pipeline(model=MODEL_IS)
|
|
20 |
pipe_fo = pipeline(model=MODEL_FO)
|
21 |
|
22 |
|
|
|
|
|
|
|
|
|
23 |
|
24 |
def readwav(a_f):
|
25 |
wav, sr = sf.read(a_f, dtype=np.float32)
|
@@ -39,17 +45,32 @@ def recc(audio_file,model,processor):
|
|
39 |
pred_ids = torch.argmax(logits, dim=-1)
|
40 |
xcp = processor.batch_decode(pred_ids)
|
41 |
return xcp[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
|
44 |
def recis(audio_file):
    """Transcribe ``audio_file`` with the Icelandic Wav2Vec2 model in two ways.

    Returns a 2-tuple: the whole-file result from ``recc`` (using
    ``model_is`` / ``processor_is``) followed by the ``'text'`` entry of the
    ``pipe_is`` pipeline run with 4-second chunking.
    """
    whole_file_text = recc(audio_file, model_is, processor_is)
    chunked_result = pipe_is(audio_file, chunk_length_s=4)
    return whole_file_text, chunked_result['text']
|
|
|
48 |
|
49 |
def recfo(audio_file):
    """Transcribe ``audio_file`` with the Faroese Wav2Vec2 model in two ways.

    Returns a 2-tuple: the whole-file result from ``recc`` (using
    ``model_fo`` / ``processor_fo``) followed by the ``'text'`` entry of the
    ``pipe_fo`` pipeline run with 4-second chunking.
    """
    whole_file_text = recc(audio_file, model_fo, processor_fo)
    chunked_result = pipe_fo(audio_file, chunk_length_s=4)
    return whole_file_text, chunked_result['text']
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
def pick_asrc(au_src):
    """Return a Gradio update whose ``source`` property is set to ``au_src``."""
    source_update = gr.update(source=au_src)
    return source_update
|
@@ -77,8 +98,9 @@ with bl:
|
|
77 |
with gr.Tabs():
|
78 |
with gr.TabItem("Icelandic"):
|
79 |
with gr.Row():
|
80 |
-
|
81 |
-
|
|
|
82 |
with gr.Column():
|
83 |
#whole_output = gr.Textbox(label="whole-file recognition")
|
84 |
chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
|
@@ -87,20 +109,25 @@ with bl:
|
|
87 |
whi_button = gr.Button("Recognise Icelandic with Whisper")
|
88 |
#text_button.click(recis, inputs=audio_file, outputs=[whole_output,chunk_output])
|
89 |
w2v_button.click(recis, inputs=audio_file, outputs=[chunk_output])
|
90 |
-
|
91 |
|
92 |
asrc.change(pick_asrc,asrc,audio_file)
|
93 |
|
94 |
|
95 |
with gr.TabItem("Faroese"):
|
96 |
with gr.Row():
|
97 |
-
|
|
|
|
|
98 |
with gr.Column():
|
99 |
#whole_output = gr.Textbox(label="whole-file recognition")
|
100 |
chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
|
101 |
text_button = gr.Button("Recognise Faroese")
|
102 |
#text_button.click(recfo, inputs=audio_file, outputs=[whole_output,chunk_output])
|
103 |
text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
|
|
|
|
|
|
|
104 |
|
105 |
bl.launch()
|
106 |
|
|
|
4 |
import numpy as np
|
5 |
import torch, torchaudio
|
6 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
|
7 |
+
from faster_whisper import WhisperModel
|
8 |
|
9 |
MODEL_IS="language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
|
10 |
MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
|
11 |
+
MODEL_WHIS= "language-and-voice-lab/whisper-large-icelandic-62640-steps-967h"
|
12 |
|
13 |
torch.random.manual_seed(0)
|
14 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
22 |
pipe_fo = pipeline(model=MODEL_FO)
|
23 |
|
24 |
|
25 |
+
# The transformers import near the top of the file only brings in
# Wav2Vec2ForCTC, Wav2Vec2Processor and pipeline, so the two Whisper classes
# used below are imported here; without this the module fails with NameError
# as soon as it is loaded.
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Processor and model for the Icelandic Whisper checkpoint named by MODEL_WHIS.
whp_is = WhisperProcessor.from_pretrained(MODEL_WHIS)
whm_is = WhisperForConditionalGeneration.from_pretrained(MODEL_WHIS)
|
27 |
+
|
28 |
+
|
29 |
|
30 |
def readwav(a_f):
|
31 |
wav, sr = sf.read(a_f, dtype=np.float32)
|
|
|
45 |
pred_ids = torch.argmax(logits, dim=-1)
|
46 |
xcp = processor.batch_decode(pred_ids)
|
47 |
return xcp[0]
|
48 |
+
|
49 |
+
def whrecc(audio_file, wmodel, wprocessor):
    """Transcribe one audio file with a Whisper model/processor pair.

    The audio is loaded through ``readwav``, turned into model input features
    by ``wprocessor`` (the sampling rate is declared as 16000), passed to
    ``wmodel.generate`` and decoded back to text.

    Returns the first decoded string.
    """
    samples = readwav(audio_file)
    encoded = wprocessor(samples, sampling_rate=16000, return_tensors="pt")
    token_ids = wmodel.generate(encoded.input_features)
    # NOTE(review): ``language_id`` is passed through exactly as before; it is
    # not an obviously documented batch_decode argument -- confirm it has an
    # effect, or set the language at generation time instead.
    transcripts = wprocessor.batch_decode(
        token_ids, skip_special_tokens=True, language_id='is'
    )
    return transcripts[0]
|
56 |
|
57 |
|
58 |
def recis(audio_file):
    """Return the Icelandic Wav2Vec2 transcript of ``audio_file``.

    Runs the ``pipe_is`` pipeline with 4-second chunking and returns the
    ``'text'`` entry of its result. Whole-file recognition via ``recc`` is
    not performed here.
    """
    result = pipe_is(audio_file, chunk_length_s=4)
    return result['text']
|
63 |
|
64 |
def recfo(audio_file):
    """Return the Faroese Wav2Vec2 transcript of ``audio_file``.

    Runs the ``pipe_fo`` pipeline with 4-second chunking and returns the
    ``'text'`` entry of its result. Whole-file recognition via ``recc`` is
    not performed here.
    """
    result = pipe_fo(audio_file, chunk_length_s=4)
    return result['text']
|
69 |
+
|
70 |
+
|
71 |
+
def recwhis(audio_file):
    """Return the Icelandic Whisper transcript of ``audio_file``.

    Delegates to ``whrecc`` with the module-level ``whm_is`` model and
    ``whp_is`` processor.
    """
    return whrecc(audio_file, wmodel=whm_is, wprocessor=whp_is)
|
74 |
|
75 |
def pick_asrc(au_src):
    """Return a Gradio update restricting the audio input to ``au_src``.

    ``au_src`` is the value chosen in the "Audio input" radio ("upload" or
    "microphone"). The audio components in this file are constructed with the
    plural ``sources=[...]`` list, so the update sets that same property
    instead of the singular ``source`` used previously.
    """
    # NOTE(review): assumes the installed Gradio accepts ``sources`` on
    # gr.Audio updates, as the gr.Audio(sources=...) constructor calls imply.
    return gr.update(sources=[au_src])
|
|
|
98 |
with gr.Tabs():
|
99 |
with gr.TabItem("Icelandic"):
|
100 |
with gr.Row():
|
101 |
+
with gr.Column():
|
102 |
+
asrc = gr.Radio(["upload", "microphone"],value="upload",label="Audio input")
|
103 |
+
audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
|
104 |
with gr.Column():
|
105 |
#whole_output = gr.Textbox(label="whole-file recognition")
|
106 |
chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
|
|
|
109 |
whi_button = gr.Button("Recognise Icelandic with Whisper")
|
110 |
#text_button.click(recis, inputs=audio_file, outputs=[whole_output,chunk_output])
|
111 |
w2v_button.click(recis, inputs=audio_file, outputs=[chunk_output])
|
112 |
+
whi_button.click(recwhis, inputs=audio_file, outputs=[whisper_output])
|
113 |
|
114 |
asrc.change(pick_asrc,asrc,audio_file)
|
115 |
|
116 |
|
117 |
with gr.TabItem("Faroese"):
|
118 |
with gr.Row():
|
119 |
+
with gr.Column():
|
120 |
+
asrc = gr.Radio(["upload", "microphone"],value="upload",label="Audio input")
|
121 |
+
audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
|
122 |
with gr.Column():
|
123 |
#whole_output = gr.Textbox(label="whole-file recognition")
|
124 |
chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
|
125 |
text_button = gr.Button("Recognise Faroese")
|
126 |
#text_button.click(recfo, inputs=audio_file, outputs=[whole_output,chunk_output])
|
127 |
text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
|
128 |
+
|
129 |
+
asrc.change(pick_asrc,asrc,audio_file)
|
130 |
+
|
131 |
|
132 |
bl.launch()
|
133 |
|