clr committed on
Commit
3f1d354
·
verified ·
1 Parent(s): 5af0b29

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -8
app.py CHANGED
@@ -4,9 +4,11 @@ from scipy import signal
4
  import numpy as np
5
  import torch, torchaudio
6
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
 
7
 
8
  MODEL_IS="language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
9
  MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
 
10
 
11
  torch.random.manual_seed(0)
12
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -20,6 +22,10 @@ pipe_is = pipeline(model=MODEL_IS)
20
  pipe_fo = pipeline(model=MODEL_FO)
21
 
22
 
 
 
 
 
23
 
24
  def readwav(a_f):
25
  wav, sr = sf.read(a_f, dtype=np.float32)
@@ -39,17 +45,32 @@ def recc(audio_file,model,processor):
39
  pred_ids = torch.argmax(logits, dim=-1)
40
  xcp = processor.batch_decode(pred_ids)
41
  return xcp[0]
 
 
 
 
 
 
 
 
42
 
43
 
44
  def recis(audio_file):
45
- single_output = recc(audio_file,model_is,processor_is)
46
  chunk_output = pipe_is(audio_file, chunk_length_s=4)['text']
47
- return (single_output, chunk_output)
 
48
 
49
  def recfo(audio_file):
50
- single_output = recc(audio_file,model_fo,processor_fo)
51
  chunk_output = pipe_fo(audio_file, chunk_length_s=4)['text']
52
- return (single_output, chunk_output)
 
 
 
 
 
 
53
 
54
  def pick_asrc(au_src):
55
  return gr.update(source=au_src)
@@ -77,8 +98,9 @@ with bl:
77
  with gr.Tabs():
78
  with gr.TabItem("Icelandic"):
79
  with gr.Row():
80
- asrc = gr.Radio(["upload", "microphone"],value="upload",label="Audio input")
81
- audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
 
82
  with gr.Column():
83
  #whole_output = gr.Textbox(label="whole-file recognition")
84
  chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
@@ -87,20 +109,25 @@ with bl:
87
  whi_button = gr.Button("Recognise Icelandic with Whisper")
88
  #text_button.click(recis, inputs=audio_file, outputs=[whole_output,chunk_output])
89
  w2v_button.click(recis, inputs=audio_file, outputs=[chunk_output])
90
- #whi_button.click(recwhis, inputs=audio_file, outputs=[whisper_output])
91
 
92
  asrc.change(pick_asrc,asrc,audio_file)
93
 
94
 
95
  with gr.TabItem("Faroese"):
96
  with gr.Row():
97
- audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
 
 
98
  with gr.Column():
99
  #whole_output = gr.Textbox(label="whole-file recognition")
100
  chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
101
  text_button = gr.Button("Recognise Faroese")
102
  #text_button.click(recfo, inputs=audio_file, outputs=[whole_output,chunk_output])
103
  text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
 
 
 
104
 
105
  bl.launch()
106
 
 
4
  import numpy as np
5
  import torch, torchaudio
6
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
7
+ from faster_whisper import WhisperModel
8
 
9
  MODEL_IS="language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
10
  MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
11
+ MODEL_WHIS= "language-and-voice-lab/whisper-large-icelandic-62640-steps-967h"
12
 
13
  torch.random.manual_seed(0)
14
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
22
  pipe_fo = pipeline(model=MODEL_FO)
23
 
24
 
25
+ whp_is = WhisperProcessor.from_pretrained(MODEL_WHIS)
26
+ whm_is = WhisperForConditionalGeneration.from_pretrained(MODEL_WHIS)
27
+
28
+
29
 
30
  def readwav(a_f):
31
  wav, sr = sf.read(a_f, dtype=np.float32)
 
45
  pred_ids = torch.argmax(logits, dim=-1)
46
  xcp = processor.batch_decode(pred_ids)
47
  return xcp[0]
48
+
49
+ def whrecc(audio_file,wmodel,wprocessor):
50
+ wav = readwav(audio_file)
51
+ input_features = wprocessor(wav, sampling_rate=16000, return_tensors="pt").input_features
52
+ predicted_ids = wmodel.generate(input_features)
53
+ dec = wprocessor.batch_decode(predicted_ids, skip_special_tokens=True,language_id='is')
54
+ xcp = dec[0]
55
+ return xcp
56
 
57
 
58
  def recis(audio_file):
59
+ #single_output = recc(audio_file,model_is,processor_is)
60
  chunk_output = pipe_is(audio_file, chunk_length_s=4)['text']
61
+ #return (single_output, chunk_output)
62
+ return chunk_output
63
 
64
  def recfo(audio_file):
65
+ #single_output = recc(audio_file,model_fo,processor_fo)
66
  chunk_output = pipe_fo(audio_file, chunk_length_s=4)['text']
67
+ #return (single_output, chunk_output)
68
+ return chunk_output
69
+
70
+
71
+ def recwhis(audio_file):
72
+ wh_output = whrecc(audio_file,whm_is,whp_is)
73
+ return(wh_output)
74
 
75
  def pick_asrc(au_src):
76
  return gr.update(source=au_src)
 
98
  with gr.Tabs():
99
  with gr.TabItem("Icelandic"):
100
  with gr.Row():
101
+ with gr.Column():
102
+ asrc = gr.Radio(["upload", "microphone"],value="upload",label="Audio input")
103
+ audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
104
  with gr.Column():
105
  #whole_output = gr.Textbox(label="whole-file recognition")
106
  chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
 
109
  whi_button = gr.Button("Recognise Icelandic with Whisper")
110
  #text_button.click(recis, inputs=audio_file, outputs=[whole_output,chunk_output])
111
  w2v_button.click(recis, inputs=audio_file, outputs=[chunk_output])
112
+ whi_button.click(recwhis, inputs=audio_file, outputs=[whisper_output])
113
 
114
  asrc.change(pick_asrc,asrc,audio_file)
115
 
116
 
117
  with gr.TabItem("Faroese"):
118
  with gr.Row():
119
+ with gr.Column():
120
+ asrc = gr.Radio(["upload", "microphone"],value="upload",label="Audio input")
121
+ audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
122
  with gr.Column():
123
  #whole_output = gr.Textbox(label="whole-file recognition")
124
  chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
125
  text_button = gr.Button("Recognise Faroese")
126
  #text_button.click(recfo, inputs=audio_file, outputs=[whole_output,chunk_output])
127
  text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
128
+
129
+ asrc.change(pick_asrc,asrc,audio_file)
130
+
131
 
132
  bl.launch()
133