datnth1709 committed on
Commit fb1f641 · 1 Parent(s): 8d29562

record realtime

Browse files
Files changed (1)
  1. app.py +64 -9
app.py CHANGED
@@ -120,7 +120,6 @@ def speech2text_en(input_file):
     return transcription


-
"""Machine translation"""
vien_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT"
envi_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-en-vi_PhoMT"
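The two Helsinki-NLP checkpoints above back the translate_vi2en / translate_en2vi helpers used throughout the app. Their loading code sits outside this hunk; a minimal sketch of how such MarianMT checkpoints are typically wrapped with the Transformers pipeline API (the pipeline objects and return-value handling here are assumptions, not the Space's exact code):

from transformers import pipeline

# Checkpoints copied from the diff above; wrapping them in translation pipelines is an assumption.
vien_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT"
envi_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-en-vi_PhoMT"
vi2en = pipeline("translation", model=vien_model_checkpoint)
en2vi = pipeline("translation", model=envi_model_checkpoint)

def translate_vi2en(Vietnamese):
    # a translation pipeline returns a list of dicts with a "translation_text" field
    return vi2en(Vietnamese)[0]["translation_text"]

def translate_en2vi(English):
    return en2vi(English)[0]["translation_text"]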
@@ -140,13 +139,47 @@ def translate_en2vi(English):
def inference_vien(audio):
    vi_text = speech2text_vi(audio)
    en_text = translate_vi2en(vi_text)
-    return en_text
+    return vi_text, en_text

def inference_envi(audio):
    en_text = speech2text_en(audio)
    vi_text = translate_en2vi(en_text)
-    return vi_text
+    return en_text, vi_text

+def transcribe_vi(audio, state_vi="", state_en=""):
+    ds = speech_file_to_array_fn(audio.name)
+    # infer model
+    input_values = processor(
+        ds["speech"],
+        sampling_rate=ds["sampling_rate"],
+        return_tensors="pt"
+    ).input_values
+    # decode ctc output
+    logits = vi_model(input_values).logits[0]
+    pred_ids = torch.argmax(logits, dim=-1)
+    greedy_search_output = processor.decode(pred_ids)
+    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
+    state_vi += beam_search_output + " "
+    en_text = translate_vi2en(beam_search_output)
+    state_en += en_text + " "
+    return state_vi, state_en, state_vi, state_en
+
+def transcribe_en(audio, state_en="", state_vi=""):
+    speech = load_data(audio)
+    # Tokenize
+    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
+    # Take logits
+    logits = eng_model(input_values).logits
+    # Take argmax
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # Get the words from predicted word ids
+    transcription = eng_tokenizer.decode(predicted_ids[0])
+    # Output is all upper case
+    transcription = correct_casing(transcription.lower())
+    state_en += transcription + " "
+    vi_text = translate_en2vi(transcription)
+    state_vi += vi_text + " "
+    return state_en, state_vi, state_en, state_vi

"""Gradio demo"""
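Both new transcribe_* helpers follow the same wav2vec2 CTC recipe: featurize the waveform, run the acoustic model to get per-frame logits, then decode, greedily (argmax per frame) for English and with an n-gram LM beam search (beam_width=500) for Vietnamese. The models, tokenizers, and helpers such as speech_file_to_array_fn, load_data, and correct_casing are defined earlier in app.py and are not part of this diff; the sketch below reproduces just the greedy path with an illustrative public checkpoint:

import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Illustrative checkpoint; the Space loads its own Vietnamese/English models elsewhere in app.py.
checkpoint = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)

def greedy_transcribe(wav_path):
    # wav2vec2 models expect 16 kHz mono float audio
    speech, _ = librosa.load(wav_path, sr=16000, mono=True)
    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
    with torch.no_grad():
        logits = model(input_values).logits        # shape: (batch, frames, vocab)
    pred_ids = torch.argmax(logits, dim=-1)        # best token id per frame
    return processor.batch_decode(pred_ids)[0]     # CTC collapse: merge repeats, drop blanks

The Vietnamese branch instead hands the raw logits to ngram_lm_model.decode(..., beam_width=500), a language-model-rescored beam search that typically lowers word error rate compared with plain greedy decoding.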
 
 
@@ -177,14 +210,25 @@ with gr.Blocks() as demo:
        with gr.TabItem("Speech2text and Vi-En Translation"):
            with gr.Row():
                with gr.Column():
-                    vi_audio = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
+                    vi_audio = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=False)
                    translate_button_vien_2 = gr.Button(value="Translate To English")
                with gr.Column():
+                    speech2text_vi1 = gr.Textbox(label="Vietnamese Text")
                    english_out_2 = gr.Textbox(label="English Text")

-            translate_button_vien_2.click(lambda vi_voice: inference_vien(vi_voice), inputs=vi_audio, outputs=english_out_2)
+            translate_button_vien_2.click(lambda vi_voice: inference_vien(vi_voice), inputs=vi_audio, outputs=[speech2text_vi1, english_out_2])
            gr.Examples(examples=vi_example_voice,
                        inputs=[vi_audio])
+        with gr.TabItem("Vi-En Realtime Translation"):
+            with gr.Row():
+                with gr.Column():
+                    vi_audio = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
+                    translate_button_vien_2 = gr.Button(value="Translate To English")
+                with gr.Column():
+                    speech2text_vi2 = gr.Textbox(label="Vietnamese Text")
+                    english_out_3 = gr.Textbox(label="English Text")
+            vi_audio.change(transcribe_vi, [vi_audio, "state_vi", "state_en"], [speech2text_vi2, english_out_3, "state_vi", "state_en"])
+

    with gr.Tabs():
        with gr.TabItem("Translation: English to Vietnamese"):
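One thing to double-check in the realtime wiring above (and in the En-Vi hunk below): the strings "state_vi" / "state_en" are passed where event inputs and outputs go, which looks like the string-based state shortcut from older Gradio examples. Inside gr.Blocks, accumulated transcripts are normally carried by gr.State components instead. A minimal sketch of that pattern, reusing the transcribe_vi helper added in this commit and mirroring the component arguments from the diff (they may need adjusting for the installed Gradio version):

import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            vi_audio = gr.Audio(source="microphone", label="Input Vietnamese Audio",
                                type="file", streaming=True)
        with gr.Column():
            vi_box = gr.Textbox(label="Vietnamese Text")
            en_box = gr.Textbox(label="English Text")

    # Hidden state components carry the running transcripts between audio chunks.
    state_vi = gr.State("")
    state_en = gr.State("")

    # transcribe_vi (defined in the hunk above) returns
    # (state_vi, state_en, state_vi, state_en), matching these four outputs in order.
    vi_audio.change(transcribe_vi,
                    inputs=[vi_audio, state_vi, state_en],
                    outputs=[vi_box, en_box, state_vi, state_en])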
 
@@ -197,17 +241,28 @@ with gr.Blocks() as demo:
            translate_button_envi_1.click(lambda text: translate_en2vi(text), inputs=english_text, outputs=vietnamese_out_1)
            gr.Examples(examples=en_example_text,
                        inputs=[english_text])
+
        with gr.TabItem("Speech2text and En-Vi Translation"):
            with gr.Row():
                with gr.Column():
-                    en_audio = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
+                    en_audio_1 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=False)
                    translate_button_envi_2 = gr.Button(value="Translate To Vietnamese")
                with gr.Column():
+                    speech2text_en1 = gr.Textbox(label="English Text")
                    vietnamese_out_2 = gr.Textbox(label="Vietnamese Text")
-
-            translate_button_envi_2.click(lambda en_voice: inference_envi(en_voice), inputs=en_audio, outputs=vietnamese_out_2)
+            translate_button_envi_2.click(lambda en_voice: inference_envi(en_voice), inputs=en_audio_1, outputs=[speech2text_en1, vietnamese_out_2])
            gr.Examples(examples=en_example_voice,
-                        inputs=[en_audio])
+                        inputs=[en_audio_1])
+
+        with gr.TabItem("En-Vi Realtime Translation"):
+            with gr.Row():
+                with gr.Column():
+                    en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
+                    # translate_button_envi_2 = gr.Button(value="Translate To Vietnamese")
+                with gr.Column():
+                    speech2text_en2 = gr.Textbox(label="English Text")
+                    vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
+            en_audio_2.change(transcribe_en, [en_audio_2, "state_en", "state_vi"], [speech2text_en2, vietnamese_out_3, "state_en", "state_vi"])

if __name__ == "__main__":
    demo.launch()