Spaces: Runtime error
Commit fb1f641 · record realtime
Parent(s): 8d29562
app.py
CHANGED
@@ -120,7 +120,6 @@ def speech2text_en(input_file):
     return transcription
 
 
-
 """Machine translation"""
 vien_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT"
 envi_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-en-vi_PhoMT"
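These two checkpoints presumably back the translate_vi2en / translate_en2vi helpers that the next hunk builds on. For reference, a MarianMT fine-tune like these is commonly served through the transformers pipeline API; the sketch below shows that standard pattern only and is not necessarily how app.py loads them:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Checkpoint name taken from the diff above; loading code is illustrative.
vien_tokenizer = AutoTokenizer.from_pretrained("datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT")
vien_model = AutoModelForSeq2SeqLM.from_pretrained("datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT")
vi2en = pipeline("translation", model=vien_model, tokenizer=vien_tokenizer)
print(vi2en("xin chào thế giới")[0]["translation_text"])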
@@ -140,13 +139,47 @@ def translate_en2vi(English):
 def inference_vien(audio):
     vi_text = speech2text_vi(audio)
     en_text = translate_vi2en(vi_text)
-    return en_text
+    return vi_text, en_text
 
 def inference_envi(audio):
     en_text = speech2text_en(audio)
     vi_text = translate_en2vi(en_text)
-    return vi_text
+    return en_text, vi_text
 
+def transcribe_vi(audio, state_vi="", state_en=""):
+    ds = speech_file_to_array_fn(audio.name)
+    # infer model
+    input_values = processor(
+        ds["speech"],
+        sampling_rate=ds["sampling_rate"],
+        return_tensors="pt"
+    ).input_values
+    # decode ctc output
+    logits = vi_model(input_values).logits[0]
+    pred_ids = torch.argmax(logits, dim=-1)
+    greedy_search_output = processor.decode(pred_ids)
+    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
+    state_vi += beam_search_output + " "
+    en_text = translate_vi2en(beam_search_output)
+    state_en += en_text + " "
+    return state_vi, state_en, state_vi, state_en
+
+def transcribe_en(audio, state_en="", state_vi=""):
+    speech = load_data(audio)
+    # Tokenize
+    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
+    # Take logits
+    logits = eng_model(input_values).logits
+    # Take argmax
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # Get the words from predicted word ids
+    transcription = eng_tokenizer.decode(predicted_ids[0])
+    # Output is all upper case
+    transcription = correct_casing(transcription.lower())
+    state_en += transcription + " "
+    vi_text = translate_en2vi(transcription)
+    state_vi += vi_text + " "
+    return state_en, state_vi, state_en, state_vi
 
 """Gradio demo"""
 
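Both new transcribe_* helpers return four values: the first two fill the visible textboxes, and the same two strings are returned again so the caller can thread them back in as the accumulated state for the next audio chunk. A minimal sketch of that round trip, with invented chunk filenames (in the app the chunks come from the streaming microphone input):

# Hypothetical driver loop; transcribe_vi only needs an object with a .name attribute.
state_vi, state_en = "", ""
for path in ["chunk_000.wav", "chunk_001.wav"]:
    with open(path, "rb") as chunk:
        shown_vi, shown_en, state_vi, state_en = transcribe_vi(chunk, state_vi, state_en)
print(shown_en)  # running English translation of everything heard so far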
@@ -177,14 +210,25 @@ with gr.Blocks() as demo:
         with gr.TabItem("Speech2text and Vi-En Translation"):
             with gr.Row():
                 with gr.Column():
-                    vi_audio = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=
+                    vi_audio = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=False)
                     translate_button_vien_2 = gr.Button(value="Translate To English")
                 with gr.Column():
+                    speech2text_vi1 = gr.Textbox(label="Vietnamese Text")
                     english_out_2 = gr.Textbox(label="English Text")
 
-            translate_button_vien_2.click(lambda vi_voice: inference_vien(vi_voice), inputs=vi_audio, outputs=english_out_2)
+            translate_button_vien_2.click(lambda vi_voice: inference_vien(vi_voice), inputs=vi_audio, outputs=[speech2text_vi1, english_out_2])
             gr.Examples(examples=vi_example_voice,
                         inputs=[vi_audio])
+        with gr.TabItem("Vi-En Realtime Translation"):
+            with gr.Row():
+                with gr.Column():
+                    vi_audio = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
+                    translate_button_vien_2 = gr.Button(value="Translate To English")
+                with gr.Column():
+                    speech2text_vi2 = gr.Textbox(label="Vietnamese Text")
+                    english_out_3 = gr.Textbox(label="English Text")
+            vi_audio.change(transcribe_vi, [vi_audio, "state_vi", "state_en"], [speech2text_vi2, english_out_3, "state_vi", "state_en"])
+
 
     with gr.Tabs():
         with gr.TabItem("Translation: English to Vietnamese"):
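The realtime tab above passes the literal strings "state_vi" and "state_en" as event inputs and outputs. In Gradio Blocks the accumulating transcripts are typically carried in gr.State components instead; a minimal sketch of that wiring, with illustrative component names and the transcribe_vi defined in the previous hunk (an assumption about intent, not the code as committed):

with gr.TabItem("Vi-En Realtime Translation"):
    state_vi = gr.State("")   # accumulated Vietnamese transcript
    state_en = gr.State("")   # accumulated English translation
    with gr.Row():
        with gr.Column():
            vi_audio_rt = gr.Audio(source="microphone", label="Input Vietnamese Audio",
                                   type="file", streaming=True)
        with gr.Column():
            vi_text_rt = gr.Textbox(label="Vietnamese Text")
            en_text_rt = gr.Textbox(label="English Text")
    vi_audio_rt.change(transcribe_vi,
                       inputs=[vi_audio_rt, state_vi, state_en],
                       outputs=[vi_text_rt, en_text_rt, state_vi, state_en])

The En-Vi realtime tab added in the last hunk would mirror this wiring with transcribe_en, en_audio_2 and the English/Vietnamese textboxes swapped.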
@@ -197,17 +241,28 @@ with gr.Blocks() as demo:
             translate_button_envi_1.click(lambda text: translate_en2vi(text), inputs=english_text, outputs=vietnamese_out_1)
             gr.Examples(examples=en_example_text,
                         inputs=[english_text])
+
         with gr.TabItem("Speech2text and En-Vi Translation"):
             with gr.Row():
                 with gr.Column():
-
+                    en_audio_1 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=False)
                     translate_button_envi_2 = gr.Button(value="Translate To Vietnamese")
                 with gr.Column():
+                    speech2text_en1 = gr.Textbox(label="English Text")
                     vietnamese_out_2 = gr.Textbox(label="Vietnamese Text")
-
-            translate_button_envi_2.click(lambda en_voice: inference_envi(en_voice), inputs=en_audio, outputs=vietnamese_out_2)
+            translate_button_envi_2.click(lambda en_voice: inference_envi(en_voice), inputs=en_audio_1, outputs=[speech2text_en1, vietnamese_out_2])
             gr.Examples(examples=en_example_voice,
-                        inputs=[
+                        inputs=[en_audio_1])
+
+        with gr.TabItem("En-Vi Realtime Translation"):
+            with gr.Row():
+                with gr.Column():
+                    en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
+                    # translate_button_envi_2 = gr.Button(value="Translate To Vietnamese")
+                with gr.Column():
+                    speech2text_en2 = gr.Textbox(label="English Text")
+                    vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
+            en_audio_2.change(transcribe_en, [en_audio_2, "state_en", "state_vi"], [speech2text_en2, vietnamese_out_3, "state_en", "state_vi"])
 
 if __name__ == "__main__":
     demo.launch()