HKAB committed on
Commit
5ec340b
·
1 Parent(s): dd0d853
Files changed (3) hide show
  1. __pycache__/examples.cpython-310.pyc +0 -0
  2. app.py +6 -3
  3. examples.py +6 -5
__pycache__/examples.cpython-310.pyc CHANGED
Binary files a/__pycache__/examples.cpython-310.pyc and b/__pycache__/examples.cpython-310.pyc differ
 
app.py CHANGED
@@ -105,8 +105,9 @@ title = "# Streaming RNN-T with Whisper Encoder"
105
  description = """
106
  Visit <https://github.com/HKAB/rnnt-whisper-tutorial/> for more information.
107
 
108
- - This model run on CPU
109
- - This model might not work with your microphone since it was trained on a quite clean dataset. Try to speak loudly and clearly 😃
 
110
  """
111
 
112
  def onnx_online_inference(audio, ort_encoder_session, ort_decoder_session, ort_jointer_session, tokenizer):
@@ -242,7 +243,7 @@ def process(
242
  with demo:
243
  gr.Markdown(title)
244
  gr.Markdown(description)
245
- model_type = gr.Radio(["FP32", "INT8 (Quantized)"], label="Model type", value="FP32", info="INT8 model is faster but less accurate")
246
 
247
  with gr.Tabs():
248
  with gr.TabItem("Upload from disk"):
@@ -261,6 +262,7 @@ with demo:
261
  uploaded_file,
262
  model_type
263
  ],
 
264
  outputs=[uploaded_output, uploaded_html_info],
265
  fn=process_uploaded_file,
266
  label="Cherry-picked examples",
@@ -283,6 +285,7 @@ with demo:
283
  microphone,
284
  model_type
285
  ],
 
286
  outputs=[recorded_output, recorded_html_info],
287
  fn=process_microphone,
288
  label="Cherry-picked examples",
 
105
  description = """
106
  Visit <https://github.com/HKAB/rnnt-whisper-tutorial/> for more information.
107
 
108
+ - This model runs on CPU (Free tier) so the RTF of FP32 model is around 1.5.
109
+ - This model might not work with your microphone since it was trained on a quite clean dataset. Try to speak loudly and clearly 😃
110
+ - Even if you upload a full audio file, the model will process it in a streaming fashion.
111
  """
112
 
113
  def onnx_online_inference(audio, ort_encoder_session, ort_decoder_session, ort_jointer_session, tokenizer):
 
243
  with demo:
244
  gr.Markdown(title)
245
  gr.Markdown(description)
246
+ model_type = gr.Radio(["FP32", "INT8"], label="Model type", value="FP32", info="INT8 model is faster but less accurate")
247
 
248
  with gr.Tabs():
249
  with gr.TabItem("Upload from disk"):
 
262
  uploaded_file,
263
  model_type
264
  ],
265
+ cache_mode="lazy",
266
  outputs=[uploaded_output, uploaded_html_info],
267
  fn=process_uploaded_file,
268
  label="Cherry-picked examples",
 
285
  microphone,
286
  model_type
287
  ],
288
+ cache_mode="lazy",
289
  outputs=[recorded_output, recorded_html_info],
290
  fn=process_microphone,
291
  label="Cherry-picked examples",
examples.py CHANGED
@@ -1,17 +1,18 @@
1
  examples = [
2
  [
3
- "./test_wavs/Hue_short.wav"
 
4
  ],
5
  [
6
- "./test_wavs/12345_short.wav"
 
7
  ],
8
  [
9
  "./test_wavs/Trump_long.mp3",
 
10
  ],
11
  [
12
  "./test_wavs/Ucraina_moderate.mp3",
13
- ],
14
- [
15
- "./test_wavs/Duongsat_short.m4a",
16
  ]
17
  ]
 
1
  examples = [
2
  [
3
+ "./test_wavs/Hue_short.wav",
4
+ "FP32",
5
  ],
6
  [
7
+ "./test_wavs/12345_short.wav",
8
+ "FP32",
9
  ],
10
  [
11
  "./test_wavs/Trump_long.mp3",
12
+ "FP32",
13
  ],
14
  [
15
  "./test_wavs/Ucraina_moderate.mp3",
16
+ "FP32",
 
 
17
  ]
18
  ]