Realtime-whisper-large-v3-turbo

Running on Zero

App Files Files Community

KingNish committed on Oct 4, 2024

Commit

ea9f05f

verified ·

1 Parent(s): ceea111

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -3

app.py CHANGED Viewed

@@ -25,7 +25,7 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
 model.to(device)
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
-tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language="en")
 pipe = pipeline(
     task="automatic-speech-recognition",
@@ -55,6 +55,24 @@ def transcribe(inputs, previous_transcription):
         print(f"Error during Transcription: {e}")
         return previous_transcription, "Error"
 def clear():
     """Return an empty string; wired to the clear button to blank the output box."""
     blank = ""
     return blank
@@ -72,7 +90,7 @@ with gr.Blocks() as microphone:
         input_audio_microphone.stream(transcribe, [input_audio_microphone, output], [output, latency_textbox], time_limit=45, stream_every=2, concurrency_limit=None)
         clear_button.click(clear, outputs=[output])
-with gr.Blocks() as flie:
     with gr.Column():
         gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
         with gr.Row():
@@ -86,7 +104,20 @@ with gr.Blocks() as flie:
         submit_button.click(transcribe, [input_audio_microphone, output], [output, latency_textbox], concurrency_limit=None)
         clear_button.click(clear, outputs=[output])
 with gr.Blocks() as demo:
-    gr.TabbedInterface([microphone, flie], ["Microphone", "Audio file"])
 demo.launch()

 model.to(device)
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
+tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
 pipe = pipeline(
     task="automatic-speech-recognition",
         print(f"Error during Transcription: {e}")
         return previous_transcription, "Error"
+@spaces.GPU
+def translate_and_transcribe(inputs, previous_transcription):
+    start_time = time.time()
+    try:
+        filename = f"{uuid.uuid4().hex}.wav"
+        sample_rate, audio_data = inputs
+        scipy.io.wavfile.write(filename, sample_rate, audio_data)
+        translation = pipe(filename, language="<|es|>" , generate_kwargs={"task": "translate"} )["text"]
+        previous_transcription += translation
+        end_time = time.time()
+        latency = end_time - start_time
+        return previous_transcription, f"{latency:.2f}"
+    except Exception as e:
+        print(f"Error during Translation and Transcription: {e}")
+        return previous_transcription, "Error"
 def clear():
     """Return an empty string; wired to the clear button to blank the output box."""
     blank = ""
     return blank
         input_audio_microphone.stream(transcribe, [input_audio_microphone, output], [output, latency_textbox], time_limit=45, stream_every=2, concurrency_limit=None)
         clear_button.click(clear, outputs=[output])
+with gr.Blocks() as file:
     with gr.Column():
         gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
         with gr.Row():
         submit_button.click(transcribe, [input_audio_microphone, output], [output, latency_textbox], concurrency_limit=None)
         clear_button.click(clear, outputs=[output])
+with gr.Blocks() as translate:  # layout for the streaming translation tab
+    with gr.Column():
+        gr.Markdown(f"# Realtime Whisper Large V3 Turbo (Translation): \n Transcribe and Translate Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
+        with gr.Row():
+            input_audio_microphone = gr.Audio(streaming=True)  # streaming audio source for this tab
+            output = gr.Textbox(label="Transcription and Translation", value="")  # accumulated text; also passed back into the handler
+            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)  # receives the handler's second return value
+        with gr.Row():
+            clear_button = gr.Button("Clear Output")
+        input_audio_microphone.stream(translate_and_transcribe, [input_audio_microphone, output], [output, latency_textbox], time_limit=45, stream_every=2, concurrency_limit=None)  # handler gets (audio, current output text)
+        clear_button.click(clear, outputs=[output])  # writes clear()'s return value into the output box
 with gr.Blocks() as demo:  # top-level container holding the tabbed UI
+    gr.TabbedInterface([microphone, translate, file], ["Microphone", "Realtime Translation", "Transcribe from file"])  # tab titles are listed in the same order as the Blocks
 demo.launch()  # runs at import time of app.py