Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -14,6 +14,9 @@ from transformers import VitsModel, AutoTokenizer
|
|
14 |
import numpy as np
|
15 |
import scipy
|
16 |
from IPython.display import Audio
|
|
|
|
|
|
|
17 |
|
18 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
19 |
|
@@ -37,6 +40,28 @@ app = FastAPI(
|
|
37 |
description="Process and transcribe audio in real-time using Whisper"
|
38 |
)
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
def translate(text, srcLang, tgtLang):
|
41 |
sys_instruct = "You are a professional translator. Generate a translation of the text and return only the result. Return only the translated text."
|
42 |
response = client.models.generate_content(
|
@@ -50,8 +75,8 @@ def translate(text, srcLang, tgtLang):
|
|
50 |
@app.post("/translateAudio/")
|
51 |
async def translate_audio(
|
52 |
file: UploadFile = File(...),
|
53 |
-
srcLang: str = Form(
|
54 |
-
tgtLang: str = Form(
|
55 |
):
|
56 |
|
57 |
try:
|
@@ -60,14 +85,6 @@ async def translate_audio(
|
|
60 |
f.write(content)
|
61 |
print(f"Successfully uploaded {file.filename}")
|
62 |
|
63 |
-
wav = read_audio(file.filename)
|
64 |
-
speech_timestamps = get_speech_timestamps(wav, model)
|
65 |
-
save_audio(
|
66 |
-
"only_speech.wav",
|
67 |
-
collect_chunks(speech_timestamps, wav),
|
68 |
-
sampling_rate=16000
|
69 |
-
)
|
70 |
-
|
71 |
generate_kwargs = {
|
72 |
"language": "tagalog",
|
73 |
"return_timestamps": True,
|
@@ -75,24 +92,19 @@ async def translate_audio(
|
|
75 |
# "initial_prompt": "The sentence may be cut off, do not make up words to fill in the rest of the sentence."
|
76 |
}
|
77 |
|
|
|
|
|
78 |
result = pipe(
|
79 |
-
|
80 |
batch_size=8,
|
81 |
return_timestamps=True,
|
82 |
generate_kwargs=generate_kwargs
|
83 |
)
|
84 |
print(result)
|
85 |
-
|
86 |
-
timestamp = result['chunks'][0]['timestamp']
|
87 |
-
end_time = timestamp[1]
|
88 |
-
if end_time is None:
|
89 |
-
raise Exception("The speech is difficult to understand.")
|
90 |
|
91 |
-
translatedResult = translate(result['text'], srcLang=srcLang, tgtLang=tgtLang)
|
92 |
-
|
93 |
result_dict = {
|
94 |
"transcribed_text": result['text'],
|
95 |
-
"translated_text":
|
96 |
"srcLang": srcLang,
|
97 |
"tgtLang": tgtLang
|
98 |
}
|
@@ -109,8 +121,8 @@ async def translate_audio(
|
|
109 |
file.file.close()
|
110 |
if os.path.exists(file.filename):
|
111 |
os.remove(file.filename)
|
112 |
-
if os.path.exists(
|
113 |
-
os.remove(
|
114 |
|
115 |
|
116 |
@app.post("/translateText/")
|
|
|
14 |
import numpy as np
|
15 |
import scipy
|
16 |
from IPython.display import Audio
|
17 |
+
import uuid
|
18 |
+
import os
|
19 |
+
import tempfile
|
20 |
|
21 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
22 |
|
|
|
40 |
description="Process and transcribe audio in real-time using Whisper"
|
41 |
)
|
42 |
|
43 |
+
def remove_silence(filename, sampling_rate=16000):
    """Strip non-speech segments from an audio file via voice-activity detection.

    Detects speech regions in *filename*, concatenates only those chunks, and
    writes the result to a freshly generated temporary WAV file.

    Args:
        filename: Path to the input audio file.
        sampling_rate: Sample rate used when saving the speech-only audio.
            Defaults to 16000 (the rate Whisper expects).

    Returns:
        Path to the temporary WAV file containing only the speech chunks.
        The caller is responsible for deleting this file.
    """
    wav = read_audio(filename)
    # NOTE(review): `model` here is presumably the module-level VAD model
    # (e.g. Silero), not the Whisper pipeline — confirm against the imports.
    speech_timestamps = get_speech_timestamps(wav, model)
    temp_file = create_temp_filename()
    save_audio(
        temp_file,
        collect_chunks(speech_timestamps, wav),
        sampling_rate=sampling_rate,
    )
    return temp_file
|
54 |
+
|
55 |
+
def create_temp_filename(suffix=".wav"):
    """Return a unique file path inside the system temp directory.

    Only a path string is built — no file is created on disk, so there is a
    small race window between generating the name and the caller writing to
    it. Acceptable here because the name embeds a UUID4.

    Args:
        suffix: Extension for the generated name, including the leading dot.
            Defaults to ".wav" (matching the audio pipeline's output).

    Returns:
        Absolute path string like "<tempdir>/<uuid4><suffix>".
    """
    # uuid4 makes collisions practically impossible across concurrent requests.
    unique_name = f"{uuid.uuid4()}{suffix}"
    return os.path.join(tempfile.gettempdir(), unique_name)
|
64 |
+
|
65 |
def translate(text, srcLang, tgtLang):
|
66 |
sys_instruct = "You are a professional translator. Generate a translation of the text and return only the result. Return only the translated text."
|
67 |
response = client.models.generate_content(
|
|
|
75 |
@app.post("/translateAudio/")
|
76 |
async def translate_audio(
|
77 |
file: UploadFile = File(...),
|
78 |
+
srcLang: str = Form("Tagalog"),
|
79 |
+
tgtLang: str = Form("Cebuano")
|
80 |
):
|
81 |
|
82 |
try:
|
|
|
85 |
f.write(content)
|
86 |
print(f"Successfully uploaded {file.filename}")
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
generate_kwargs = {
|
89 |
"language": "tagalog",
|
90 |
"return_timestamps": True,
|
|
|
92 |
# "initial_prompt": "The sentence may be cut off, do not make up words to fill in the rest of the sentence."
|
93 |
}
|
94 |
|
95 |
+
temp_file = remove_silence(file.filename)
|
96 |
+
|
97 |
result = pipe(
|
98 |
+
temp_file,
|
99 |
batch_size=8,
|
100 |
return_timestamps=True,
|
101 |
generate_kwargs=generate_kwargs
|
102 |
)
|
103 |
print(result)
|
|
|
|
|
|
|
|
|
|
|
104 |
|
|
|
|
|
105 |
result_dict = {
|
106 |
"transcribed_text": result['text'],
|
107 |
+
"translated_text": translate(result['text'], srcLang=srcLang, tgtLang=tgtLang),
|
108 |
"srcLang": srcLang,
|
109 |
"tgtLang": tgtLang
|
110 |
}
|
|
|
121 |
file.file.close()
|
122 |
if os.path.exists(file.filename):
|
123 |
os.remove(file.filename)
|
124 |
+
if os.path.exists(temp_file):
|
125 |
+
os.remove(temp_file)
|
126 |
|
127 |
|
128 |
@app.post("/translateText/")
|