Spaces:

rbcurzon
/

speech-to-text

Running

App Files Community

rbcurzon committed on Apr 26

Commit

1b38b1a

verified ·

1 Parent(s): b5c3008

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -12

app.py CHANGED Viewed

@@ -1,22 +1,33 @@
 # -*- coding: utf-8 -*-
 import os
-from fastapi import FastAPI, WebSocket, UploadFile, File, HTTPException, Form
 from fastapi.middleware.cors import CORSMiddleware
 from google import genai
 from google.genai import types
-from silero_vad import load_silero_vad, read_audio, get_speech_timestamps, save_audio, collect_chunks
 import torch
-from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
-from fastapi.responses import FileResponse
-from transformers import VitsModel, AutoTokenizer
-import numpy as np
-import scipy
-from IPython.display import Audio
-import uuid
-import os
-import tempfile
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -78,7 +89,28 @@ async def translate_audio(
     srcLang: str = Form("Tagalog"),
     tgtLang: str = Form("Cebuano")
     ):
     try:
         content = await file.read()
         with open(file.filename, 'wb') as f:

 # -*- coding: utf-8 -*-
 import os
+import uuid
+import tempfile
+import numpy as np
+import scipy.io.wavfile
+from fastapi import FastAPI, UploadFile, File, HTTPException, Form
+from fastapi.responses import FileResponse
 from fastapi.middleware.cors import CORSMiddleware
 from google import genai
 from google.genai import types
+from silero_vad import (
+    load_silero_vad,
+    read_audio,
+    get_speech_timestamps,
+    save_audio,
+    collect_chunks,
+)
 import torch
+from transformers import (
+    WhisperProcessor,
+    WhisperForConditionalGeneration,
+    pipeline,
+    VitsModel,
+    AutoTokenizer,
+)
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
     srcLang: str = Form("Tagalog"),
     tgtLang: str = Form("Cebuano")
     ):
+    """
+    Endpoint to translate audio files.
+    This endpoint accepts an audio file, processes it to remove silence, transcribes the audio,
+    and translates the transcribed text from the source language to the target language.
+    Args:
+        file (UploadFile): The audio file to be uploaded and processed.
+        srcLang (str): The source language of the audio transcription. Defaults to "Tagalog".
+        tgtLang (str): The target language for translation. Defaults to "Cebuano".
+    Returns:
+        dict: A dictionary containing:
+            - transcribed_text (str): The transcribed text from the audio.
+            - translated_text (str): The translated text from the source language to the target language.
+            - srcLang (str): The source language used for transcription.
+            - tgtLang (str): The target language used for translation.
+    Raises:
+        HTTPException: If an error occurs during processing, a 500 status code is returned with the error details.
+    Notes:
+        - The uploaded file is temporarily saved to disk for processing and removed after completion.
+        - Silence is removed from the audio file before transcription.
+        - The transcription and translation processes are performed asynchronously.
+    """
     try:
         content = await file.read()
         with open(file.filename, 'wb') as f: