# -*- coding: utf-8 -*-
"""FastAPI service for speech transcription, translation and synthesis.

Endpoints:
    POST /translateAudio/  - strip silence, transcribe with Whisper, translate.
    POST /translateText/   - translate plain text.
    POST /synthesize/      - text-to-speech, returned as a WAV file.
"""
import functools
import os
import uuid
import tempfile

import numpy as np
import scipy.io.wavfile
from fastapi import (
    BackgroundTasks,
    FastAPI,
    UploadFile,
    File,
    HTTPException,
    Form,
)
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
from google import genai
from google.genai import types
from silero_vad import (
    load_silero_vad,
    read_audio,
    get_speech_timestamps,
    save_audio,
    collect_chunks,
)
import torch
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    pipeline,
    VitsModel,
    AutoTokenizer,
)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the speech-recognition pipeline once, at import time.
model_id = "rbcurzon/whisper-medium-tgl"
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    chunk_length_s=30,
    device=device,
)

# Silero voice-activity-detection model used by remove_silence().
model = load_silero_vad()

# The key is read from the environment; never hard-code or share it.
client = genai.Client(api_key=os.environ.get("GENAI_API_KEY"))

# Sampling rate passed to save_audio() for the speech-only file.
VAD_SAMPLING_RATE = 16000
# Checkpoint used by the /synthesize/ endpoint.
TTS_MODEL_ID = "facebook/mms-tts-tgl"

# --- FastAPI application ---
app = FastAPI(
    title="Real-Time Audio Processor",
    description="Process and transcribe audio in real-time using Whisper",
)


def remove_silence(filename):
    """Write a copy of *filename* that contains only the detected speech.

    Args:
        filename: Path of the audio file to process.

    Returns:
        Path of a newly created temporary WAV file. The caller is
        responsible for deleting it.

    Raises:
        ValueError: If the voice-activity detector finds no speech.
    """
    wav = read_audio(filename)
    speech_timestamps = get_speech_timestamps(wav, model)
    if not speech_timestamps:
        # collect_chunks() concatenates the speech segments and fails with an
        # opaque tensor error on an empty list; report the real cause instead.
        raise ValueError("No speech detected in the audio file")

    temp_file = create_temp_filename()
    save_audio(
        temp_file,
        collect_chunks(speech_timestamps, wav),
        sampling_rate=VAD_SAMPLING_RATE,
    )
    return temp_file


def create_temp_filename(suffix=".wav"):
    """Return a unique path inside the system temporary directory.

    Only the name is generated; no file is created on disk.

    Args:
        suffix: File extension (including the dot) appended to the name.

    Returns:
        An absolute path of the form ``<tmpdir>/<uuid4><suffix>``.
    """
    return os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}{suffix}")


def translate(text, srcLang, tgtLang):
    """Translate *text* from *srcLang* to *tgtLang* with the Gemini API.

    Args:
        text: Text to translate.
        srcLang: Name of the source language, inserted into the prompt.
        tgtLang: Name of the target language, inserted into the prompt.

    Returns:
        The ``text`` attribute of the model response.
    """
    sys_instruct = "You are a professional translator. Generate a translation of the text and return only the result. Return only the translated text."
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        config=types.GenerateContentConfig(
            system_instruction=sys_instruct),
        contents=f"Translate the text from {srcLang} to {tgtLang}: {text} ",
    )
    return response.text


@app.post("/translateAudio/")
async def translate_audio(
    file: UploadFile = File(...),
    srcLang: str = Form("Tagalog"),
    tgtLang: str = Form("Cebuano")
):
    """Transcribe an uploaded audio file and translate the transcription.

    The upload is saved to a uniquely named temporary file, silence is
    removed, the remaining speech is transcribed and the text is translated
    from the source language to the target language.

    Args:
        file (UploadFile): The audio file to be uploaded and processed.
        srcLang (str): Source language passed to the translator.
            Defaults to "Tagalog".
        tgtLang (str): Target language passed to the translator.
            Defaults to "Cebuano".

    Returns:
        dict: A dictionary containing:
            - transcribed_text (str): The transcribed text from the audio.
            - translated_text (str): The translation of that text.
            - srcLang (str): The source language that was requested.
            - tgtLang (str): The target language that was requested.

    Raises:
        HTTPException: If an error occurs during processing, a 500 status
            code is returned with the error details.

    Notes:
        - Both temporary files (the raw upload and the silence-stripped
          copy) are removed when the request finishes, even on failure.
        - Transcription always uses the "tagalog" Whisper language setting,
          independent of ``srcLang``.
    """
    # Bound before the try block so the finally clause can always test them.
    upload_path = None
    temp_file = None
    try:
        content = await file.read()

        # Never write to the client-supplied name: it is untrusted (path
        # traversal, overwriting local files) and concurrent uploads with the
        # same name would clobber each other. Keep only a sane extension.
        suffix = os.path.splitext(os.path.basename(file.filename or ""))[1]
        if not suffix[1:].isalnum():
            suffix = ".wav"
        upload_path = create_temp_filename(suffix)
        with open(upload_path, 'wb') as f:
            f.write(content)
        print(f"Successfully uploaded {file.filename}")

        generate_kwargs = {
            "language": "tagalog",
            "return_timestamps": True,
        }

        temp_file = remove_silence(upload_path)

        result = pipe(
            temp_file,
            batch_size=8,
            return_timestamps=True,
            generate_kwargs=generate_kwargs
        )
        print(result)

        result_dict = {
            "transcribed_text": result['text'],
            "translated_text": translate(result['text'], srcLang=srcLang, tgtLang=tgtLang),
            "srcLang": srcLang,
            "tgtLang": tgtLang
        }
        print(result_dict)
        return result_dict

    except Exception as error:
        # Top-level boundary: report any failure to the client as a 500.
        print("Error: ", str(error))
        raise HTTPException(status_code=500, detail=str(error)) from error

    finally:
        if file.file:
            file.file.close()
        for path in (upload_path, temp_file):
            if path and os.path.exists(path):
                os.remove(path)


@app.post("/translateText/")
async def translate_text(text: str, srcLang: str = Form(...), tgtLang: str = Form(...)):
    """Translate *text* from *srcLang* to *tgtLang*.

    Args:
        text: Text to translate.
        srcLang: Source language (form field).
        tgtLang: Target language (form field).

    Returns:
        dict: The original text, its translation and both language names.
    """
    result = translate(text, srcLang, tgtLang)
    print('Raw: ', text)
    print('Translated: ', result)

    result_dict = {
        "text": text,
        "translated_text": result,
        "srcLang": srcLang,
        "tgtLang": tgtLang
    }
    return result_dict


@functools.lru_cache(maxsize=1)
def _load_tts():
    """Load the TTS model and tokenizer once and reuse them afterwards."""
    tts_model = VitsModel.from_pretrained(TTS_MODEL_ID)
    tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
    return tts_model, tokenizer


@app.post("/synthesize/")
async def synthesize(text: str):
    """Synthesize speech for *text* and return it as a WAV file.

    Args:
        text: Text to convert to speech.

    Returns:
        FileResponse: The generated audio with media type ``audio/wav``.
    """
    # Loading the checkpoint on every request is slow; reuse the cached pair.
    tts_model, tokenizer = _load_tts()

    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = tts_model(**inputs).waveform

    waveform = np.squeeze(output.numpy())

    # A per-request file avoids concurrent requests overwriting one shared
    # "speech.wav" while another response is still being streamed.
    out_path = create_temp_filename()
    scipy.io.wavfile.write(
        out_path,
        rate=tts_model.config.sampling_rate,
        data=waveform,
    )

    # Delete the file only after the response body has been sent.
    cleanup = BackgroundTasks()
    cleanup.add_task(os.remove, out_path)
    return FileResponse(out_path, media_type="audio/wav", background=cleanup)