# -*- coding: utf-8 -*-
"""main.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/17Umb-Po_5pESiRv3-dcDRyootgqBjjWM
"""

import torch
from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the Whisper model as an ASR pipeline
model_id = "rbcurzon/whisper-small-fil"
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    chunk_length_s=30,
    device=device,
)

"""**FastAPI**"""

import os

from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from google import genai
from google.genai import types

client = genai.Client(api_key=os.environ.get("GENAI_API_KEY"))  # Do not share api key


def translate(text, srcLang, tgtLang):
    # Ask Gemini to translate and return only the translated text
    sys_instruct = (
        "You are a professional translator. Generate a translation of the text "
        "and return only the result. Return only the translated text."
    )
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        config=types.GenerateContentConfig(system_instruction=sys_instruct),
        contents=f"Translate the text from {srcLang} to {tgtLang}: {text}",
    )
    print(response)
    return response.text


from silero_vad import (
    collect_chunks,
    get_speech_timestamps,
    load_silero_vad,
    read_audio,
    save_audio,
)

model = load_silero_vad()

app = FastAPI(
    title="Real-Time Audio Processor",
    description="Process and transcribe audio in real-time using Whisper",
)


@app.post("/translateAudio/")
async def translate_audio(
    file: UploadFile = File(...),
    srcLang: str = Form(...),
    tgtLang: str = Form(...),
):
    try:
        # Save the uploaded audio to disk
        content = await file.read()
        with open(file.filename, "wb") as f:
            f.write(content)
        print(f"Successfully uploaded {file.filename}")

        # Keep only the speech segments detected by Silero VAD
        wav = read_audio(file.filename)
        speech_timestamps = get_speech_timestamps(wav, model)
        save_audio(
            "only_speech.wav",
            collect_chunks(speech_timestamps, wav),
            sampling_rate=16000,
        )

        # Transcribe the speech-only audio with Whisper
        result = pipe(
            "only_speech.wav",
            batch_size=8,
            return_timestamps=True,
            generate_kwargs={"language": "tagalog", "return_timestamps": True},
        )

        # Translate the transcript with Gemini
        translatedResult = translate(result["text"], srcLang=srcLang, tgtLang=tgtLang)
        return {
            "transcribed_text": result["text"],
            "translated_text": translatedResult,
            "srcLang": srcLang,
            "tgtLang": tgtLang,
        }
    except Exception as error:
        print("Error: ", str(error))
        raise HTTPException(status_code=500, detail=str(error))
    finally:
        # Clean up the uploaded file and the intermediate speech-only audio
        if file.file:
            file.file.close()
        if os.path.exists(file.filename):
            os.remove(file.filename)
        if os.path.exists("only_speech.wav"):
            os.remove("only_speech.wav")


@app.post("/translateText/")
async def translate_text(text: str, srcLang: str = Form(...), tgtLang: str = Form(...)):
    result = translate(text, srcLang, tgtLang)
    print("Raw: ", text)
    print("Translated: ", result)
    return {"translated_text": result}
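
"""**Example client (sketch)**"""

# A minimal sketch of calling the two endpoints above, assuming the app is
# served locally (e.g. `uvicorn main:app --port 8000`). The base URL, the
# sample file name "sample.wav", and the language strings are assumptions for
# illustration only, not part of the service itself.
#
# import requests
#
# BASE_URL = "http://127.0.0.1:8000"  # assumed local uvicorn address
#
# def example_translate_audio():
#     # /translateAudio/ expects multipart form data: the audio file plus
#     # srcLang/tgtLang form fields.
#     with open("sample.wav", "rb") as audio:
#         response = requests.post(
#             f"{BASE_URL}/translateAudio/",
#             files={"file": ("sample.wav", audio, "audio/wav")},
#             data={"srcLang": "Tagalog", "tgtLang": "English"},
#         )
#     response.raise_for_status()
#     return response.json()  # transcribed_text, translated_text, srcLang, tgtLang
#
# def example_translate_text():
#     # Because `text` has no Form() marker, FastAPI treats it as a query
#     # parameter, while srcLang/tgtLang remain form fields.
#     response = requests.post(
#         f"{BASE_URL}/translateText/",
#         params={"text": "Magandang umaga"},
#         data={"srcLang": "Tagalog", "tgtLang": "English"},
#     )
#     response.raise_for_status()
#     return response.json()  # translated_text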