Spaces:
Sleeping
Sleeping
File size: 5,452 Bytes
af03ede 933c26a 1b38b1a 933c26a 1b38b1a 80656c6 933c26a af03ede 1b38b1a 110b876 af03ede bbb8c7f 1b38b1a ef78070 1b38b1a af03ede 21ef06a 29a2754 25b58c8 af03ede c6e5581 ba04059 5c6c33f ba04059 933c26a ba04059 933c26a af03ede 933c26a af03ede c81e356 af03ede 568f1de af03ede e26ee62 af03ede a4d0f05 5069aea 21ef06a 173efc2 bf36c60 173efc2 c794494 936227e 933c26a c81e356 d9c7010 933c26a 1b38b1a 1f3c73c ad2f36e 1f3c73c ad2f36e ba04059 430f706 bf98a17 ba04059 c81e356 ad2f36e c81e356 ad2f36e ba04059 ad2f36e 80656c6 21ef06a cb24b35 e8eea35 ad2f36e c81e356 ad2f36e e8eea35 cb24b35 ad2f36e 25b58c8 ad2f36e 7485ddb ad2f36e deb75d7 ad2f36e b5c3008 22fe124 c794494 936227e a4d0f05 c794494 25b58c8 ba04059 21ef06a fb4506a 80242cd 739d409 80242cd 739d409 80242cd 739d409 80656c6 739d409 25b58c8 80656c6 21ef06a 2f83497 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
# -*- coding: utf-8 -*-
import os
import uuid
import tempfile
import numpy as np
import scipy.io.wavfile
from fastapi import FastAPI, UploadFile, File, HTTPException, Form
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
from starlette.background import BackgroundTask
from google import genai
from google.genai import types
from silero_vad import (
load_silero_vad,
read_audio,
get_speech_timestamps,
save_audio,
collect_chunks,
)
import torch
import torchaudio
from transformers import (
pipeline,
VitsModel,
VitsTokenizer,
)
import time
import gradio as gr
import logging
logging.basicConfig(level=logging.INFO
, format='%(asctime)s - %(levelname)s - %(message)s')
# Run inference on GPU when available; the transformers pipeline accepts a device string.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# load model and processor
# Fine-tuned Whisper (medium) ASR checkpoint for Philippine languages.
model_id = "rbcurzon/whisper-medium-ph"
pipe = pipeline(
"automatic-speech-recognition",
model=model_id,
chunk_length_s=30,
device=device
)
# Silero voice-activity-detection model, consumed by remove_silence() below.
model = load_silero_vad()
# Gemini client for translation; key comes from the environment (may be None if unset).
client = genai.Client(api_key=os.environ.get("GENAI_API_KEY")) # Do not share api key
"""**FastAPI**"""
app = FastAPI(
title="Real-Time Audio Processor",
description="Process and transcribe audio in real-time using Whisper"
)
def remove_silence(filename):
    """Strip non-speech segments from an audio file using Silero VAD.

    Reads *filename*, keeps only the detected speech chunks, and writes
    them to a fresh temporary WAV at 16 kHz.

    Returns:
        Path of the temporary speech-only WAV file (caller must delete it).
    """
    audio = read_audio(filename)
    timestamps = get_speech_timestamps(audio, model)
    voiced = collect_chunks(timestamps, audio)
    out_path = create_temp_filename()
    save_audio(out_path, voiced, sampling_rate=16000)
    return out_path
def create_temp_filename():
    """Return a unique ``.wav`` path inside the system temp directory.

    The file itself is not created — only a collision-free name is built
    from a random UUID.
    """
    return os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.wav")
def translate(text, srcLang, tgtLang):
    """Translate *text* from *srcLang* to *tgtLang* with Gemini.

    Returns the model's reply text (the bare translation, per the system
    instruction).
    """
    instruction = (
        "You are a professional translator. Generate a translation of the text"
        " and return only the result. Return only the translated text."
    )
    request_config = types.GenerateContentConfig(system_instruction=instruction)
    reply = client.models.generate_content(
        model="gemini-2.0-flash",
        config=request_config,
        contents=f"Translate the text from {srcLang} to {tgtLang}: {text} ",
    )
    return reply.text
def remove_file(file, delay=600):
    """Delete *file* after *delay* seconds (default: 10 minutes).

    Used as a FileResponse background task so synthesized audio stays
    downloadable for a while before cleanup.

    Args:
        file: Path of the file to delete.
        delay: Seconds to wait before deleting (kept at 600 for existing callers).

    A missing file is ignored, so a concurrent/duplicate cleanup cannot
    crash the background task.
    """
    time.sleep(delay)
    try:
        os.remove(file)
    except FileNotFoundError:
        # Already removed elsewhere — cleanup is best-effort.
        pass
@app.get("/")
def read_root():
    """Landing endpoint: identifies the service."""
    payload = {"detail": "Philippine Regional Language Translator"}
    return payload
@app.post("/translateAudio/")
async def translate_audio(
    file: UploadFile = File(...),
    srcLang: str = Form("Tagalog"),
    tgtLang: str = Form("Cebuano")
):
    """Transcribe an uploaded audio file (Whisper) and translate the text (Gemini).

    Args:
        file: Uploaded audio file; saved to disk temporarily for processing.
        srcLang/tgtLang: Source and target language names passed to the translator.

    Returns:
        dict with transcribed_text, translated_text, srcLang, tgtLang.

    Raises:
        HTTPException(500): on any processing failure.
    """
    # Initialize up front: if remove_silence()/pipe() raise, the finally block
    # below would otherwise hit NameError on temp_file and mask the real error.
    temp_file = None
    try:
        content = await file.read()
        # NOTE(review): this trusts the client-supplied filename (potential path
        # traversal / cwd clutter) — consider writing to a temp path instead.
        with open(file.filename, 'wb') as f:
            f.write(content)
        logging.info(f"Successfully uploaded {file.filename}")
        generate_kwargs = {
            "language": "tagalog",
            "return_timestamps": True,
        }
        # Drop silence first so Whisper only sees speech.
        temp_file = remove_silence(file.filename)
        result = pipe(
            temp_file,
            batch_size=8,
            return_timestamps=True,
            generate_kwargs=generate_kwargs
        )
        return {
            "transcribed_text": result['text'],
            "translated_text": translate(result['text'], srcLang=srcLang, tgtLang=tgtLang),
            "srcLang": srcLang,
            "tgtLang": tgtLang
        }
    except HTTPException:
        # Don't re-wrap deliberate HTTP errors.
        raise
    except Exception as error:
        logging.error(f"Error translating audio {file.filename}: {error}")
        raise HTTPException(status_code=500, detail=str(error))
    finally:
        # Best-effort cleanup of both the raw upload and the VAD output.
        if file.file:
            file.file.close()
        if os.path.exists(file.filename):
            os.remove(file.filename)
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)
@app.post("/translateText/")
async def translate_text(text: str,
                         srcLang: str = Form(...),
                         tgtLang: str = Form(...)):
    """Translate plain text between the given languages.

    Raises:
        HTTPException(500): when the translator returns an empty result.
    """
    translated = translate(text, srcLang, tgtLang)
    if not translated:
        logging.error("Translation failed for text: %s", text)
        raise HTTPException(status_code=500, detail="Translation failed")
    return {
        "text": text,
        "translated_text": translated,
        "srcLang": srcLang,
        "tgtLang": tgtLang,
    }
def _load_tts():
    """Load and cache the MMS Tagalog TTS model + tokenizer (first call only).

    The original code called from_pretrained() on every request, re-downloading
    /re-initializing the model each time; caching on a function attribute keeps
    the interface unchanged while making subsequent requests fast.
    """
    cached = getattr(_load_tts, "_cache", None)
    if cached is None:
        cached = (
            VitsModel.from_pretrained("facebook/mms-tts-tgl"),
            VitsTokenizer.from_pretrained("facebook/mms-tts-tgl"),
        )
        _load_tts._cache = cached
    return cached

@app.post("/synthesize/")
async def synthesize(text: str):
    """Synthesize Tagalog speech for *text* and return it as a WAV response.

    The temp file is deleted later by remove_file via a BackgroundTask.
    """
    # Renamed from `model`/`tokenizer`: the original local `model` shadowed the
    # module-level Silero VAD `model`.
    tts_model, tts_tokenizer = _load_tts()
    run_device = "cuda" if torch.cuda.is_available() else "cpu"
    tts_model.to(run_device)
    inputs = tts_tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"].to(run_device)
    with torch.no_grad():
        waveform = tts_model(input_ids)["waveform"]
    temp_file = create_temp_filename()
    # 16 kHz — presumably matches tts_model.config.sampling_rate for MMS-TTS; TODO confirm.
    torchaudio.save(temp_file, waveform.cpu(), 16000)
    logging.info(f"Synthesizing completed for text: {text}")
    return FileResponse(
        temp_file,
        media_type="audio/wav",
        background=BackgroundTask(remove_file, temp_file)
    )