# -*- coding: utf-8 -*-
import os
import uuid
import tempfile
import numpy as np
import scipy.io.wavfile

from fastapi import FastAPI, UploadFile, File, HTTPException, Form
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware

from google import genai
from google.genai import types

from silero_vad import (
    load_silero_vad,
    read_audio,
    get_speech_timestamps,
    save_audio,
    collect_chunks,
)

import torch
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    pipeline,
    VitsModel,
    AutoTokenizer,
)

# Run inference on the first CUDA device when one is available, else on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Checkpoint used by the speech-recognition pipeline below.
model_id = "rbcurzon/whisper-medium-tgl"

# ASR pipeline, built once at import time with chunk_length_s=30.
pipe = pipeline(
    "automatic-speech-recognition",
    device=device,
    chunk_length_s=30,
    model=model_id,
)

# NOTE: `model` is the Silero VAD model consumed by remove_silence(), not the
# Whisper checkpoint named in `model_id`.
model = load_silero_vad()

# The API key comes from the GENAI_API_KEY environment variable; never
# hard-code or share it.
client = genai.Client(api_key=os.getenv("GENAI_API_KEY"))

# The bare string below is a no-op expression kept as a section marker.
"""**FastAPI**"""
app = FastAPI(
    description="Process and transcribe audio in real-time using Whisper",
    title="Real-Time Audio Processor",
)

def remove_silence(filename):
    """Write a speech-only copy of an audio file and return its path.

    The audio is read with ``read_audio``, speech regions are located with
    ``get_speech_timestamps`` using the module-level VAD ``model``, and the
    regions gathered by ``collect_chunks`` are saved to a fresh path from
    ``create_temp_filename`` with ``sampling_rate=16000``.

    Args:
        filename: Path of the audio file to process.

    Returns:
        Path of the newly written file. The caller is responsible for
        deleting it.
    """
    waveform = read_audio(filename)
    segments = get_speech_timestamps(waveform, model)

    output_path = create_temp_filename()
    speech_only = collect_chunks(segments, waveform)
    save_audio(output_path, speech_only, sampling_rate=16000)
    return output_path

def create_temp_filename():
    """Return a unique ``<uuid4>.wav`` path inside the system temp directory.

    Only the path is produced; no file is created on disk.
    """
    return os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.wav")

def translate(text, srcLang, tgtLang):
    """Translate ``text`` from ``srcLang`` to ``tgtLang`` with the Gemini client.

    Args:
        text: Text to translate.
        srcLang: Name of the source language, inserted into the prompt.
        tgtLang: Name of the target language, inserted into the prompt.

    Returns:
        The ``text`` attribute of the model response.
    """
    system_prompt = "You are a professional translator. Generate a translation of the text and return only the result. Return only the translated text."
    request_config = types.GenerateContentConfig(system_instruction=system_prompt)
    user_prompt = f"Translate the text from {srcLang} to {tgtLang}: {text} "

    reply = client.models.generate_content(
        model="gemini-2.0-flash",
        config=request_config,
        contents=user_prompt,
    )
    return reply.text

@app.post("/translateAudio/")
async def translate_audio(
    file: UploadFile = File(...),
    srcLang: str = Form("Tagalog"),
    tgtLang: str = Form("Cebuano")
    ):
    """
    Transcribe an uploaded audio file and translate the transcription.

    The upload is saved to a uniquely named file in the system temp directory,
    silence is stripped with ``remove_silence``, the result is transcribed by
    the module-level ``pipe`` and the transcription is passed to ``translate``.

    Args:
        file (UploadFile): The audio file to be uploaded and processed.
        srcLang (str): Source language named in the translation prompt.
            Defaults to "Tagalog". The transcription language itself is fixed
            to "tagalog" in ``generate_kwargs``.
        tgtLang (str): The target language for translation. Defaults to "Cebuano".
    Returns:
        dict: A dictionary containing:
            - transcribed_text (str): The transcribed text from the audio.
            - translated_text (str): The translation returned by ``translate``.
            - srcLang (str): The source language that was requested.
            - tgtLang (str): The target language that was requested.
    Raises:
        HTTPException: Any error during processing is reported as a 500
            response whose detail is the error message.
    Notes:
        - Both temporary files (the saved upload and the silence-stripped
          copy) are removed when the request finishes, whether or not it
          succeeded.
        - Transcription and translation are blocking calls made directly from
          this coroutine, so they occupy the event loop while they run.
    """
    # Bound before the ``try`` so the cleanup in ``finally`` never hits an
    # unbound name when an early step fails.
    upload_path = None
    temp_file = None

    try:
        # The client-supplied name is untrusted: only its extension is kept
        # (so audio loaders can still recognise the format) and the file goes
        # to a unique temp path, avoiding path traversal and name collisions
        # between concurrent uploads.
        suffix = os.path.splitext(os.path.basename(file.filename or ""))[1]
        upload_path = os.path.join(
            tempfile.gettempdir(), f"{uuid.uuid4()}{suffix}"
        )

        content = await file.read()
        with open(upload_path, 'wb') as f:
            f.write(content)
        print(f"Successfully uploaded {file.filename}")

        generate_kwargs = {
            "language": "tagalog",
            "return_timestamps": True,
        }

        temp_file = remove_silence(upload_path)

        result = pipe(
            temp_file,
            batch_size=8,
            return_timestamps=True,
            generate_kwargs=generate_kwargs
        )
        print(result)

        result_dict = {
            "transcribed_text": result['text'],
            "translated_text": translate(result['text'], srcLang=srcLang, tgtLang=tgtLang),
            "srcLang": srcLang,
            "tgtLang": tgtLang
        }
        print(result_dict)

        return result_dict

    except Exception as error:
        print("Error: ", str(error))
        raise HTTPException(status_code=500, detail=str(error)) from error

    finally:
        if file.file:
            file.file.close()
        # Best-effort removal of whichever temp files were actually created.
        for path in (upload_path, temp_file):
            if path is None:
                continue
            try:
                os.remove(path)
            except FileNotFoundError:
                pass
        

@app.post("/translateText/")
async def translate_text(
    text: str,
    srcLang: str = Form(...),
    tgtLang: str = Form(...),
):
    """Translate a piece of text and echo the request alongside the result.

    Args:
        text: Text to translate. It is declared without ``Form``, so FastAPI
            reads it from the query string rather than the form body.
        srcLang: Source language (required form field).
        tgtLang: Target language (required form field).

    Returns:
        dict: ``text``, ``translated_text``, ``srcLang`` and ``tgtLang``.
    """
    translated = translate(text, srcLang, tgtLang)
    print('Raw: ', text)
    print('Translated: ', translated)

    return {
        "text": text,
        "translated_text": translated,
        "srcLang": srcLang,
        "tgtLang": tgtLang,
    }

_TTS_MODEL_ID = "facebook/mms-tts-tgl"
# Filled on first use so the checkpoint is loaded once per process instead of
# once per request.
_tts_cache = {}


def _get_tts():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use."""
    if "model" not in _tts_cache:
        _tts_cache["tokenizer"] = AutoTokenizer.from_pretrained(_TTS_MODEL_ID)
        _tts_cache["model"] = VitsModel.from_pretrained(_TTS_MODEL_ID)
    return _tts_cache["model"], _tts_cache["tokenizer"]


@app.post("/synthesize/")
async def synthesize(text: str):
    """Synthesize speech for ``text`` and return it as a WAV response.

    The audio is encoded in memory rather than written to a shared
    ``speech.wav`` file, so concurrent requests cannot overwrite each other's
    output and nothing is left behind on disk.

    Args:
        text: Text to speak. It is declared without ``Form``, so FastAPI reads
            it from the query string.

    Returns:
        Response: ``audio/wav`` body holding the generated waveform at the
        model's configured sampling rate.

    Raises:
        HTTPException: 400 when ``text`` is empty or only whitespace.
    """
    # Function-scope imports keep this block self-contained.
    import io
    from fastapi.responses import Response

    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="text must not be empty")

    # Named ``tts_model`` so it does not shadow the module-level VAD ``model``.
    tts_model, tokenizer = _get_tts()

    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        waveform = tts_model(**inputs).waveform

    samples = np.squeeze(waveform.numpy())

    # scipy accepts an open file-like object in place of a filename.
    buffer = io.BytesIO()
    scipy.io.wavfile.write(
        buffer, rate=tts_model.config.sampling_rate, data=samples
    )

    return Response(content=buffer.getvalue(), media_type="audio/wav")