import os
import sys
import hashlib
import gradio as gr
from subprocess import call

# Install the TTS dependency at startup.
# Going through `sys.executable -m pip` guarantees the package is installed
# into the interpreter that is running this script; a bare `pip` found on PATH
# may belong to a different Python installation.
# The exit status is intentionally not checked: installation stays
# best-effort, exactly as before.
call([sys.executable, "-m", "pip", "install", "coqui-tts-pygoruut==0.27.4"])

# Model registry, keyed by the label shown to the user. Every checkpoint lives
# in its own repository under the same Hugging Face organisation and exposes
# the same two files, so entries are built by a small helper.
_HF_ORG_URL = "https://huggingface.co/neurlang"


def _hf_model(repo, key, default_text):
    """Build one registry entry for the repository ``repo``.

    ``key`` is the short identifier stored under ``"key"`` (it must be unique
    per model); ``default_text`` is a sample sentence in the model's language.
    """
    base = f"{_HF_ORG_URL}/{repo}/resolve/main"
    return {
        "model_url": f"{base}/best_model.pth",
        "config_url": f"{base}/config.json",
        "key": key,
        "default_text": default_text,
    }


MODELS = {
    "US English (VITS LJspeech)": _hf_model(
        "coqui-vits-ljspeech-us-english",
        "ljspeech_us",
        "Hello world, this is an English TTS example.",
    ),
    "Korean (VITS KSS)": _hf_model(
        "coqui-vits-kss-korean",
        "kss_ko",
        "안녕하세요 세상, 이것은 한국어 TTS 예제입니다.",
    ),
    "MinNan Hokkien (VITS SuiSiann)": _hf_model(
        "coqui-vits-suisiann-minnan-hokkien",
        "suisiann_minnan_hokkien",
        "你好！我是蔡贏。我的人在台北。我閣好笑你會幫參。",
    ),
    "Uyghur (VITS UQSpeech)": _hf_model(
        "coqui-vits-uqspeech-uyghur",
        "uqspeech_ug",
        "بۈگۈن ھاۋا  ئىنتايىن  ياخشى  بولۇپ، كۈن  نۇرى  چاقناۋاتدۇ.",
    ),
    "Slovak (VITS Female)": _hf_model(
        "coqui-vits-slovakspeech-female-slovak",
        "slovakspeech_sk",
        "Ahoj svet, toto je príklad syntézy reči.",
    ),
}

# Working directories, relative to the current working directory:
# CACHE_DIR holds downloaded model/config files, OUTPUT_DIR holds synthesized
# audio. Both are created up front so later code can write into them directly.
CACHE_DIR, OUTPUT_DIR = "model_cache", "outputs"
for _directory in (CACHE_DIR, OUTPUT_DIR):
    os.makedirs(_directory, exist_ok=True)

def run_cmd(command):
    """Run an external command and return its exit status.

    Args:
        command: Argument list (program name followed by its arguments). It is
            executed directly, without a shell.

    Returns:
        The exit status reported by ``subprocess.call``; ``0`` means success.
        Callers that ignore the return value keep working unchanged.

    Raises:
        SystemExit: With status 1 if the user interrupts the command
            (Ctrl+C) while it is running.
        OSError: If the program cannot be started (e.g. it is not installed).
    """
    print("Running:", " ".join(command))
    try:
        status = call(command)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)

    # Surface failures instead of silently discarding the exit status.
    if status != 0:
        print(f"Command exited with status {status}")
    return status

def _fetch_to_cache(url, dest_path):
    """Download ``url`` to ``dest_path``, leaving no file behind on failure."""
    # `wget -O` creates the output file even when the transfer fails, so the
    # download goes to a side file first and is only moved into place once it
    # is known to be complete. Otherwise a failed download would look like a
    # valid cache entry on every later call.
    part_path = f"{dest_path}.part"
    try:
        status = call(["wget", "-q", url, "-O", part_path])
    except OSError as err:
        raise RuntimeError(f"Could not run wget to download {url}") from err

    complete = (
        status == 0
        and os.path.isfile(part_path)
        and os.path.getsize(part_path) > 0
    )
    if not complete:
        if os.path.exists(part_path):
            os.remove(part_path)
        raise RuntimeError(f"Download failed (wget exit status {status}): {url}")

    os.replace(part_path, dest_path)


def download_model(model_name):
    """Ensure the model and config files for ``model_name`` are cached locally.

    Files are stored in ``CACHE_DIR`` under names derived from the model's
    ``"key"`` (plus a short MD5 of that key) and are downloaded with ``wget``
    only when no usable cached copy exists.

    Args:
        model_name: A key of ``MODELS``.

    Returns:
        A ``(model_path, config_path)`` tuple of local file paths.

    Raises:
        KeyError: If ``model_name`` is not in ``MODELS``.
        RuntimeError: If a required file cannot be downloaded.
    """
    model_info = MODELS[model_name]
    key = model_info["key"]

    # Create unique filenames with key hash
    key_hash = hashlib.md5(key.encode()).hexdigest()[:8]
    model_path = os.path.join(CACHE_DIR, f"best_model_{key}_{key_hash}.pth")
    config_path = os.path.join(CACHE_DIR, f"config_{key}_{key_hash}.json")

    targets = (
        ("model", model_info["model_url"], model_path),
        ("config", model_info["config_url"], config_path),
    )
    for label, url, path in targets:
        # A zero-byte file is the residue of an earlier failed download, not a
        # usable cache entry, so it is fetched again.
        if os.path.isfile(path) and os.path.getsize(path) > 0:
            print(f"Using cached {label}: {path}")
        else:
            print(f"Downloading {label} for {model_name}...")
            _fetch_to_cache(url, path)

    return model_path, config_path

def inference(text, model_name):
    """Synthesize ``text`` with the selected model and return the audio path.

    Results are cached in ``OUTPUT_DIR`` under the MD5 of the text combined
    with the model's ``"key"``, so repeating a request reuses the existing
    file instead of running the ``tts`` command again.

    Args:
        text: The text to speak. Must contain at least one non-whitespace
            character.
        model_name: A key of ``MODELS``.

    Returns:
        Path of the WAV file containing the synthesized speech.

    Raises:
        ValueError: If ``text`` is empty, whitespace-only or ``None``.
        KeyError: If ``model_name`` is not in ``MODELS``.
        RuntimeError: If the ``tts`` command does not produce an audio file.
    """
    import uuid  # local import: only needed for the temporary file name

    # Reject empty input before downloading anything or spawning a process.
    if not text or not text.strip():
        raise ValueError("Input text is empty; nothing to synthesize.")

    model_info = MODELS[model_name]
    model_path, config_path = download_model(model_name)

    # Create md5 hash from text + model key
    md5_hash = hashlib.md5((text + model_info["key"]).encode()).hexdigest()
    output_file = os.path.join(OUTPUT_DIR, f"{md5_hash}.wav")

    # If a usable file already exists, return it. An empty file is not a
    # valid result and is regenerated.
    if os.path.isfile(output_file) and os.path.getsize(output_file) > 0:
        print(f"Cache hit: {output_file}")
        return output_file

    # Otherwise synthesize new audio. The command writes to a uniquely named
    # temporary file that is moved into place only after it has been checked,
    # so a failed or interrupted run can never turn into a cache hit.
    print(f"Cache miss: synthesizing {output_file}")
    tmp_file = os.path.join(OUTPUT_DIR, f"{md5_hash}.{uuid.uuid4().hex}.tmp.wav")
    cmd = [
        "tts",
        # Single-token `--text=<value>` form keeps text that starts with "-"
        # from being parsed as another command-line option.
        f"--text={text}",
        "--model_path", model_path,
        "--config_path", config_path,
        "--out_path", tmp_file,
    ]
    try:
        status = run_cmd(cmd)
        produced = os.path.isfile(tmp_file) and os.path.getsize(tmp_file) > 0
        # run_cmd may not report an exit status (None); when it does, any
        # non-zero value is a failure. A missing or empty file always is.
        if status not in (None, 0) or not produced:
            raise RuntimeError(
                f"Speech synthesis failed for model {model_name!r}; "
                "no audio file was produced."
            )
        os.replace(tmp_file, output_file)
    finally:
        # After a successful replace the temporary name no longer exists;
        # on any failure path this removes the leftover partial file.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    return output_file

# Gradio UI
# Two inputs (free text and a model selector defaulting to the first entry of
# MODELS) and one audio output that is fed the file path returned by
# `inference`.
inputs = [
    gr.Textbox(label="Input Text", lines=5),
    gr.Dropdown(
        label="Select Model",
        choices=list(MODELS),
        value=next(iter(MODELS)),
    ),
]
outputs = gr.Audio(label="Output Audio", type="filepath")

title = "Multi-Language Coqui VITS TTS"
# Written as explicit pieces so the two trailing spaces (a markdown hard line
# break) at the end of the first sentence are visible and cannot be stripped
# by an editor.
description = (
    "\n"
    "Choose between **US English (LJspeech)**, **Korean (KSS)**, "
    "**MinNan Hokkien (SuiSiann)**, **Uyghur (UQSpeech)**, and "
    "**Slovak (Female)** VITS models to synthesize speech from text.  \n"
    "Powered by [Coqui TTS](https://github.com/coqui-ai/TTS).\n"
)

# One example row per model, in registry order: [sample text, model label].
examples = [[spec["default_text"], label] for label, spec in MODELS.items()]

demo = gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    examples=examples,
    allow_flagging="never",
    live=False,
)
demo.launch()