"""Gradio demo: multi-language speech synthesis with Coqui VITS models.

The selected model and its config are downloaded once into ``CACHE_DIR``;
synthesized audio is cached in ``OUTPUT_DIR`` under a hash of the text and the
model key, so repeating a request returns the existing file.
"""

import contextlib
import hashlib
import os
import sys
import tempfile
from subprocess import call

import gradio as gr

# Install dependencies.
# Run pip through the current interpreter so the package lands in the same
# environment that is running this app. A failure is reported but not fatal,
# because the package may already be installed.
PIP_REQUIREMENT = "coqui-tts-pygoruut==0.27.4"
_pip_status = call([sys.executable, "-m", "pip", "install", PIP_REQUIREMENT])
if _pip_status != 0:
    print(f"Warning: installing {PIP_REQUIREMENT} exited with status {_pip_status}")

# Model configuration with unique keys
MODELS = {
    "US English (VITS LJspeech)": {
        "model_url": "https://huggingface.co/neurlang/coqui-vits-ljspeech-us-english/resolve/main/best_model.pth",
        "config_url": "https://huggingface.co/neurlang/coqui-vits-ljspeech-us-english/resolve/main/config.json",
        "key": "ljspeech_us",
        "default_text": "Hello world, this is an English TTS example.",
    },
    "Korean (VITS KSS)": {
        "model_url": "https://huggingface.co/neurlang/coqui-vits-kss-korean/resolve/main/best_model.pth",
        "config_url": "https://huggingface.co/neurlang/coqui-vits-kss-korean/resolve/main/config.json",
        "key": "kss_ko",
        "default_text": "안녕하세요 세상, 이것은 한국어 TTS 예제입니다.",
    },
    "MinNan Hokkien (VITS SuiSiann)": {
        "model_url": "https://huggingface.co/neurlang/coqui-vits-suisiann-minnan-hokkien/resolve/main/best_model.pth",
        "config_url": "https://huggingface.co/neurlang/coqui-vits-suisiann-minnan-hokkien/resolve/main/config.json",
        "key": "suisiann_minnan_hokkien",
        "default_text": "你好!我是蔡贏。我的人在台北。我閣好笑你會幫參。",
    },
    "Uyghur (VITS UQSpeech)": {
        "model_url": "https://huggingface.co/neurlang/coqui-vits-uqspeech-uyghur/resolve/main/best_model.pth",
        "config_url": "https://huggingface.co/neurlang/coqui-vits-uqspeech-uyghur/resolve/main/config.json",
        "key": "uqspeech_ug",
        "default_text": "بۈگۈن ھاۋا ئىنتايىن ياخشى بولۇپ، كۈن نۇرى چاقناۋاتدۇ.",
    },
    "Slovak (VITS Female)": {
        "model_url": "https://huggingface.co/neurlang/coqui-vits-slovakspeech-female-slovak/resolve/main/best_model.pth",
        "config_url": "https://huggingface.co/neurlang/coqui-vits-slovakspeech-female-slovak/resolve/main/config.json",
        "key": "slovakspeech_sk",
        "default_text": "Ahoj svet, toto je príklad syntézy reči.",
    },
}

CACHE_DIR = "model_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)


def run_cmd(command):
    """Run *command* (an argv list) and return its exit status.

    The command is echoed to stdout before it starts. A ``KeyboardInterrupt``
    while waiting terminates the whole program with status 1.
    """
    try:
        print("Running:", " ".join(command))
        return call(command)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)


def _is_nonempty_file(path):
    """Return True when *path* is an existing regular file with content."""
    return os.path.isfile(path) and os.path.getsize(path) > 0


def _discard(path):
    """Delete *path*, ignoring the case where it is already gone."""
    with contextlib.suppress(OSError):
        os.remove(path)


def _reserve_temp_path(directory, suffix):
    """Create an empty, uniquely named file in *directory* and return its path.

    Writing to a unique scratch file and renaming it afterwards keeps
    half-written results out of the caches, even with concurrent requests.
    """
    handle, path = tempfile.mkstemp(suffix=suffix, dir=directory)
    os.close(handle)
    return path


def _fetch(url, destination):
    """Download *url* to *destination* with wget, atomically.

    Raises:
        RuntimeError: if wget cannot be started, exits with a non-zero status,
            or produces an empty file. Nothing is left at *destination* then.
    """
    scratch = _reserve_temp_path(os.path.dirname(destination) or ".", ".part")
    try:
        status = run_cmd(["wget", "-q", url, "-O", scratch])
    except OSError as err:
        _discard(scratch)
        raise RuntimeError(f"Could not run wget: {err}") from err
    if status != 0 or not _is_nonempty_file(scratch):
        # `wget -O` creates the target even on failure; never keep that file.
        _discard(scratch)
        raise RuntimeError(f"Download failed (wget status {status}): {url}")
    os.replace(scratch, destination)


def download_model(model_name):
    """Ensure the checkpoint and config of *model_name* are cached locally.

    Args:
        model_name: A key of ``MODELS``.

    Returns:
        A ``(model_path, config_path)`` tuple of files inside ``CACHE_DIR``.

    Raises:
        KeyError: if *model_name* is not in ``MODELS``.
        RuntimeError: if a missing file cannot be downloaded.
    """
    model_info = MODELS[model_name]

    # Create unique filenames with key hash
    key_hash = hashlib.md5(model_info["key"].encode()).hexdigest()[:8]
    model_path = os.path.join(CACHE_DIR, f"best_model_{model_info['key']}_{key_hash}.pth")
    config_path = os.path.join(CACHE_DIR, f"config_{model_info['key']}_{key_hash}.json")

    # Download only if missing
    if not os.path.exists(model_path):
        print(f"Downloading model for {model_name}...")
        _fetch(model_info["model_url"], model_path)
    else:
        print(f"Using cached model: {model_path}")

    if not os.path.exists(config_path):
        print(f"Downloading config for {model_name}...")
        _fetch(model_info["config_url"], config_path)
    else:
        print(f"Using cached config: {config_path}")

    return model_path, config_path


def inference(text, model_name):
    """Synthesize *text* with the chosen model and return the wav file path.

    Args:
        text: The text to speak.
        model_name: A key of ``MODELS`` (the dropdown selection).

    Returns:
        Path of a wav file inside ``OUTPUT_DIR``.

    Raises:
        gr.Error: for an unknown model, blank text, a failed model download,
            or a failed synthesis run; Gradio shows the message to the user.
    """
    if model_name not in MODELS:
        raise gr.Error(f"Unknown model: {model_name!r}")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    model_info = MODELS[model_name]
    try:
        model_path, config_path = download_model(model_name)
    except RuntimeError as err:
        raise gr.Error(str(err)) from err

    # Create md5 hash from text + model key
    md5_hash = hashlib.md5((text + model_info["key"]).encode()).hexdigest()
    output_file = os.path.join(OUTPUT_DIR, f"{md5_hash}.wav")

    # If file already exists, return it
    if os.path.exists(output_file):
        print(f"Cache hit: {output_file}")
        return output_file

    # Otherwise synthesize new audio
    print(f"Cache miss: synthesizing {output_file}")
    scratch = _reserve_temp_path(OUTPUT_DIR, ".wav")
    cmd = [
        "tts",
        # The `--text=value` form keeps text that starts with "-" from being
        # parsed as a command-line option.
        f"--text={text}",
        "--model_path", model_path,
        "--config_path", config_path,
        "--out_path", scratch,
    ]
    try:
        status = run_cmd(cmd)
    except OSError as err:
        _discard(scratch)
        raise gr.Error(f"Could not run the tts command: {err}") from err

    if status != 0 or not _is_nonempty_file(scratch):
        # Do not let a failed run become a permanent cache hit.
        _discard(scratch)
        raise gr.Error(f"Speech synthesis failed (tts exit status {status}).")

    os.replace(scratch, output_file)
    return output_file


# Gradio UI
model_names = list(MODELS)

inputs = [
    gr.Textbox(lines=5, label="Input Text"),
    gr.Dropdown(choices=model_names, label="Select Model", value=model_names[0]),
]

outputs = gr.Audio(type="filepath", label="Output Audio")

title = "Multi-Language Coqui VITS TTS"
description = """
Choose between **US English (LJspeech)**, **Korean (KSS)**, **MinNan Hokkien (SuiSiann)**, **Uyghur (UQSpeech)**, and **Slovak (Female)** VITS models to synthesize speech from text.
Powered by [Coqui TTS](https://github.com/coqui-ai/TTS).
"""

# One example row per model: its default text paired with the model name.
examples = [[info["default_text"], name] for name, info in MODELS.items()]

gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    examples=examples,
    allow_flagging="never",
    live=False,
).launch()