coqui-tts / app.py
neurlang's picture
Update app.py
49e761b verified
import os
import sys
import hashlib
import gradio as gr
from subprocess import call
# Install dependencies at startup.
# pip is run through the current interpreter (``python -m pip``) so the
# package is installed into the environment this app is actually running in,
# and the command is passed as an argument list rather than a shell string.
# A failed install is reported but deliberately not fatal.
_pip_status = call(
    [sys.executable, "-m", "pip", "install", "coqui-tts-pygoruut==0.27.4"]
)
if _pip_status != 0:
    print(
        f"Warning: pip install exited with status {_pip_status}",
        file=sys.stderr,
    )
# Model configuration with unique keys


def _model_entry(repo, key, default_text):
    """Build one MODELS record for a ``neurlang`` Hugging Face repository.

    The checkpoint and the config are both addressed under the repository's
    ``resolve/main`` path.
    """
    base_url = f"https://huggingface.co/neurlang/{repo}/resolve/main"
    return {
        "model_url": f"{base_url}/best_model.pth",
        "config_url": f"{base_url}/config.json",
        "key": key,
        "default_text": default_text,
    }


# Display name -> download URLs, unique cache key and a sample sentence.
MODELS = {
    "US English (VITS LJspeech)": _model_entry(
        "coqui-vits-ljspeech-us-english",
        "ljspeech_us",
        "Hello world, this is an English TTS example.",
    ),
    "Korean (VITS KSS)": _model_entry(
        "coqui-vits-kss-korean",
        "kss_ko",
        "안녕하세요 세상, 이것은 한국어 TTS 예제입니다.",
    ),
    "MinNan Hokkien (VITS SuiSiann)": _model_entry(
        "coqui-vits-suisiann-minnan-hokkien",
        "suisiann_minnan_hokkien",
        "你好!我是蔡贏。我的人在台北。我閣好笑你會幫參。",
    ),
    "Uyghur (VITS UQSpeech)": _model_entry(
        "coqui-vits-uqspeech-uyghur",
        "uqspeech_ug",
        "بۈگۈن ھاۋا ئىنتايىن ياخشى بولۇپ، كۈن نۇرى چاقناۋاتدۇ.",
    ),
    "Slovak (VITS Female)": _model_entry(
        "coqui-vits-slovakspeech-female-slovak",
        "slovakspeech_sk",
        "Ahoj svet, toto je príklad syntézy reči.",
    ),
}
# Working directories, relative to the current working directory:
# CACHE_DIR holds downloaded checkpoints/configs, OUTPUT_DIR holds
# synthesized audio files.
CACHE_DIR = "model_cache"
OUTPUT_DIR = "outputs"
for _directory in (CACHE_DIR, OUTPUT_DIR):
    # exist_ok keeps a restart from failing when the folder already exists.
    os.makedirs(_directory, exist_ok=True)
def run_cmd(command):
    """Run an external command and report how it exited.

    Args:
        command: Program and arguments as a list of strings. It is executed
            directly, without a shell.

    Returns:
        The child's exit status (0 means success). Callers that do not care
        may ignore it.

    A Ctrl-C received while the child is running ends this process with
    exit status 1.
    """
    print("Running:", " ".join(command))
    try:
        status = call(command)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)
    if status != 0:
        # Surface failures in the log instead of dropping them silently.
        print(f"Command exited with status {status}")
    return status
def _download_file(url, dest_path):
    """Fetch ``url`` with wget into ``dest_path``, publishing it only on success."""
    # wget creates its -O target even when the transfer fails, so download to
    # a side file first and move it into place only once it is complete.
    partial_path = dest_path + ".part"
    status = call(["wget", "-q", url, "-O", partial_path])
    downloaded = os.path.isfile(partial_path) and os.path.getsize(partial_path) > 0
    if status != 0 or not downloaded:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise RuntimeError(f"Download failed (wget exit status {status}): {url}")
    os.replace(partial_path, dest_path)


def download_model(model_name):
    """Return local paths of the checkpoint and config for ``model_name``.

    Files are stored in CACHE_DIR under names derived from the model's
    ``key`` and are downloaded only when no usable cached copy exists.

    Args:
        model_name: A display name that is a key of MODELS.

    Returns:
        A ``(model_path, config_path)`` tuple.

    Raises:
        KeyError: ``model_name`` is not in MODELS.
        RuntimeError: A download failed or produced an empty file.
    """
    model_info = MODELS[model_name]
    key = model_info["key"]
    # Create unique filenames with key hash
    key_hash = hashlib.md5(key.encode()).hexdigest()[:8]
    model_path = os.path.join(CACHE_DIR, f"best_model_{key}_{key_hash}.pth")
    config_path = os.path.join(CACHE_DIR, f"config_{key}_{key_hash}.json")

    targets = (
        ("model", model_info["model_url"], model_path),
        ("config", model_info["config_url"], config_path),
    )
    for label, url, path in targets:
        # A zero-byte file is the leftover of an earlier failed download,
        # not a usable cache entry, so it is fetched again.
        if os.path.isfile(path) and os.path.getsize(path) > 0:
            print(f"Using cached {label}: {path}")
        else:
            print(f"Downloading {label} for {model_name}...")
            _download_file(url, path)
    return model_path, config_path
def inference(text, model_name):
    """Synthesize ``text`` with the selected model and return the WAV path.

    Results are cached in OUTPUT_DIR under the MD5 of the text plus the
    model's key, so repeating a request returns the existing file.

    Args:
        text: The sentence(s) to speak.
        model_name: A display name that is a key of MODELS.

    Returns:
        Path of the WAV file holding the synthesized audio.

    Raises:
        KeyError: ``model_name`` is not in MODELS.
        ValueError: ``text`` is empty or only whitespace.
        RuntimeError: The ``tts`` command produced no audio.
    """
    import tempfile

    if not text or not text.strip():
        raise ValueError("Input text is empty")

    model_info = MODELS[model_name]
    model_path, config_path = download_model(model_name)

    # Create md5 hash from text + model key
    md5_hash = hashlib.md5((text + model_info["key"]).encode()).hexdigest()
    output_file = os.path.join(OUTPUT_DIR, f"{md5_hash}.wav")

    # If file already exists, return it
    if os.path.exists(output_file):
        print(f"Cache hit: {output_file}")
        return output_file

    # Otherwise synthesize new audio
    print(f"Cache miss: synthesizing {output_file}")
    # Write to a scratch file and publish it only once synthesis succeeded;
    # otherwise a failed or interrupted run would leave a broken file that
    # every later request treats as a cache hit.
    fd, scratch_file = tempfile.mkstemp(suffix=".wav", dir=OUTPUT_DIR)
    os.close(fd)
    try:
        run_cmd([
            "tts",
            "--text", text,
            "--model_path", model_path,
            "--config_path", config_path,
            "--out_path", scratch_file,
        ])
        if not os.path.exists(scratch_file) or os.path.getsize(scratch_file) == 0:
            raise RuntimeError(f"Speech synthesis failed for model '{model_name}'")
        os.replace(scratch_file, output_file)
    finally:
        # Only present here if synthesis failed; never leave scratch files behind.
        if os.path.exists(scratch_file):
            os.remove(scratch_file)
    return output_file
# Gradio UI
# Input widgets: free text plus a model selector that defaults to the first
# entry of MODELS.
inputs = [
    gr.Textbox(lines=5, label="Input Text"),
    gr.Dropdown(
        choices=list(MODELS),
        label="Select Model",
        value=next(iter(MODELS)),
    ),
]
outputs = gr.Audio(type="filepath", label="Output Audio")

title = "Multi-Language Coqui VITS TTS"
description = """
Choose between **US English (LJspeech)**, **Korean (KSS)**, **MinNan Hokkien (SuiSiann)**, **Uyghur (UQSpeech)**, and **Slovak (Female)** VITS models to synthesize speech from text.
Powered by [Coqui TTS](https://github.com/coqui-ai/TTS).
"""

# One example row per model, in MODELS order: its sample sentence followed by
# its display name (matching the order of `inputs`).
examples = [
    [model_config["default_text"], display_name]
    for display_name, model_config in MODELS.items()
]

demo = gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    examples=examples,
    allow_flagging="never",
    live=False,
)
demo.launch()