|
import os |
|
import sys |
|
import hashlib |
|
import gradio as gr |
|
from subprocess import call |
|
|
|
|
|
# Pinned TTS package installed at startup through the shell. The exit status
# of the install is not checked.
# NOTE(review): presumably this package provides the `tts` command that
# inference() runs -- confirm.
_TTS_REQUIREMENT = "coqui-tts-pygoruut==0.27.4"
os.system(f"pip install {_TTS_REQUIREMENT}")
|
|
|
|
|
# URL pattern shared by every checkpoint file referenced below.
_HF_RESOLVE_URL = "https://huggingface.co/{repo}/resolve/main/{filename}"


def _model_entry(repo, key, default_text):
    """Build one MODELS entry for a checkpoint stored in a Hugging Face repo.

    Args:
        repo: Repository id in ``owner/name`` form.
        key: Short identifier used in cache file names and output hashes.
        default_text: Sample sentence shown as the example for this model.

    Returns:
        Dict with ``model_url``, ``config_url``, ``key`` and ``default_text``.
    """
    return {
        "model_url": _HF_RESOLVE_URL.format(repo=repo, filename="best_model.pth"),
        "config_url": _HF_RESOLVE_URL.format(repo=repo, filename="config.json"),
        "key": key,
        "default_text": default_text,
    }


# Display name (as shown in the model dropdown) -> download URLs, cache key
# and sample text. Insertion order is the order shown in the UI.
MODELS = {
    "US English (VITS LJspeech)": _model_entry(
        "neurlang/coqui-vits-ljspeech-us-english",
        "ljspeech_us",
        "Hello world, this is an English TTS example.",
    ),
    "Korean (VITS KSS)": _model_entry(
        "neurlang/coqui-vits-kss-korean",
        "kss_ko",
        "안녕하세요 세상, 이것은 한국어 TTS 예제입니다.",
    ),
    "MinNan Hokkien (VITS SuiSiann)": _model_entry(
        "neurlang/coqui-vits-suisiann-minnan-hokkien",
        "suisiann_minnan_hokkien",
        "你好!我是蔡贏。我的人在台北。我閣好笑你會幫參。",
    ),
    "Uyghur (VITS UQSpeech)": _model_entry(
        "neurlang/coqui-vits-uqspeech-uyghur",
        "uqspeech_ug",
        "بۈگۈن ھاۋا ئىنتايىن ياخشى بولۇپ، كۈن نۇرى چاقناۋاتدۇ.",
    ),
    "Slovak (VITS Female)": _model_entry(
        "neurlang/coqui-vits-slovakspeech-female-slovak",
        "slovakspeech_sk",
        "Ahoj svet, toto je príklad syntézy reči.",
    ),
}
|
|
|
# Working directories, relative to the current working directory:
# CACHE_DIR holds downloaded checkpoints/configs, OUTPUT_DIR holds the
# synthesized .wav files.
CACHE_DIR = "model_cache"
OUTPUT_DIR = "outputs"

# Create both up front; exist_ok makes restarts harmless.
for _directory in (CACHE_DIR, OUTPUT_DIR):
    os.makedirs(_directory, exist_ok=True)
|
|
|
def run_cmd(command):
    """Run an external command and report how it finished.

    Args:
        command: Argument list (program followed by its arguments). It is
            handed to ``subprocess.call`` as-is, so no shell is involved.

    Returns:
        The command's exit status. Previously the status was discarded and
        ``None`` was returned; callers that ignore the result are unaffected.

    Raises:
        SystemExit: With status 1 if the run is interrupted by Ctrl-C.
    """
    print("Running:", " ".join(command))
    try:
        exit_code = call(command)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)

    # Surface failures in the log instead of dropping them silently.
    if exit_code != 0:
        print(f"Command exited with status {exit_code}")
    return exit_code
|
|
|
def _is_cached(path):
    """Return True if path exists and is non-empty.

    A zero-byte file is what a failed ``wget -O`` leaves behind, so it is
    treated as missing rather than as a valid cache entry.
    """
    return os.path.exists(path) and os.path.getsize(path) > 0


def _fetch_to_cache(url, dest_path):
    """Download url to dest_path, publishing the file only on success.

    The download goes to a ``.part`` sibling first and is renamed into place
    afterwards, so an interrupted or failed transfer never leaves a file at
    dest_path that later calls would mistake for a cached download.

    Raises:
        RuntimeError: If wget cannot be started, exits non-zero, or produces
            an empty file.
    """
    partial_path = dest_path + ".part"
    try:
        # Argument list instead of a shell string: nothing in the URL or
        # path can be interpreted by a shell.
        exit_code = call(["wget", "-q", url, "-O", partial_path])
    except OSError as err:
        raise RuntimeError(f"Could not run wget to download {url}") from err

    if exit_code != 0 or not _is_cached(partial_path):
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise RuntimeError(f"Download failed (wget exit status {exit_code}): {url}")

    os.replace(partial_path, dest_path)


def download_model(model_name):
    """Ensure the checkpoint and config for model_name are in CACHE_DIR.

    Args:
        model_name: A key of MODELS.

    Returns:
        Tuple ``(model_path, config_path)`` of local file paths.

    Raises:
        KeyError: If model_name is not a key of MODELS.
        RuntimeError: If a required file cannot be downloaded.
    """
    model_info = MODELS[model_name]
    key = model_info["key"]

    # File names keep the original "<key>_<first 8 hex digits of md5(key)>"
    # scheme so files cached by earlier runs are still found.
    key_hash = hashlib.md5(key.encode()).hexdigest()[:8]
    model_path = os.path.join(CACHE_DIR, f"best_model_{key}_{key_hash}.pth")
    config_path = os.path.join(CACHE_DIR, f"config_{key}_{key_hash}.json")

    targets = (
        ("model", model_info["model_url"], model_path),
        ("config", model_info["config_url"], config_path),
    )
    for label, url, path in targets:
        if _is_cached(path):
            print(f"Using cached {label}: {path}")
        else:
            print(f"Downloading {label} for {model_name}...")
            _fetch_to_cache(url, path)

    return model_path, config_path
|
|
|
def inference(text, model_name):
    """Synthesize text with the selected model and return the .wav path.

    Results are cached on disk: the output file name is the MD5 of the text
    concatenated with the model's key, so repeating a request returns the
    existing file without running the synthesizer again.

    Args:
        text: Text to synthesize.
        model_name: A key of MODELS.

    Returns:
        Path of the synthesized (or cached) .wav file inside OUTPUT_DIR.

    Raises:
        gr.Error: If text is empty/blank, or if the synthesizer did not
            produce an output file.
        KeyError: If model_name is not a key of MODELS.
    """
    # Reject blank input up front instead of handing it to the CLI.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    model_info = MODELS[model_name]
    model_path, config_path = download_model(model_name)

    md5_hash = hashlib.md5((text + model_info["key"]).encode()).hexdigest()
    output_file = os.path.join(OUTPUT_DIR, f"{md5_hash}.wav")

    if os.path.exists(output_file):
        print(f"Cache hit: {output_file}")
        return output_file

    print(f"Cache miss: synthesizing {output_file}")
    cmd = [
        "tts",
        "--text", text,
        "--model_path", model_path,
        "--config_path", config_path,
        "--out_path", output_file,
    ]
    run_cmd(cmd)

    # Do not hand back a path that does not exist: report the failure
    # explicitly so the UI shows a meaningful message.
    if not os.path.exists(output_file):
        raise gr.Error(f"Speech synthesis failed for model '{model_name}'.")
    return output_file
|
|
|
|
|
# Input widgets, in the same order as inference()'s parameters
# (text, model_name). The first model in MODELS is preselected.
model_names = list(MODELS)
inputs = [
    gr.Textbox(lines=5, label="Input Text"),
    gr.Dropdown(choices=model_names, label="Select Model", value=model_names[0]),
]

# inference() returns a file path, hence type="filepath".
outputs = gr.Audio(type="filepath", label="Output Audio")

title = "Multi-Language Coqui VITS TTS"
description = """
Choose between **US English (LJspeech)**, **Korean (KSS)**, **MinNan Hokkien (SuiSiann)**, **Uyghur (UQSpeech)**, and **Slovak (Female)** VITS models to synthesize speech from text.
Powered by [Coqui TTS](https://github.com/coqui-ai/TTS).
"""

# One example row per model, in MODELS order: [sample text, model name].
examples = [[info["default_text"], name] for name, info in MODELS.items()]

demo = gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    examples=examples,
    allow_flagging="never",
    live=False,
)
demo.launch()
|
|