Spaces:

neurlang
/

coqui-tts

Running

App Files Files Community

coqui-tts / app.py

neurlang

Update app.py

49e761b verified about 1 month ago

raw

history blame contribute delete

5.31 kB

	import os
	import sys
	import hashlib
	import gradio as gr
	from subprocess import call

	# Install dependencies
	# The package is installed at startup. Running pip through sys.executable
	# guarantees it lands in the interpreter that is executing this script,
	# rather than whichever "pip" happens to be first on PATH.
	_pip_status = call(
	    [sys.executable, "-m", "pip", "install", "coqui-tts-pygoruut==0.27.4"]
	)
	if _pip_status != 0:
	    # Stay best-effort (the package may already be present), but make the
	    # failure visible instead of silently discarding the exit status.
	    print(f"Warning: pip install exited with status {_pip_status}")

	# Model configuration with unique keys
	def _model_entry(repo, key, default_text):
	    """Build one MODELS record for a checkpoint hosted under the neurlang org.

	    Args:
	        repo: Repository name on huggingface.co/neurlang.
	        key: Short unique identifier stored under "key".
	        default_text: Sample sentence stored under "default_text".

	    Returns:
	        A dict with "model_url", "config_url", "key" and "default_text".
	    """
	    root = f"https://huggingface.co/neurlang/{repo}/resolve/main"
	    return {
	        "model_url": f"{root}/best_model.pth",
	        "config_url": f"{root}/config.json",
	        "key": key,
	        "default_text": default_text,
	    }


	# Display name -> download locations, unique key and sample text.
	MODELS = {
	    "US English (VITS LJspeech)": _model_entry(
	        "coqui-vits-ljspeech-us-english",
	        "ljspeech_us",
	        "Hello world, this is an English TTS example.",
	    ),
	    "Korean (VITS KSS)": _model_entry(
	        "coqui-vits-kss-korean",
	        "kss_ko",
	        "안녕하세요 세상, 이것은 한국어 TTS 예제입니다.",
	    ),
	    "MinNan Hokkien (VITS SuiSiann)": _model_entry(
	        "coqui-vits-suisiann-minnan-hokkien",
	        "suisiann_minnan_hokkien",
	        "你好！我是蔡贏。我的人在台北。我閣好笑你會幫參。",
	    ),
	    "Uyghur (VITS UQSpeech)": _model_entry(
	        "coqui-vits-uqspeech-uyghur",
	        "uqspeech_ug",
	        "بۈگۈن ھاۋا ئىنتايىن ياخشى بولۇپ، كۈن نۇرى چاقناۋاتدۇ.",
	    ),
	    "Slovak (VITS Female)": _model_entry(
	        "coqui-vits-slovakspeech-female-slovak",
	        "slovakspeech_sk",
	        "Ahoj svet, toto je príklad syntézy reči.",
	    ),
	}

	# Working directories, relative to the current working directory:
	# downloaded model files go in CACHE_DIR, synthesized audio in OUTPUT_DIR.
	CACHE_DIR = "model_cache"
	OUTPUT_DIR = "outputs"
	for _workdir in (CACHE_DIR, OUTPUT_DIR):
	    # exist_ok makes repeated startups harmless.
	    os.makedirs(_workdir, exist_ok=True)

	def run_cmd(command):
	    """Run an external command and report how it exited.

	    The command is executed directly (no shell), so arguments are passed
	    through verbatim.

	    Args:
	        command: Sequence of argument strings, program name first.

	    Returns:
	        The exit status of the process (0 means success). Callers that
	        ignore the return value behave exactly as before.

	    Raises:
	        SystemExit: With status 1 if the run is interrupted from the keyboard.
	    """
	    try:
	        print("Running:", " ".join(command))
	        exit_code = call(command)
	    except KeyboardInterrupt:
	        print("Process interrupted")
	        sys.exit(1)
	    if exit_code != 0:
	        # Surface failures instead of discarding the exit status.
	        print(f"Command failed with exit code {exit_code}")
	    return exit_code

	def download_model(model_name):
	    """Make sure the checkpoint and config for *model_name* are cached locally.

	    Files are stored in CACHE_DIR under names derived from the model's
	    "key" plus a short md5 digest of that key. A file is downloaded only
	    when no non-empty copy is already present.

	    Args:
	        model_name: A key of MODELS.

	    Returns:
	        A ``(model_path, config_path)`` tuple of local file paths.

	    Raises:
	        KeyError: If *model_name* is not in MODELS.
	        RuntimeError: If wget reports a failed download.
	        OSError: If the wget executable cannot be started.
	    """
	    model_info = MODELS[model_name]

	    # Create unique filenames with key hash
	    key_hash = hashlib.md5(model_info["key"].encode()).hexdigest()[:8]
	    model_path = os.path.join(CACHE_DIR, f"best_model_{model_info['key']}_{key_hash}.pth")
	    config_path = os.path.join(CACHE_DIR, f"config_{model_info['key']}_{key_hash}.json")

	    # Download only if missing
	    if not _is_cached(model_path):
	        print(f"Downloading model for {model_name}...")
	        _fetch_to_cache(model_info["model_url"], model_path)
	    else:
	        print(f"Using cached model: {model_path}")

	    if not _is_cached(config_path):
	        print(f"Downloading config for {model_name}...")
	        _fetch_to_cache(model_info["config_url"], config_path)
	    else:
	        print(f"Using cached config: {config_path}")

	    return model_path, config_path


	def _is_cached(path):
	    """Return True if *path* is an existing, non-empty file."""
	    # An empty file is what a failed "wget -O" leaves behind; never trust it.
	    return os.path.isfile(path) and os.path.getsize(path) > 0


	def _fetch_to_cache(url, dest):
	    """Download *url* to *dest* with wget, publishing the file only on success."""
	    # wget -O creates its output file even when the transfer fails, so write
	    # to a sibling temp name and move it into place only after a clean exit.
	    partial = dest + ".part"
	    status = call(["wget", "-q", url, "-O", partial])
	    if status != 0:
	        if os.path.exists(partial):
	            os.remove(partial)
	        raise RuntimeError(f"wget exited with status {status} while downloading {url}")
	    os.replace(partial, dest)

	def inference(text, model_name):
	    """Synthesize *text* with the selected model and return the audio file path.

	    Results are cached in OUTPUT_DIR under the md5 digest of the text
	    concatenated with the model's "key", so repeating a request reuses the
	    existing file instead of running synthesis again.

	    Args:
	        text: Text to synthesize.
	        model_name: A key of MODELS.

	    Returns:
	        Path to a non-empty ``.wav`` file in OUTPUT_DIR.

	    Raises:
	        KeyError: If *model_name* is not in MODELS.
	        RuntimeError: If the tts command did not produce any audio.
	    """
	    model_info = MODELS[model_name]
	    model_path, config_path = download_model(model_name)

	    # Create md5 hash from text + model key
	    md5_hash = hashlib.md5((text + model_info["key"]).encode()).hexdigest()
	    output_file = os.path.join(OUTPUT_DIR, f"{md5_hash}.wav")

	    def has_audio():
	        # An empty file can be left behind by a failed run; it is not a result.
	        return os.path.isfile(output_file) and os.path.getsize(output_file) > 0

	    # If file already exists, return it
	    if has_audio():
	        print(f"Cache hit: {output_file}")
	        return output_file

	    # Otherwise synthesize new audio
	    print(f"Cache miss: synthesizing {output_file}")
	    cmd = [
	        "tts",
	        "--text", text,
	        "--model_path", model_path,
	        "--config_path", config_path,
	        "--out_path", output_file,
	    ]
	    run_cmd(cmd)

	    # Verify the result on disk rather than trusting the command: returning a
	    # path that does not exist would only fail later with a less clear error.
	    if not has_audio():
	        raise RuntimeError(f"Speech synthesis failed: no audio was written to {output_file}")
	    return output_file

	# Gradio UI
	# Model display names, in MODELS insertion order; the first one is preselected.
	_model_names = list(MODELS)

	inputs = [
	    gr.Textbox(lines=5, label="Input Text"),
	    gr.Dropdown(choices=_model_names, label="Select Model", value=_model_names[0]),
	]
	outputs = gr.Audio(type="filepath", label="Output Audio")

	title = "Multi-Language Coqui VITS TTS"
	description = """
	Choose between US English (LJspeech), Korean (KSS), MinNan Hokkien (SuiSiann), Uyghur (UQSpeech), and Slovak (Female) VITS models to synthesize speech from text.
	Powered by [Coqui TTS](https://github.com/coqui-ai/TTS).
	"""

	# One example row per model: [sample text, model display name].
	examples = [[info["default_text"], name] for name, info in MODELS.items()]

	# Build the interface around inference() and start serving it.
	demo = gr.Interface(
	    fn=inference,
	    inputs=inputs,
	    outputs=outputs,
	    title=title,
	    description=description,
	    examples=examples,
	    allow_flagging="never",
	    live=False,
	)
	demo.launch()