# VoiceClone-TTS / app.py
import os
import spaces
import torch
import torchaudio
import gradio as gr
from os import getenv
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict, supported_language_codes
device = "cuda"
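# Load both Zonos variants once at import time so the UI can switch models without reloading.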
MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
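# Inference only: freeze all parameters and put every model in eval mode.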
for model in MODELS.values():
    model.requires_grad_(False).eval()
def update_ui(model_choice):
    """
    Dynamically show/hide UI elements based on the model's conditioners.
    We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
    """
    model = MODELS[model_choice]
    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
    print("Conditioners in this model:", cond_names)
    text_update = gr.update(visible=("espeak" in cond_names))
    language_update = gr.update(visible=("espeak" in cond_names))
    speaker_audio_update = gr.update(visible=("speaker" in cond_names))
    prefix_audio_update = gr.update(visible=True)
    emotion1_update = gr.update(visible=("emotion" in cond_names))
    emotion2_update = gr.update(visible=("emotion" in cond_names))
    emotion3_update = gr.update(visible=("emotion" in cond_names))
    emotion4_update = gr.update(visible=("emotion" in cond_names))
    emotion5_update = gr.update(visible=("emotion" in cond_names))
    emotion6_update = gr.update(visible=("emotion" in cond_names))
    emotion7_update = gr.update(visible=("emotion" in cond_names))
    emotion8_update = gr.update(visible=("emotion" in cond_names))
    vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
    fmax_slider_update = gr.update(visible=("fmax" in cond_names))
    pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
    speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
    dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
    speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
    unconditional_keys_update = gr.update(
        choices=[name for name in cond_names if name not in ("espeak", "language_id")]
    )
    return (
        text_update,
        language_update,
        speaker_audio_update,
        prefix_audio_update,
        emotion1_update,
        emotion2_update,
        emotion3_update,
        emotion4_update,
        emotion5_update,
        emotion6_update,
        emotion7_update,
        emotion8_update,
        vq_single_slider_update,
        fmax_slider_update,
        pitch_std_slider_update,
        speaking_rate_slider_update,
        dnsmos_slider_update,
        speaker_noised_checkbox_update,
        unconditional_keys_update,
    )
@spaces.GPU(duration=120)
def generate_audio(
    model_choice,
    text,
    language,
    speaker_audio,
    prefix_audio,
    e1,
    e2,
    e3,
    e4,
    e5,
    e6,
    e7,
    e8,
    vq_single,
    fmax,
    pitch_std,
    speaking_rate,
    dnsmos_ovrl,
    speaker_noised,
    cfg_scale,
    min_p,
    seed,
    randomize_seed,
    unconditional_keys,
    progress=gr.Progress(),
):
    """
    Generates audio based on the provided UI parameters.
    We do NOT use language_id or ctc_loss even if the model has them.
    """
    selected_model = MODELS[model_choice]
    speaker_noised_bool = bool(speaker_noised)
    fmax = float(fmax)
    pitch_std = float(pitch_std)
    speaking_rate = float(speaking_rate)
    dnsmos_ovrl = float(dnsmos_ovrl)
    cfg_scale = float(cfg_scale)
    min_p = float(min_p)
    seed = int(seed)
    max_new_tokens = 86 * 30
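    # Seeding: optionally draw a fresh random seed, then seed torch so sampling is reproducible.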
    if randomize_seed:
        seed = torch.randint(0, 2**32 - 1, (1,)).item()
    torch.manual_seed(seed)
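    # Voice cloning: build a speaker embedding from the reference audio,
    # unless "speaker" has been marked as unconditional.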
    speaker_embedding = None
    if speaker_audio is not None and "speaker" not in unconditional_keys:
        wav, sr = torchaudio.load(speaker_audio)
        speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
        speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
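    # Optional audio prefix: downmix to mono, resample to the autoencoder's rate,
    # and encode it so generation continues from that audio.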
    audio_prefix_codes = None
    if prefix_audio is not None:
        wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
        wav_prefix = wav_prefix.mean(0, keepdim=True)
        wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
        wav_prefix = wav_prefix.to(device, dtype=torch.float32)
        with torch.autocast(device, dtype=torch.float32):
            audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
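    # Assemble the conditioning: eight emotion weights (in the same order as the UI sliders),
    # the clarity target repeated across the eight VQ scores, and the scalar prosody/quality controls.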
    emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)
    vq_val = float(vq_single)
    vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)
    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker_embedding,
        emotion=emotion_tensor,
        vqscore_8=vq_tensor,
        fmax=fmax,
        pitch_std=pitch_std,
        speaking_rate=speaking_rate,
        dnsmos_ovrl=dnsmos_ovrl,
        speaker_noised=speaker_noised_bool,
        device=device,
        unconditional_keys=unconditional_keys,
    )
    conditioning = selected_model.prepare_conditioning(cond_dict)
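    # Progress estimate only: duration is guessed from the text length, and the model
    # emits roughly 86 codec frames per second (hence max_new_tokens = 86 * 30 above).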
    estimated_generation_duration = 30 * len(text) / 400
    estimated_total_steps = int(estimated_generation_duration * 86)
    def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
        progress((step, estimated_total_steps))
        return True
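    # Autoregressive generation with classifier-free guidance and min-p sampling;
    # the callback reports each step to the Gradio progress bar.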
    codes = selected_model.generate(
        prefix_conditioning=conditioning,
        audio_prefix_codes=audio_prefix_codes,
        max_new_tokens=max_new_tokens,
        cfg_scale=cfg_scale,
        batch_size=1,
        sampling_params=dict(min_p=min_p),
        callback=update_progress,
    )
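    # Decode the generated codes back to a waveform and return mono audio
    # at the autoencoder's sample rate, along with the seed that was used.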
    wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
    sr_out = selected_model.autoencoder.sampling_rate
    if wav_out.dim() == 2 and wav_out.size(0) > 1:
        wav_out = wav_out[0:1, :]
    return (sr_out, wav_out.squeeze().numpy()), seed
# Custom CSS for pastel gradient background and enhanced UI
custom_css = """
.gradio-container {
    background: linear-gradient(135deg, #f3e7ff, #e6f0ff, #ffe6f2, #e6fff9);
    background-size: 400% 400%;
    animation: gradient 15s ease infinite;
}
@keyframes gradient {
    0% {
        background-position: 0% 50%;
    }
    50% {
        background-position: 100% 50%;
    }
    100% {
        background-position: 0% 50%;
    }
}
.container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
}
.panel {
    background-color: rgba(255, 255, 255, 0.7);
    border-radius: 16px;
    padding: 20px;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
    margin-bottom: 16px;
    backdrop-filter: blur(5px);
    transition: all 0.3s ease;
}
.panel:hover {
    box-shadow: 0 6px 16px rgba(0, 0, 0, 0.12);
    transform: translateY(-2px);
}
.title {
    font-size: 1.2em;
    font-weight: 600;
    margin-bottom: 12px;
    color: #6a3ea1;
    border-bottom: 2px solid #f0e6ff;
    padding-bottom: 8px;
}
.slider-container {
    background-color: rgba(255, 255, 255, 0.5);
    border-radius: 10px;
    padding: 10px;
    margin: 5px 0;
}
/* Make sliders more appealing */
input[type=range] {
    height: 5px;
    appearance: none;
    width: 100%;
    border-radius: 3px;
    background: linear-gradient(90deg, #9c83e0, #83b1e0);
}
.generate-button {
    background: linear-gradient(90deg, #a673ff, #7c4dff);
    color: white;
    border: none;
    border-radius: 8px;
    padding: 12px 24px;
    font-size: 16px;
    font-weight: 500;
    cursor: pointer;
    transition: all 0.3s ease;
    box-shadow: 0 4px 10px rgba(124, 77, 255, 0.2);
    display: block;
    width: 100%;
    margin: 20px 0;
}
.generate-button:hover {
    background: linear-gradient(90deg, #9c5eff, #6a3aff);
    box-shadow: 0 6px 15px rgba(124, 77, 255, 0.3);
    transform: translateY(-2px);
}
/* Tabs styling */
.tabs {
    display: flex;
    border-bottom: 1px solid #e0e0e0;
    margin-bottom: 20px;
}
.tab {
    padding: 10px 20px;
    cursor: pointer;
    transition: all 0.3s ease;
    background-color: transparent;
    border: none;
    color: #666;
}
.tab.active {
    color: #7c4dff;
    border-bottom: 3px solid #7c4dff;
    font-weight: 600;
}
/* Emotion sliders container */
.emotion-grid {
    display: grid;
    grid-template-columns: repeat(4, 1fr);
    gap: 12px;
}
/* Header styling */
.app-header {
    text-align: center;
    margin-bottom: 25px;
}
.app-header h1 {
    font-size: 2.5em;
    color: #6a3ea1;
    margin-bottom: 8px;
    font-weight: 700;
}
.app-header p {
    font-size: 1.1em;
    color: #666;
    margin-bottom: 20px;
}
/* Audio player styling */
.audio-output {
    margin-top: 20px;
}
/* Make output area more prominent */
.output-container {
    background-color: rgba(255, 255, 255, 0.85);
    border-radius: 16px;
    padding: 24px;
    box-shadow: 0 8px 18px rgba(0, 0, 0, 0.1);
    margin-top: 20px;
}
"""
def build_interface():
    # Build interface with enhanced visual elements and layout
    with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
        gr.HTML(
            """
            <div class='container' style='display:flex; justify-content:center; gap:12px;'>
                <a href="https://huggingface.co/spaces/openfree/Best-AI" target="_blank">
                    <img src="https://img.shields.io/static/v1?label=OpenFree&message=BEST%20AI%20Services&color=%230000ff&labelColor=%23000080&logo=huggingface&logoColor=%23ffa500&style=for-the-badge" alt="OpenFree badge">
                </a>
                <a href="https://discord.gg/openfreeai" target="_blank">
                    <img src="https://img.shields.io/static/v1?label=Discord&message=Openfree%20AI&color=%230000ff&labelColor=%23800080&logo=discord&logoColor=white&style=for-the-badge" alt="Discord badge">
                </a>
            </div>
            """
        )
        # Header section
        with gr.Column(elem_classes="app-header"):
            gr.Markdown("# ✨ Zonos Text-to-Speech Generator ✨")
            gr.Markdown("Create natural-sounding speech with customizable voice characteristics")
        # Main content container
        with gr.Column(elem_classes="container"):
            # First panel - Text & Model Selection
            with gr.Column(elem_classes="panel"):
                gr.Markdown('<div class="title">💬 Text & Model Configuration</div>')
                with gr.Row():
                    with gr.Column(scale=2):
                        model_choice = gr.Dropdown(
                            choices=MODEL_NAMES,
                            value="Zyphra/Zonos-v0.1-transformer",
                            label="Zonos Model Type",
                            info="Select the model variant to use.",
                        )
                        text = gr.Textbox(
                            label="Text to Synthesize",
                            value="Zonos uses eSpeak for text to phoneme conversion!",
                            lines=4,
                            max_length=500,
                        )
                        language = gr.Dropdown(
                            choices=supported_language_codes,
                            value="en-us",
                            label="Language Code",
                            info="Select a language code.",
                        )
                    with gr.Column(scale=1):
                        prefix_audio = gr.Audio(
                            value="assets/silence_100ms.wav",
                            label="Optional Prefix Audio (continue from this audio)",
                            type="filepath",
                        )
            # Second panel - Voice Characteristics
            with gr.Column(elem_classes="panel"):
                gr.Markdown('<div class="title">🎤 Voice Characteristics</div>')
                with gr.Row():
                    with gr.Column(scale=1):
                        speaker_audio = gr.Audio(
                            label="Optional Speaker Audio (for voice cloning)",
                            type="filepath",
                        )
                        speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)
                    with gr.Column(scale=2):
                        with gr.Row():
                            with gr.Column():
                                dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="Voice Quality", elem_classes="slider-container")
                                fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Frequency Max (Hz)", elem_classes="slider-container")
                                vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="Voice Clarity", elem_classes="slider-container")
                            with gr.Column():
                                pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Variation", elem_classes="slider-container")
                                speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", elem_classes="slider-container")
            # Third panel - Generation Parameters
            with gr.Column(elem_classes="panel"):
                gr.Markdown('<div class="title">⚙️ Generation Parameters</div>')
                with gr.Row():
                    with gr.Column():
                        cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="Guidance Scale", elem_classes="slider-container")
                        min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P (Randomness)", elem_classes="slider-container")
                    with gr.Column():
                        seed_number = gr.Number(label="Seed", value=420, precision=0)
                        randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
            # Emotion Panel (collapsible accordion)
            with gr.Accordion("🎭 Emotion Settings", open=False, elem_classes="panel"):
                gr.Markdown(
                    "Adjust these sliders to control the emotional tone of the generated speech.\n"
                    "For a neutral voice, keep 'Neutral' high and other emotions low."
                )
                with gr.Row(elem_classes="emotion-grid"):
                    emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness", elem_classes="slider-container")
                    emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness", elem_classes="slider-container")
                    emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust", elem_classes="slider-container")
                    emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear", elem_classes="slider-container")
                with gr.Row(elem_classes="emotion-grid"):
                    emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise", elem_classes="slider-container")
                    emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger", elem_classes="slider-container")
                    emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other", elem_classes="slider-container")
                    emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral", elem_classes="slider-container")
            # Advanced Settings Panel
            with gr.Accordion("⚡ Advanced Settings", open=False, elem_classes="panel"):
                gr.Markdown(
                    "### Unconditional Toggles\n"
                    "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
                    'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
                )
                unconditional_keys = gr.CheckboxGroup(
                    [
                        "speaker",
                        "emotion",
                        "vqscore_8",
                        "fmax",
                        "pitch_std",
                        "speaking_rate",
                        "dnsmos_ovrl",
                        "speaker_noised",
                    ],
                    value=["emotion"],
                    label="Unconditional Keys",
                )
            # Generate Button and Output Area
            with gr.Column(elem_classes="panel output-container"):
                gr.Markdown('<div class="title">🔊 Generate & Output</div>')
                generate_button = gr.Button("Generate Audio", elem_classes="generate-button")
                output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True, elem_classes="audio-output")
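        # When the selected model changes, re-run update_ui so only the controls
        # backed by that model's conditioners stay visible.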
        model_choice.change(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                unconditional_keys,
            ],
        )
        # On page load, trigger the same UI refresh
        demo.load(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                unconditional_keys,
            ],
        )
        # Generate audio on button click
        generate_button.click(
            fn=generate_audio,
            inputs=[
                model_choice,
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                cfg_scale_slider,
                min_p_slider,
                seed_number,
                randomize_seed_toggle,
                unconditional_keys,
            ],
            outputs=[output_audio, seed_number],
        )
    return demo
if __name__ == "__main__":
    demo = build_interface()
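    # GRADIO_SHARE ("true"/"1"/"t", case-insensitive) controls whether a public share link is created.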
    share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
    demo.launch(server_name="0.0.0.0", server_port=7860, share=share)