# Install dependencies in application code, as we don't have access to a GPU at build time
# Thanks to https://huggingface.co/Steveeeeeeen for their code to handle this!
import os
import shlex
import subprocess

subprocess.run(
    shlex.split("pip install flash-attn --no-build-isolation"),
    env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=True,
)
subprocess.run(
    shlex.split(
        "pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
    ),
    check=True,
)
subprocess.run(
    shlex.split(
        "pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
    ),
    check=True,
)

import spaces
import gradio as gr
import numpy as np
from typing import Tuple, Dict, Any, Optional
from taproot import Task

# Configuration
is_hf_spaces = os.getenv("SYSTEM", "") == "spaces"
max_characters = 2000
header_markdown = """
# Zonos v0.1
State-of-the-art text-to-speech model [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f). [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)

## Unleashed
Use this space to generate long-form speech up to around two minutes in length. To generate audio of unlimited length, clone this space and run it locally.

### Tips
- If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
- When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
- The cleaner the speaker audio, the better the speaker conditioning will be. However, speaker audio is only sampled at 16kHz, so you do not need to provide high-bitrate speaker audio. Prefix audio, by contrast, should be high quality, as it is sampled at the full 44.1kHz.
- The appropriate ranges of Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
- Emotion sliders do not behave entirely intuitively and require some experimentation to achieve the desired effect.
""".strip()

# Create pipelines, downloading required files as necessary
hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
hybrid_task.download_required_files(text_callback=print)
hybrid_pipe = hybrid_task()
hybrid_pipe.load()

transformer_task = Task.get(
    "speech-synthesis", model="zonos-transformer", available_only=False
)
transformer_task.download_required_files(text_callback=print)
transformer_pipe = transformer_task()
if is_hf_spaces:
    # Must load all models on GPU when using ZERO
    transformer_pipe.load()

# Global state
pipelines = {
    "Zonos Transformer v0.1": transformer_pipe,
    "Zonos Hybrid v0.1": hybrid_pipe,
}
pipeline_names = list(pipelines.keys())
supported_language_codes = hybrid_pipe.supported_languages  # Same for both pipes


# Model toggle
def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
    """
    Dynamically show/hide UI elements based on the model's conditioners.
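
    Returns a tuple of gr.update(...) values in the order wired to this
    callback's outputs (VQScore, emotion, fmax, pitch, speaking rate, DNSMOS,
    speaker-noised checkbox), showing or hiding each conditioning control to
    match the selected model's conditioners.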
""" if not is_hf_spaces: # When not using ZERO, we can onload/offload pipes for pipeline_name, pipeline in pipelines.items(): if pipeline_name == pipeline_choice: pipeline.load() else: pipeline.unload() pipe = pipelines[pipeline_choice] cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners] vqscore_update = gr.update(visible=("vqscore_8" in cond_names)) emotion_update = gr.update(visible=("emotion" in cond_names)) fmax_update = gr.update(visible=("fmax" in cond_names)) pitch_update = gr.update(visible=("pitch_std" in cond_names)) speaking_rate_update = gr.update(visible=("speaking_rate" in cond_names)) dnsmos_update = gr.update(visible=("dnsmos_ovrl" in cond_names)) speaker_noised_update = gr.update(visible=("speaker_noised" in cond_names)) return ( vqscore_update, emotion_update, fmax_update, pitch_update, speaking_rate_update, dnsmos_update, speaker_noised_update, ) # Invocation method @spaces.GPU(duration=180) def generate_audio( pipeline_choice: str, text: str, language: str, speaker_audio: Optional[str], prefix_audio: Optional[str], e1: float, e2: float, e3: float, e4: float, e5: float, e6: float, e7: float, e8: float, vq_single: float, fmax: float, pitch_std: float, speaking_rate: float, dnsmos_ovrl: float, speaker_noised: bool, cfg_scale: float, min_p: float, seed: int, max_chunk_length: int, cross_fade_duration: float, punctuation_pause_duration: float, target_rms: float, randomize_seed: bool, skip_dnsmos: bool, skip_vqscore: bool, skip_fmax: bool, skip_pitch: bool, skip_speaking_rate: bool, skip_emotion: bool, skip_speaker: bool, progress=gr.Progress(), ) -> Tuple[Tuple[int, np.ndarray[Any, Any]], int]: """ Generates audio based on the provided UI parameters. """ selected_pipeline = pipelines[pipeline_choice] if randomize_seed: seed = np.random.randint(0, 2**32) def on_progress(step: int, total: int) -> None: progress((step, total)) selected_pipeline.on_progress(on_progress) try: wav_out = selected_pipeline( text=text, language=language, reference_audio=speaker_audio, prefix_audio=prefix_audio, seed=seed, max_chunk_length=max_chunk_length, cross_fade_duration=cross_fade_duration, punctuation_pause_duration=punctuation_pause_duration, target_rms=target_rms, cfg_scale=cfg_scale, min_p=min_p, fmax=fmax, pitch_std=pitch_std, emotion_happiness=e1, emotion_sadness=e2, emotion_disgust=e3, emotion_fear=e4, emotion_surprise=e5, emotion_anger=e6, emotion_other=e7, emotion_neutral=e8, speaking_rate=speaking_rate, vq_score=vq_single, speaker_noised=speaker_noised, dnsmos=dnsmos_ovrl, skip_speaker=skip_speaker, skip_dnsmos=skip_dnsmos, skip_vq_score=skip_vqscore, skip_fmax=skip_fmax, skip_pitch=skip_pitch, skip_speaking_rate=skip_speaking_rate, skip_emotion=skip_emotion, output_format="float", ) return (44100, wav_out.squeeze().numpy()), seed finally: selected_pipeline.off_progress() # Interface if __name__ == "__main__": with gr.Blocks() as demo: with gr.Row(): with gr.Column(scale=3): gr.Markdown(header_markdown) gr.Image( value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png", container=False, interactive=False, show_label=False, show_share_button=False, show_fullscreen_button=False, show_download_button=False, ) with gr.Row(equal_height=True): pipeline_choice = gr.Dropdown( choices=pipeline_names, value=pipeline_names[0], label="Zonos Model Variant", ) language = gr.Dropdown( choices=supported_language_codes, value="en-us", label="Language", ) with gr.Row(): if not is_hf_spaces: limit_text = "Unlimited" else: limit_text = 
f"Up to {max_characters}" text = gr.Textbox( label=f"Speech Text ({limit_text} Characters)", value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.", lines=4, max_lines=20, max_length=max_characters if is_hf_spaces else None, ) with gr.Row(): generate_button = gr.Button("Generate Audio") with gr.Row(): output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True) with gr.Row(): gr.Markdown("## Long-Form Parameters") with gr.Column(variant="panel"): with gr.Row(equal_height=True): max_chunk_length = gr.Slider( 1, 300, 150, 1, label="Max Chunk Length (Characters)", info="The maximum number of characters to generate in a single chunk. Zonos itself has a much higher limit than this, but consistency breaks down as you go past ~200 characters or so." ) target_rms = gr.Slider( 0.0, 1.0, 0.10, 0.01, label="Target RMS", info="The target RMS (root-mean-square) amplitude for the generated audio. Each chunk will have its loudness normalized to this value to ensure consistent volume levels." ) with gr.Row(equal_height=True): punctuation_pause_duration = gr.Slider( 0, 1, 0.10, 0.01, label="Punctuation Pause Duration (Seconds)", info="Pause duration to add after a chunk that ends with punctuation. Full-stop punctuation (periods) will have the entire length, while shorter pauses will use half of this duration." ) cross_fade_duration = gr.Slider( 0, 1, 0.15, 0.01, label="Chunk Cross-Fade Duration (Seconds)", info="The duration of the cross-fade between chunks. This helps to smooth out transitions between chunks. In general, this should be set to a value greater than the pause duration." ) with gr.Row(): gr.Markdown("## Generation Parameters") with gr.Row(variant="panel", equal_height=True): with gr.Column(): prefix_audio = gr.Audio( label="Optional Prefix Audio (continue from this audio)", type="filepath", ) with gr.Column(scale=3): cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale") min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P") seed_number = gr.Number(label="Seed", value=6475309, precision=0) randomize_seed_toggle = gr.Checkbox(label="Randomize Seed", value=True) with gr.Row(): gr.Markdown( "## Conditioning Parameters\nAll of these types of conditioning are optional and can be disabled." 
        with gr.Row(variant="panel", equal_height=True) as speaker_row:
            with gr.Column():
                speaker_uncond = gr.Checkbox(label="Skip Speaker")
                speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker", value=False)
            speaker_audio = gr.Audio(
                label="Optional Speaker Audio (for cloning)",
                type="filepath",
                scale=3,
            )
        with gr.Row(variant="panel", equal_height=True) as emotion_row:
            emotion_uncond = gr.Checkbox(label="Skip Emotion")
            with gr.Column(scale=3):
                with gr.Row():
                    emotion1 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Happiness")
                    emotion2 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Sadness")
                    emotion3 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Disgust")
                    emotion4 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Fear")
                with gr.Row():
                    emotion5 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Surprise")
                    emotion6 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Anger")
                    emotion7 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Other")
                    emotion8 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Neutral")
        with gr.Row(variant="panel", equal_height=True) as dnsmos_row:
            dnsmos_uncond = gr.Checkbox(label="Skip DNSMOS")
            dnsmos_slider = gr.Slider(
                1.0, 5.0, value=4.0, step=0.1,
                label="Deep Noise Suppression Mean Opinion Score [arXiv 2010.15258]",
                scale=3,
            )
        with gr.Row(variant="panel", equal_height=True) as vq_score_row:
            vq_uncond = gr.Checkbox(label="Skip VQScore")
            vq_single_slider = gr.Slider(
                0.5, 0.8, 0.78, 0.01, label="VQScore [arXiv 2402.16321]", scale=3
            )
        with gr.Row(variant="panel", equal_height=True) as fmax_row:
            fmax_uncond = gr.Checkbox(label="Skip Fmax")
            fmax_slider = gr.Slider(
                0, 22050, value=22050, step=1, label="Fmax (Hz)", scale=3
            )
        with gr.Row(variant="panel", equal_height=True) as pitch_row:
            pitch_uncond = gr.Checkbox(label="Skip Pitch")
            pitch_std_slider = gr.Slider(
                0.0, 300.0, value=20.0, step=1, label="Pitch Standard Deviation", scale=3
            )
        with gr.Row(variant="panel", equal_height=True) as speaking_rate_row:
            speaking_rate_uncond = gr.Checkbox(label="Skip Speaking Rate")
            speaking_rate_slider = gr.Slider(
                5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", scale=3
            )

        pipeline_choice.change(
            fn=update_ui,
            inputs=[pipeline_choice],
            outputs=[
                vq_score_row,
                emotion_row,
                fmax_row,
                pitch_row,
                speaking_rate_row,
                dnsmos_row,
                speaker_noised_checkbox,
            ],
        )

        # Trigger UI update on load
        demo.load(
            fn=update_ui,
            inputs=[pipeline_choice],
            outputs=[
                vq_score_row,
                emotion_row,
                fmax_row,
                pitch_row,
                speaking_rate_row,
                dnsmos_row,
                speaker_noised_checkbox,
            ],
        )

        # Generate audio on button click
        generate_button.click(
            fn=generate_audio,
            inputs=[
                pipeline_choice,
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                cfg_scale_slider,
                min_p_slider,
                seed_number,
                max_chunk_length,
                cross_fade_duration,
                punctuation_pause_duration,
                target_rms,
                randomize_seed_toggle,
                dnsmos_uncond,
                vq_uncond,
                fmax_uncond,
                pitch_uncond,
                speaking_rate_uncond,
                emotion_uncond,
                speaker_uncond,
            ],
            outputs=[output_audio, seed_number],
        )

    demo.launch()