benjamin-paine committed
Commit 00e0e52 · verified · 1 Parent(s): 133ded5

Create app.py

Files changed (1):
  1. app.py +397 -0

app.py ADDED
@@ -0,0 +1,397 @@
# Install dependencies in application code, as we don't have access to a GPU at build time
# Thanks to https://huggingface.co/Steveeeeeeen for their code to handle this!
import os
import shlex
import subprocess

subprocess.run(shlex.split("pip install flash-attn --no-build-isolation"), env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, check=True)
subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)

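# Note: these prebuilt wheels are pinned to Python 3.10, torch 2.4, and CUDA 12
# (see the "cp310" and "cu12torch2.4" tags in the filenames); they would need to
# be swapped out if the Space's runtime image changes. FLASH_ATTENTION_SKIP_CUDA_BUILD
# lets flash-attn install on a machine with no GPU visible at install time.
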
import spaces
import gradio as gr
import numpy as np

from typing import Tuple, Dict, Any, Optional
from taproot import Task

# Create pipelines, downloading required files as necessary
hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
hybrid_task.download_required_files(text_callback=print)
hybrid_pipe = hybrid_task()
hybrid_pipe.load()

transformer_task = Task.get(
    "speech-synthesis", model="zonos-transformer", available_only=False
)
transformer_task.download_required_files(text_callback=print)
transformer_pipe = transformer_task()
transformer_pipe.load()  # Remove this line if you're running outside of HF spaces to save ~4GB of VRAM

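# Both pipelines are constructed up front; update_ui() below loads the selected
# one and unloads the other, so only one model needs to stay resident at a time.
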
# Global state and configuration
pipelines = {
    "Zonos Transformer v0.1": transformer_pipe,
    "Zonos Hybrid v0.1": hybrid_pipe,
}
pipeline_names = list(pipelines.keys())
supported_language_codes = hybrid_pipe.supported_languages  # Same for both pipes
max_characters = 4500
header_markdown = """
# Zonos v0.1
State-of-the-art text-to-speech model. [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f), [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)
## Unleashed
Use this space to generate long-form speech up to around 4 minutes in length. For unlimited-length generation, clone this space and run it locally, setting the `max_characters` parameter to your desired limit (or `None` for no limit).
### Tips
- If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
- When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
- The cleaner the speaker audio, the better the speaker conditioning. Speaker audio is only sampled at 16kHz, so you do not need to provide a high-bitrate recording. Prefix audio, by contrast, should be high-quality, as it is sampled at the full 44.1kHz.
- The appropriate ranges of Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
- The emotion sliders do not always behave intuitively, and require some experimentation to achieve the desired effect.
""".strip()


# Model toggle
def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
    """
    Dynamically show/hide UI elements based on the model's conditioners.
    """
    for pipeline_name, pipeline in pipelines.items():
        if pipeline_name == pipeline_choice:
            pipeline.load()
        else:
            pipeline.unload()

    pipe = pipelines[pipeline_choice]
    cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]

    vqscore_update = gr.update(visible=("vqscore_8" in cond_names))
    emotion_update = gr.update(visible=("emotion" in cond_names))
    fmax_update = gr.update(visible=("fmax" in cond_names))
    pitch_update = gr.update(visible=("pitch_std" in cond_names))
    speaking_rate_update = gr.update(visible=("speaking_rate" in cond_names))
    dnsmos_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
    speaker_noised_update = gr.update(visible=("speaker_noised" in cond_names))

    return (
        vqscore_update,
        emotion_update,
        fmax_update,
        pitch_update,
        speaking_rate_update,
        dnsmos_update,
        speaker_noised_update,
    )

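# The tuple above is positional: it must stay in the same order as the `outputs`
# lists wired to pipeline_choice.change() and demo.load() at the bottom of this file.
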
# Invocation method
@spaces.GPU(duration=180)
def generate_audio(
    pipeline_choice: str,
    text: str,
    language: str,
    speaker_audio: Optional[str],
    prefix_audio: Optional[str],
    e1: float,
    e2: float,
    e3: float,
    e4: float,
    e5: float,
    e6: float,
    e7: float,
    e8: float,
    vq_single: float,
    fmax: float,
    pitch_std: float,
    speaking_rate: float,
    dnsmos_ovrl: float,
    speaker_noised: bool,
    cfg_scale: float,
    min_p: float,
    seed: int,
    max_chunk_length: int,
    cross_fade_duration: float,
    punctuation_pause_duration: float,
    target_rms: float,
    randomize_seed: bool,
    skip_dnsmos: bool,
    skip_vqscore: bool,
    skip_fmax: bool,
    skip_pitch: bool,
    skip_speaking_rate: bool,
    skip_emotion: bool,
    skip_speaker: bool,
    progress=gr.Progress(),
) -> Tuple[Tuple[int, np.ndarray[Any, Any]], int]:
    """
    Generates audio based on the provided UI parameters.
    """
    selected_pipeline = pipelines[pipeline_choice]
    if randomize_seed:
        seed = np.random.randint(0, 2**32)

    def on_progress(step: int, total: int) -> None:
        progress((step, total))

    selected_pipeline.on_progress(on_progress)
    try:
        wav_out = selected_pipeline(
            text=text,
            language=language,
            reference_audio=speaker_audio,
            prefix_audio=prefix_audio,
            seed=seed,
            max_chunk_length=max_chunk_length,
            cross_fade_duration=cross_fade_duration,
            punctuation_pause_duration=punctuation_pause_duration,
            target_rms=target_rms,
            cfg_scale=cfg_scale,
            min_p=min_p,
            fmax=fmax,
            pitch_std=pitch_std,
            emotion_happiness=e1,
            emotion_sadness=e2,
            emotion_disgust=e3,
            emotion_fear=e4,
            emotion_surprise=e5,
            emotion_anger=e6,
            emotion_other=e7,
            emotion_neutral=e8,
            speaking_rate=speaking_rate,
            vq_score=vq_single,
            speaker_noised=speaker_noised,
            dnsmos=dnsmos_ovrl,
            skip_speaker=skip_speaker,
            skip_dnsmos=skip_dnsmos,
            skip_vq_score=skip_vqscore,
            skip_fmax=skip_fmax,
            skip_pitch=skip_pitch,
            skip_speaking_rate=skip_speaking_rate,
            skip_emotion=skip_emotion,
            output_format="float",
        )

        return (44100, wav_out.squeeze().numpy()), seed
    finally:
        selected_pipeline.off_progress()

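# Zonos outputs 44.1kHz audio, hence the hardcoded sample rate above; the progress
# callback is detached in `finally` so a failed generation never leaves a stale handler.
# For local testing outside the UI, the pipeline can also be invoked directly with the
# same keyword arguments (a minimal sketch, assuming defaults for everything else):
#     wav = hybrid_pipe(text="Hello world.", language="en-us", seed=42, output_format="float")
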
# Interface

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown(header_markdown)
        gr.Image(
            value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
            container=False,
            interactive=False,
            show_label=False,
            show_share_button=False,
            show_fullscreen_button=False,
            show_download_button=False,
        )

    with gr.Row(equal_height=True):
        pipeline_choice = gr.Dropdown(
            choices=pipeline_names,
            value=pipeline_names[0],
            label="Zonos Model Variant",
        )
        language = gr.Dropdown(
            choices=supported_language_codes,
            value="en-us",
            label="Language",
        )

    with gr.Row():
        if max_characters is None:
            limit_text = "Unlimited"
        else:
            limit_text = f"Up to {max_characters}"
        text = gr.Textbox(
            label=f"Speech Text ({limit_text} Characters)",
            value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
            lines=4,
            max_lines=20,
            max_length=max_characters,
        )

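    # max_length enforces the character cap in the textbox itself; setting
    # max_characters = None near the top of the file lifts the limit entirely.
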
    with gr.Row():
        generate_button = gr.Button("Generate Audio")

    with gr.Row():
        output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)

    with gr.Row():
        gr.Markdown("## Long-Form Parameters")

    with gr.Column(variant="panel"):
        with gr.Row(equal_height=True):
            max_chunk_length = gr.Slider(
                1, 300, 150, 1, label="Max Chunk Length (Characters)",
                info="The maximum number of characters to generate in a single chunk. Zonos itself has a much higher limit than this, but consistency breaks down as you go past ~200 characters or so."
            )
            target_rms = gr.Slider(
                0.0, 1.0, 0.10, 0.01, label="Target RMS",
                info="The target RMS (root-mean-square) amplitude for the generated audio. Each chunk will have its loudness normalized to this value to ensure consistent volume levels."
            )
        with gr.Row(equal_height=True):
            punctuation_pause_duration = gr.Slider(
                0, 1, 0.10, 0.01, label="Punctuation Pause Duration (Seconds)",
                info="Pause duration to add after a chunk that ends with punctuation. Full-stop punctuation (periods) will receive the entire pause duration, while other punctuation will use half of it."
            )
            cross_fade_duration = gr.Slider(
                0, 1, 0.15, 0.01, label="Chunk Cross-Fade Duration (Seconds)",
                info="The duration of the cross-fade between chunks. This helps to smooth out transitions between chunks. In general, this should be set to a value greater than the pause duration."
            )

    with gr.Row():
        gr.Markdown("## Generation Parameters")

    with gr.Row(variant="panel", equal_height=True):
        with gr.Column():
            prefix_audio = gr.Audio(
                label="Optional Prefix Audio (continue from this audio)",
                type="filepath",
            )
        with gr.Column(scale=3):
            cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
            min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
            seed_number = gr.Number(label="Seed", value=6475309, precision=0)
            randomize_seed_toggle = gr.Checkbox(label="Randomize Seed", value=True)

    with gr.Row():
        gr.Markdown(
            "## Conditioning Parameters\nAll of these types of conditioning are optional and can be disabled."
        )

    with gr.Row(variant="panel", equal_height=True) as speaker_row:
        with gr.Column():
            speaker_uncond = gr.Checkbox(label="Skip Speaker")
            speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker", value=False)

        speaker_audio = gr.Audio(
            label="Optional Speaker Audio (for cloning)",
            type="filepath",
            scale=3,
        )

    with gr.Row(variant="panel", equal_height=True) as emotion_row:
        emotion_uncond = gr.Checkbox(label="Skip Emotion")
        with gr.Column(scale=3):
            with gr.Row():
                emotion1 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Happiness")
                emotion2 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Sadness")
                emotion3 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Disgust")
                emotion4 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Fear")
            with gr.Row():
                emotion5 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Surprise")
                emotion6 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Anger")
                emotion7 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Other")
                emotion8 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Neutral")

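    # The eight sliders feed e1..e8 of generate_audio positionally (happiness,
    # sadness, disgust, fear, surprise, anger, other, neutral); the defaults
    # weight happiness and neutral at 0.307 and everything else at 0.025.
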
    with gr.Row(variant="panel", equal_height=True) as dnsmos_row:
        dnsmos_uncond = gr.Checkbox(label="Skip DNSMOS")
        dnsmos_slider = gr.Slider(
            1.0,
            5.0,
            value=4.0,
            step=0.1,
            label="Deep Noise Suppression Mean Opinion Score [arXiv 2010.15258]",
            scale=3,
        )

    with gr.Row(variant="panel", equal_height=True) as vq_score_row:
        vq_uncond = gr.Checkbox(label="Skip VQScore")
        vq_single_slider = gr.Slider(
            0.5, 0.8, 0.78, 0.01, label="VQScore [arXiv 2402.16321]", scale=3
        )

    with gr.Row(variant="panel", equal_height=True) as fmax_row:
        fmax_uncond = gr.Checkbox(label="Skip Fmax")
        fmax_slider = gr.Slider(
            0, 22050, value=22050, step=1, label="Fmax (Hz)", scale=3
        )

    with gr.Row(variant="panel", equal_height=True) as pitch_row:
        pitch_uncond = gr.Checkbox(label="Skip Pitch")
        pitch_std_slider = gr.Slider(
            0.0, 300.0, value=20.0, step=1, label="Pitch Standard Deviation", scale=3
        )

    with gr.Row(variant="panel", equal_height=True) as speaking_rate_row:
        speaking_rate_uncond = gr.Checkbox(label="Skip Speaking Rate")
        speaking_rate_slider = gr.Slider(
            5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", scale=3
        )

    pipeline_choice.change(
        fn=update_ui,
        inputs=[pipeline_choice],
        outputs=[
            vq_score_row,
            emotion_row,
            fmax_row,
            pitch_row,
            speaking_rate_row,
            dnsmos_row,
            speaker_noised_checkbox,
        ],
    )

    # Trigger UI update on load
    demo.load(
        fn=update_ui,
        inputs=[pipeline_choice],
        outputs=[
            vq_score_row,
            emotion_row,
            fmax_row,
            pitch_row,
            speaking_rate_row,
            dnsmos_row,
            speaker_noised_checkbox,
        ],
    )

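    # Since update_ui() unloads whichever pipeline is not selected, the load
    # handler above also releases the memory claimed by loading both models at
    # startup before the first generation runs.
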
    # Generate audio on button click
    generate_button.click(
        fn=generate_audio,
        inputs=[
            pipeline_choice,
            text,
            language,
            speaker_audio,
            prefix_audio,
            emotion1,
            emotion2,
            emotion3,
            emotion4,
            emotion5,
            emotion6,
            emotion7,
            emotion8,
            vq_single_slider,
            fmax_slider,
            pitch_std_slider,
            speaking_rate_slider,
            dnsmos_slider,
            speaker_noised_checkbox,
            cfg_scale_slider,
            min_p_slider,
            seed_number,
            max_chunk_length,
            cross_fade_duration,
            punctuation_pause_duration,
            target_rms,
            randomize_seed_toggle,
            dnsmos_uncond,
            vq_uncond,
            fmax_uncond,
            pitch_uncond,
            speaking_rate_uncond,
            emotion_uncond,
            speaker_uncond,
        ],
        outputs=[output_audio, seed_number],
    )

demo.launch(server_name="0.0.0.0", server_port=7860, share=False, ssr_mode=False)