zonos-longform

Running on Zero

App Files Community

benjamin-paine committed on 26 days ago

Commit

634a721

verified ·

1 Parent(s): 9516a6e

Update app.py

Browse files

Files changed (1) hide show

app.py +212 -212

app.py CHANGED Viewed

@@ -181,225 +181,225 @@ def generate_audio(
         selected_pipeline.off_progress()
 # Interface
-with gr.Blocks() as demo:
-    with gr.Row():
-        with gr.Column(scale=3):
-            gr.Markdown(header_markdown)
-        gr.Image(
-            value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
-            container=False,
-            interactive=False,
-            show_label=False,
-            show_share_button=False,
-            show_fullscreen_button=False,
-            show_download_button=False,
-        )
-    with gr.Row(equal_height=True):
-        pipeline_choice = gr.Dropdown(
-            choices=pipeline_names,
-            value=pipeline_names[0],
-            label="Zonos Model Variant",
-        )
-        language = gr.Dropdown(
-            choices=supported_language_codes,
-            value="en-us",
-            label="Language",
-        )
-    with gr.Row():
-        if not is_hf_spaces:
-            limit_text = "Unlimited"
-        else:
-            limit_text = f"Up to {max_characters}"
-        text = gr.Textbox(
-            label=f"Speech Text ({limit_text} Characters)",
-            value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
-            lines=4,
-            max_lines=20,
-            max_length=max_characters if is_hf_spaces else None,
-        )
-    with gr.Row():
-        generate_button = gr.Button("Generate Audio")
-    with gr.Row():
-        output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)
-    with gr.Row():
-        gr.Markdown("## Long-Form Parameters")
-    with gr.Column(variant="panel"):
         with gr.Row(equal_height=True):
-            max_chunk_length = gr.Slider(
-                1, 300, 150, 1, label="Max Chunk Length (Characters)",
-                info="The maximum number of characters to generate in a single chunk. Zonos itself has a much higher limit than this, but consistency breaks down as you go past ~200 characters or so."
             )
-            target_rms = gr.Slider(
-                0.0, 1.0, 0.10, 0.01, label="Target RMS",
-                info="The target RMS (root-mean-square) amplitude for the generated audio. Each chunk will have its loudness normalized to this value to ensure consistent volume levels."
             )
-        with gr.Row(equal_height=True):
-            punctuation_pause_duration = gr.Slider(
-                0, 1, 0.10, 0.01, label="Punctuation Pause Duration (Seconds)",
-                info="Pause duration to add after a chunk that ends with punctuation. Full-stop punctuation (periods) will have the entire length, while shorter pauses will use half of this duration."
             )
-            cross_fade_duration = gr.Slider(
-                0, 1, 0.15, 0.01, label="Chunk Cross-Fade Duration (Seconds)",
-                info="The duration of the cross-fade between chunks. This helps to smooth out transitions between chunks. In general, this should be set to a value greater than the pause duration."
             )
-    with gr.Row():
-        gr.Markdown("## Generation Parameters")
-    with gr.Row(variant="panel", equal_height=True):
-        with gr.Column():
-            prefix_audio = gr.Audio(
-                label="Optional Prefix Audio (continue from this audio)",
                 type="filepath",
             )
-        with gr.Column(scale=3):
-            cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
-            min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
-            seed_number = gr.Number(label="Seed", value=6475309, precision=0)
-            randomize_seed_toggle = gr.Checkbox(label="Randomize Seed", value=True)
-    with gr.Row():
-        gr.Markdown(
-            "## Conditioning Parameters\nAll of these types of conditioning are optional and can be disabled."
-        )
-    with gr.Row(variant="panel", equal_height=True) as speaker_row:
-        with gr.Column():
-            speaker_uncond = gr.Checkbox(label="Skip Speaker")
-            speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker", value=False)
-        speaker_audio = gr.Audio(
-            label="Optional Speaker Audio (for cloning)",
-            type="filepath",
-            scale=3,
-        )
-    with gr.Row(variant="panel", equal_height=True) as emotion_row:
-        emotion_uncond = gr.Checkbox(label="Skip Emotion")
-        with gr.Column(scale=3):
-            with gr.Row():
-                emotion1 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Happiness")
-                emotion2 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Sadness")
-                emotion3 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Disgust")
-                emotion4 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Fear")
-            with gr.Row():
-                emotion5 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Surprise")
-                emotion6 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Anger")
-                emotion7 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Other")
-                emotion8 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Neutral")
-    with gr.Row(variant="panel", equal_height=True) as dnsmos_row:
-        dnsmos_uncond = gr.Checkbox(label="Skip DNSMOS")
-        dnsmos_slider = gr.Slider(
-            1.0,
-            5.0,
-            value=4.0,
-            step=0.1,
-            label="Deep Noise Suppression Mean Opinion Score [arXiv 2010.15258]",
-            scale=3,
-        )
-    with gr.Row(variant="panel", equal_height=True) as vq_score_row:
-        vq_uncond = gr.Checkbox(label="Skip VQScore")
-        vq_single_slider = gr.Slider(
-            0.5, 0.8, 0.78, 0.01, label="VQScore [arXiv 2402.16321]", scale=3
-        )
-    with gr.Row(variant="panel", equal_height=True) as fmax_row:
-        fmax_uncond = gr.Checkbox(label="Skip Fmax")
-        fmax_slider = gr.Slider(
-            0, 22050, value=22050, step=1, label="Fmax (Hz)", scale=3
         )
-    with gr.Row(variant="panel", equal_height=True) as pitch_row:
-        pitch_uncond = gr.Checkbox(label="Skip Pitch")
-        pitch_std_slider = gr.Slider(
-            0.0, 300.0, value=20.0, step=1, label="Pitch Standard Deviation", scale=3
         )
-    with gr.Row(variant="panel", equal_height=True) as speaking_rate_row:
-        speaking_rate_uncond = gr.Checkbox(label="Skip Speaking Rate")
-        speaking_rate_slider = gr.Slider(
-            5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", scale=3
         )
-    pipeline_choice.change(
-        fn=update_ui,
-        inputs=[pipeline_choice],
-        outputs=[
-            vq_score_row,
-            emotion_row,
-            fmax_row,
-            pitch_row,
-            speaking_rate_row,
-            dnsmos_row,
-            speaker_noised_checkbox,
-        ],
-    )
-    # Trigger UI update on load
-    demo.load(
-        fn=update_ui,
-        inputs=[pipeline_choice],
-        outputs=[
-            vq_score_row,
-            emotion_row,
-            fmax_row,
-            pitch_row,
-            speaking_rate_row,
-            dnsmos_row,
-            speaker_noised_checkbox,
-        ],
-    )
-    # Generate audio on button click
-    generate_button.click(
-        fn=generate_audio,
-        inputs=[
-            pipeline_choice,
-            text,
-            language,
-            speaker_audio,
-            prefix_audio,
-            emotion1,
-            emotion2,
-            emotion3,
-            emotion4,
-            emotion5,
-            emotion6,
-            emotion7,
-            emotion8,
-            vq_single_slider,
-            fmax_slider,
-            pitch_std_slider,
-            speaking_rate_slider,
-            dnsmos_slider,
-            speaker_noised_checkbox,
-            cfg_scale_slider,
-            min_p_slider,
-            seed_number,
-            max_chunk_length,
-            cross_fade_duration,
-            punctuation_pause_duration,
-            target_rms,
-            randomize_seed_toggle,
-            dnsmos_uncond,
-            vq_uncond,
-            fmax_uncond,
-            pitch_uncond,
-            speaking_rate_uncond,
-            emotion_uncond,
-            speaker_uncond,
-        ],
-        outputs=[output_audio, seed_number],
-    )
-    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, ssr_mode=False)

         selected_pipeline.off_progress()
 # Interface
+if __name__ == "__main__":
+    with gr.Blocks() as demo:
+        with gr.Row():
+            with gr.Column(scale=3):
+                gr.Markdown(header_markdown)
+            gr.Image(
+                value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
+                container=False,
+                interactive=False,
+                show_label=False,
+                show_share_button=False,
+                show_fullscreen_button=False,
+                show_download_button=False,
+            )
         with gr.Row(equal_height=True):
+            pipeline_choice = gr.Dropdown(
+                choices=pipeline_names,
+                value=pipeline_names[0],
+                label="Zonos Model Variant",
             )
+            language = gr.Dropdown(
+                choices=supported_language_codes,
+                value="en-us",
+                label="Language",
             )
+        with gr.Row():
+            if not is_hf_spaces:
+                limit_text = "Unlimited"
+            else:
+                limit_text = f"Up to {max_characters}"
+            text = gr.Textbox(
+                label=f"Speech Text ({limit_text} Characters)",
+                value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
+                lines=4,
+                max_lines=20,
+                max_length=max_characters if is_hf_spaces else None,
             )
+        with gr.Row():
+            generate_button = gr.Button("Generate Audio")
+        with gr.Row():
+            output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)
+        with gr.Row():
+            gr.Markdown("## Long-Form Parameters")
+        with gr.Column(variant="panel"):
+            with gr.Row(equal_height=True):
+                max_chunk_length = gr.Slider(
+                    1, 300, 150, 1, label="Max Chunk Length (Characters)",
+                    info="The maximum number of characters to generate in a single chunk. Zonos itself has a much higher limit than this, but consistency breaks down as you go past ~200 characters or so."
+                )
+                target_rms = gr.Slider(
+                    0.0, 1.0, 0.10, 0.01, label="Target RMS",
+                    info="The target RMS (root-mean-square) amplitude for the generated audio. Each chunk will have its loudness normalized to this value to ensure consistent volume levels."
+                )
+            with gr.Row(equal_height=True):
+                punctuation_pause_duration = gr.Slider(
+                    0, 1, 0.10, 0.01, label="Punctuation Pause Duration (Seconds)",
+                    info="Pause duration to add after a chunk that ends with punctuation. Full-stop punctuation (periods) will have the entire length, while shorter pauses will use half of this duration."
+                )
+                cross_fade_duration = gr.Slider(
+                    0, 1, 0.15, 0.01, label="Chunk Cross-Fade Duration (Seconds)",
+                    info="The duration of the cross-fade between chunks. This helps to smooth out transitions between chunks. In general, this should be set to a value greater than the pause duration."
+                )
+        with gr.Row():
+            gr.Markdown("## Generation Parameters")
+        with gr.Row(variant="panel", equal_height=True):
+            with gr.Column():
+                prefix_audio = gr.Audio(
+                    label="Optional Prefix Audio (continue from this audio)",
+                    type="filepath",
+                )
+            with gr.Column(scale=3):
+                cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
+                min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
+                seed_number = gr.Number(label="Seed", value=6475309, precision=0)
+                randomize_seed_toggle = gr.Checkbox(label="Randomize Seed", value=True)
+        with gr.Row():
+            gr.Markdown(
+                "## Conditioning Parameters\nAll of these types of conditioning are optional and can be disabled."
             )
+        with gr.Row(variant="panel", equal_height=True) as speaker_row:
+            with gr.Column():
+                speaker_uncond = gr.Checkbox(label="Skip Speaker")
+                speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker", value=False)
+            speaker_audio = gr.Audio(
+                label="Optional Speaker Audio (for cloning)",
                 type="filepath",
+                scale=3,
             )
+        with gr.Row(variant="panel", equal_height=True) as emotion_row:
+            emotion_uncond = gr.Checkbox(label="Skip Emotion")
+            with gr.Column(scale=3):
+                with gr.Row():
+                    emotion1 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Happiness")
+                    emotion2 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Sadness")
+                    emotion3 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Disgust")
+                    emotion4 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Fear")
+                with gr.Row():
+                    emotion5 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Surprise")
+                    emotion6 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Anger")
+                    emotion7 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Other")
+                    emotion8 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Neutral")
+        with gr.Row(variant="panel", equal_height=True) as dnsmos_row:
+            dnsmos_uncond = gr.Checkbox(label="Skip DNSMOS")
+            dnsmos_slider = gr.Slider(
+                1.0,
+                5.0,
+                value=4.0,
+                step=0.1,
+                label="Deep Noise Suppression Mean Opinion Score [arXiv 2010.15258]",
+                scale=3,
+            )
+        with gr.Row(variant="panel", equal_height=True) as vq_score_row:
+            vq_uncond = gr.Checkbox(label="Skip VQScore")
+            vq_single_slider = gr.Slider(
+                0.5, 0.8, 0.78, 0.01, label="VQScore [arXiv 2402.16321]", scale=3
+            )
+        with gr.Row(variant="panel", equal_height=True) as fmax_row:
+            fmax_uncond = gr.Checkbox(label="Skip Fmax")
+            fmax_slider = gr.Slider(
+                0, 22050, value=22050, step=1, label="Fmax (Hz)", scale=3
+            )
+        with gr.Row(variant="panel", equal_height=True) as pitch_row:
+            pitch_uncond = gr.Checkbox(label="Skip Pitch")
+            pitch_std_slider = gr.Slider(
+                0.0, 300.0, value=20.0, step=1, label="Pitch Standard Deviation", scale=3
+            )
+        with gr.Row(variant="panel", equal_height=True) as speaking_rate_row:
+            speaking_rate_uncond = gr.Checkbox(label="Skip Speaking Rate")
+            speaking_rate_slider = gr.Slider(
+                5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", scale=3
+            )
+        pipeline_choice.change(
+            fn=update_ui,
+            inputs=[pipeline_choice],
+            outputs=[
+                vq_score_row,
+                emotion_row,
+                fmax_row,
+                pitch_row,
+                speaking_rate_row,
+                dnsmos_row,
+                speaker_noised_checkbox,
+            ],
         )
+        # Trigger UI update on load
+        demo.load(
+            fn=update_ui,
+            inputs=[pipeline_choice],
+            outputs=[
+                vq_score_row,
+                emotion_row,
+                fmax_row,
+                pitch_row,
+                speaking_rate_row,
+                dnsmos_row,
+                speaker_noised_checkbox,
+            ],
         )
+        # Generate audio on button click
+        generate_button.click(
+            fn=generate_audio,
+            inputs=[
+                pipeline_choice,
+                text,
+                language,
+                speaker_audio,
+                prefix_audio,
+                emotion1,
+                emotion2,
+                emotion3,
+                emotion4,
+                emotion5,
+                emotion6,
+                emotion7,
+                emotion8,
+                vq_single_slider,
+                fmax_slider,
+                pitch_std_slider,
+                speaking_rate_slider,
+                dnsmos_slider,
+                speaker_noised_checkbox,
+                cfg_scale_slider,
+                min_p_slider,
+                seed_number,
+                max_chunk_length,
+                cross_fade_duration,
+                punctuation_pause_duration,
+                target_rms,
+                randomize_seed_toggle,
+                dnsmos_uncond,
+                vq_uncond,
+                fmax_uncond,
+                pitch_uncond,
+                speaking_rate_uncond,
+                emotion_uncond,
+                speaker_uncond,
+            ],
+            outputs=[output_audio, seed_number],
         )
+        demo.launch(server_name="0.0.0.0", server_port=7860, share=False, ssr_mode=False)