zonos-longform

Running on Zero

App Files Files Community

benjamin-paine committed on 26 days ago

Commit

9516a6e

verified ·

1 Parent(s): 11649ae

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -23

app.py CHANGED Viewed

@@ -15,6 +15,22 @@ import numpy as np
 from typing import Tuple, Dict, Any, Optional
 from taproot import Task
 # Create pipelines, downloading required files as necessary
 hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
 hybrid_task.download_required_files(text_callback=print)
@@ -26,40 +42,31 @@ transformer_task = Task.get(
 )
 transformer_task.download_required_files(text_callback=print)
 transformer_pipe = transformer_task()
-transformer_pipe.load() # Remove this line if you're running outside of HF spaces to save ~4GB of VRAM
-# Global state and configuration
 pipelines = {
     "Zonos Transformer v0.1": transformer_pipe,
     "Zonos Hybrid v0.1": hybrid_pipe,
 }
 pipeline_names = list(pipelines.keys())
 supported_language_codes = hybrid_pipe.supported_languages  # Same for both pipes
-max_characters = 4500
-header_markdown = """
-# Zonos v0.1
-State of the art text-to-speech model [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f). [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)
-## Unleashed
-Use this space to generate long-form speech up to around ~4 minutes in length. To generate an unlimited length, clone this space and run it locally, modifying the `max_characters` parameter to your desired length (or None for unlimited).
-### Tips
-- If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
-- When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
-- The cleaner the speaker audio, the better the speaker conditioning will be - however, speaker audio is only sampled at 16kHz, so you do not need to provide high-bitrate speaker audio. Unlike this, however, prefix audio should be high-quality, as it is sampled at the full 44.1kHz.
-- The appropriate range of Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
-- Emotion sliders do not completely function intuitively, and require some experimentation to get the desired effect.
-""".strip()
 # Model toggle
 def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
     """
     Dynamically show/hide UI elements based on the model's conditioners.
     """
-    for pipeline_name, pipeline in pipelines.items():
-        if pipeline_name == pipeline_choice:
-            pipeline.load()
-        else:
-            pipeline.unload()
     pipe = pipelines[pipeline_choice]
     cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]
@@ -202,16 +209,17 @@ with gr.Blocks() as demo:
         )
     with gr.Row():
-        if max_characters is None:
             limit_text = "Unlimited"
         else:
             limit_text = f"Up to {max_characters}"
         text = gr.Textbox(
             label=f"Speech Text ({limit_text} Characters)",
             value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
             lines=4,
             max_lines=20,
-            max_length=max_characters,
         )
     with gr.Row():

 from typing import Tuple, Dict, Any, Optional
 from taproot import Task
+# Configuration
+is_hf_spaces = True # Set to false when running locally
+max_characters = 4500
+header_markdown = """
+# Zonos v0.1
+State of the art text-to-speech model [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f). [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)
+## Unleashed
+Use this space to generate long-form speech up to around ~4 minutes in length. To generate an unlimited length, clone this space and run it locally, modifying the `max_characters` parameter to your desired length (or None for unlimited).
+### Tips
+- If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
+- When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
+- The cleaner the speaker audio, the better the speaker conditioning will be - however, speaker audio is only sampled at 16kHz, so you do not need to provide high-bitrate speaker audio. Unlike this, however, prefix audio should be high-quality, as it is sampled at the full 44.1kHz.
+- The appropriate range of Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
+- Emotion sliders do not completely function intuitively, and require some experimentation to get the desired effect.
+""".strip()
 # Create pipelines, downloading required files as necessary
 hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
 hybrid_task.download_required_files(text_callback=print)
 )
 transformer_task.download_required_files(text_callback=print)
 transformer_pipe = transformer_task()
+if is_hf_spaces:
+    # Must load all models on GPU when using ZERO
+    transformer_pipe.load()
+# Global state
 pipelines = {
     "Zonos Transformer v0.1": transformer_pipe,
     "Zonos Hybrid v0.1": hybrid_pipe,
 }
 pipeline_names = list(pipelines.keys())
 supported_language_codes = hybrid_pipe.supported_languages  # Same for both pipes
 # Model toggle
 def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
     """
     Dynamically show/hide UI elements based on the model's conditioners.
     """
+    if not is_hf_spaces:
+        # When not using ZERO, we can onload/offload pipes
+        for pipeline_name, pipeline in pipelines.items():
+            if pipeline_name == pipeline_choice:
+                pipeline.load()
+            else:
+                pipeline.unload()
     pipe = pipelines[pipeline_choice]
     cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]
         )
     with gr.Row():
+        if not is_hf_spaces:
             limit_text = "Unlimited"
         else:
             limit_text = f"Up to {max_characters}"
         text = gr.Textbox(
             label=f"Speech Text ({limit_text} Characters)",
             value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
             lines=4,
             max_lines=20,
+            max_length=max_characters if is_hf_spaces else None,
         )
     with gr.Row():