Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -15,6 +15,22 @@ import numpy as np
|
|
15 |
from typing import Tuple, Dict, Any, Optional
|
16 |
from taproot import Task
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
# Create pipelines, downloading required files as necessary
|
19 |
hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
|
20 |
hybrid_task.download_required_files(text_callback=print)
|
@@ -26,40 +42,31 @@ transformer_task = Task.get(
|
|
26 |
)
|
27 |
transformer_task.download_required_files(text_callback=print)
|
28 |
transformer_pipe = transformer_task()
|
29 |
-
transformer_pipe.load() # Remove this line if you're running outside of HF spaces to save ~4GB of VRAM
|
30 |
|
31 |
-
|
|
|
|
|
|
|
|
|
32 |
pipelines = {
|
33 |
"Zonos Transformer v0.1": transformer_pipe,
|
34 |
"Zonos Hybrid v0.1": hybrid_pipe,
|
35 |
}
|
36 |
pipeline_names = list(pipelines.keys())
|
37 |
supported_language_codes = hybrid_pipe.supported_languages # Same for both pipes
|
38 |
-
max_characters = 4500
|
39 |
-
header_markdown = """
|
40 |
-
# Zonos v0.1
|
41 |
-
State of the art text-to-speech model [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f). [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)
|
42 |
-
## Unleashed
|
43 |
-
Use this space to generate long-form speech up to around ~4 minutes in length. To generate an unlimited length, clone this space and run it locally, modifying the `max_characters` parameter to your desired length (or None for unlimited).
|
44 |
-
### Tips
|
45 |
-
- If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
|
46 |
-
- When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
|
47 |
-
- The cleaner the speaker audio, the better the speaker conditioning will be - however, speaker audio is only sampled at 16kHz, so you do not need to provide high-bitrate speaker audio. Unlike this, however, prefix audio should be high-quality, as it is sampled at the full 44.1kHz.
|
48 |
-
- The appropriate range of Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
|
49 |
-
- Emotion sliders do not completely function intuitively, and require some experimentation to get the desired effect.
|
50 |
-
""".strip()
|
51 |
-
|
52 |
|
53 |
# Model toggle
|
54 |
def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
|
55 |
"""
|
56 |
Dynamically show/hide UI elements based on the model's conditioners.
|
57 |
"""
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
63 |
|
64 |
pipe = pipelines[pipeline_choice]
|
65 |
cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]
|
@@ -202,16 +209,17 @@ with gr.Blocks() as demo:
|
|
202 |
)
|
203 |
|
204 |
with gr.Row():
|
205 |
-
if
|
206 |
limit_text = "Unlimited"
|
207 |
else:
|
208 |
limit_text = f"Up to {max_characters}"
|
|
|
209 |
text = gr.Textbox(
|
210 |
label=f"Speech Text ({limit_text} Characters)",
|
211 |
value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
|
212 |
lines=4,
|
213 |
max_lines=20,
|
214 |
-
max_length=max_characters,
|
215 |
)
|
216 |
|
217 |
with gr.Row():
|
|
|
15 |
from typing import Tuple, Dict, Any, Optional
|
16 |
from taproot import Task
|
17 |
|
18 |
+
# Configuration
|
19 |
+
is_hf_spaces = True # Set to False when running locally
|
20 |
+
max_characters = 4500
|
21 |
+
header_markdown = """
|
22 |
+
# Zonos v0.1
|
23 |
+
State of the art text-to-speech model [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f). [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)
|
24 |
+
## Unleashed
|
25 |
+
Use this space to generate long-form speech up to around ~4 minutes in length. To generate an unlimited length, clone this space and run it locally, modifying the `max_characters` parameter to your desired length (or None for unlimited).
|
26 |
+
### Tips
|
27 |
+
- If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
|
28 |
+
- When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
|
29 |
+
- The cleaner the speaker audio, the better the speaker conditioning will be - however, speaker audio is only sampled at 16kHz, so you do not need to provide high-bitrate speaker audio. Unlike this, however, prefix audio should be high-quality, as it is sampled at the full 44.1kHz.
|
30 |
+
- The appropriate range of Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
|
31 |
+
- Emotion sliders do not completely function intuitively, and require some experimentation to get the desired effect.
|
32 |
+
""".strip()
|
33 |
+
|
34 |
# Create pipelines, downloading required files as necessary
|
35 |
hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
|
36 |
hybrid_task.download_required_files(text_callback=print)
|
|
|
42 |
)
|
43 |
transformer_task.download_required_files(text_callback=print)
|
44 |
transformer_pipe = transformer_task()
|
|
|
45 |
|
46 |
+
if is_hf_spaces:
|
47 |
+
# Must load all models on GPU when using ZERO
|
48 |
+
transformer_pipe.load()
|
49 |
+
|
50 |
+
# Global state
|
51 |
pipelines = {
|
52 |
"Zonos Transformer v0.1": transformer_pipe,
|
53 |
"Zonos Hybrid v0.1": hybrid_pipe,
|
54 |
}
|
55 |
pipeline_names = list(pipelines.keys())
|
56 |
supported_language_codes = hybrid_pipe.supported_languages # Same for both pipes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
# Model toggle
|
59 |
def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
|
60 |
"""
|
61 |
Dynamically show/hide UI elements based on the model's conditioners.
|
62 |
"""
|
63 |
+
if not is_hf_spaces:
|
64 |
+
# When not using ZERO, we can onload/offload pipes
|
65 |
+
for pipeline_name, pipeline in pipelines.items():
|
66 |
+
if pipeline_name == pipeline_choice:
|
67 |
+
pipeline.load()
|
68 |
+
else:
|
69 |
+
pipeline.unload()
|
70 |
|
71 |
pipe = pipelines[pipeline_choice]
|
72 |
cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]
|
|
|
209 |
)
|
210 |
|
211 |
with gr.Row():
|
212 |
+
if not is_hf_spaces:
|
213 |
limit_text = "Unlimited"
|
214 |
else:
|
215 |
limit_text = f"Up to {max_characters}"
|
216 |
+
|
217 |
text = gr.Textbox(
|
218 |
label=f"Speech Text ({limit_text} Characters)",
|
219 |
value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
|
220 |
lines=4,
|
221 |
max_lines=20,
|
222 |
+
max_length=max_characters if is_hf_spaces else None,
|
223 |
)
|
224 |
|
225 |
with gr.Row():
|