benjamin-paine committed
Commit 00e0e52 · verified · 1 Parent(s): 133ded5

Create app.py

Files changed (1):
  1. app.py +397 -0

app.py ADDED
@@ -0,0 +1,397 @@
# Install dependencies in application code, as we don't have access to a GPU at build time
# Thanks to https://huggingface.co/Steveeeeeeen for their code to handle this!
import os
import shlex
import subprocess

subprocess.run(shlex.split("pip install flash-attn --no-build-isolation"), env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, check=True)
subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)

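# Note: these prebuilt wheels are pinned to Python 3.10, torch 2.4, and CUDA 12
# (see the "cp310" and "cu12torch2.4" tags in the filenames); they would need to
# be swapped out if the Space's runtime image changes. FLASH_ATTENTION_SKIP_CUDA_BUILD
# lets flash-attn install on a machine with no GPU visible at install time.
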
import spaces
import gradio as gr
import numpy as np

from typing import Tuple, Dict, Any, Optional
from taproot import Task

# Create pipelines, downloading required files as necessary
hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
hybrid_task.download_required_files(text_callback=print)
hybrid_pipe = hybrid_task()
hybrid_pipe.load()

transformer_task = Task.get(
    "speech-synthesis", model="zonos-transformer", available_only=False
)
transformer_task.download_required_files(text_callback=print)
transformer_pipe = transformer_task()
transformer_pipe.load()  # Remove this line if you're running outside of HF spaces to save ~4GB of VRAM

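# Both pipelines are constructed up front; update_ui() below loads the selected
# one and unloads the other, so only one model needs to stay resident at a time.
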
# Global state and configuration
pipelines = {
    "Zonos Transformer v0.1": transformer_pipe,
    "Zonos Hybrid v0.1": hybrid_pipe,
}
pipeline_names = list(pipelines.keys())
supported_language_codes = hybrid_pipe.supported_languages  # Same for both pipes
max_characters = 4500
header_markdown = """
# Zonos v0.1
State-of-the-art text-to-speech model. [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f), [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)
## Unleashed
Use this space to generate long-form speech up to around 4 minutes in length. For unlimited-length generation, clone this space and run it locally, setting the `max_characters` parameter to your desired limit (or `None` for no limit).
### Tips
- If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
- When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
- The cleaner the speaker audio, the better the speaker conditioning. Speaker audio is only sampled at 16kHz, so you do not need to provide a high-bitrate recording. Prefix audio, by contrast, should be high-quality, as it is sampled at the full 44.1kHz.
- The appropriate ranges of Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
- The emotion sliders do not always behave intuitively, and require some experimentation to achieve the desired effect.
""".strip()


# Model toggle
def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
    """
    Dynamically show/hide UI elements based on the model's conditioners.
    """
    for pipeline_name, pipeline in pipelines.items():
        if pipeline_name == pipeline_choice:
            pipeline.load()
        else:
            pipeline.unload()

    pipe = pipelines[pipeline_choice]
    cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]

    vqscore_update = gr.update(visible=("vqscore_8" in cond_names))
    emotion_update = gr.update(visible=("emotion" in cond_names))
    fmax_update = gr.update(visible=("fmax" in cond_names))
    pitch_update = gr.update(visible=("pitch_std" in cond_names))
    speaking_rate_update = gr.update(visible=("speaking_rate" in cond_names))
    dnsmos_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
    speaker_noised_update = gr.update(visible=("speaker_noised" in cond_names))

    return (
        vqscore_update,
        emotion_update,
        fmax_update,
        pitch_update,
        speaking_rate_update,
        dnsmos_update,
        speaker_noised_update,
    )

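# The tuple above is positional: it must stay in the same order as the `outputs`
# lists wired to pipeline_choice.change() and demo.load() at the bottom of this file.
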
# Invocation method
@spaces.GPU(duration=180)
def generate_audio(
    pipeline_choice: str,
    text: str,
    language: str,
    speaker_audio: Optional[str],
    prefix_audio: Optional[str],
    e1: float,
    e2: float,
    e3: float,
    e4: float,
    e5: float,
    e6: float,
    e7: float,
    e8: float,
    vq_single: float,
    fmax: float,
    pitch_std: float,
    speaking_rate: float,
    dnsmos_ovrl: float,
    speaker_noised: bool,
    cfg_scale: float,
    min_p: float,
    seed: int,
    max_chunk_length: int,
    cross_fade_duration: float,
    punctuation_pause_duration: float,
    target_rms: float,
    randomize_seed: bool,
    skip_dnsmos: bool,
    skip_vqscore: bool,
    skip_fmax: bool,
    skip_pitch: bool,
    skip_speaking_rate: bool,
    skip_emotion: bool,
    skip_speaker: bool,
    progress=gr.Progress(),
) -> Tuple[Tuple[int, np.ndarray[Any, Any]], int]:
    """
    Generates audio based on the provided UI parameters.
    """
    selected_pipeline = pipelines[pipeline_choice]
    if randomize_seed:
        seed = np.random.randint(0, 2**32)

    def on_progress(step: int, total: int) -> None:
        progress((step, total))

    selected_pipeline.on_progress(on_progress)
    try:
        wav_out = selected_pipeline(
            text=text,
            language=language,
            reference_audio=speaker_audio,
            prefix_audio=prefix_audio,
            seed=seed,
            max_chunk_length=max_chunk_length,
            cross_fade_duration=cross_fade_duration,
            punctuation_pause_duration=punctuation_pause_duration,
            target_rms=target_rms,
            cfg_scale=cfg_scale,
            min_p=min_p,
            fmax=fmax,
            pitch_std=pitch_std,
            emotion_happiness=e1,
            emotion_sadness=e2,
            emotion_disgust=e3,
            emotion_fear=e4,
            emotion_surprise=e5,
            emotion_anger=e6,
            emotion_other=e7,
            emotion_neutral=e8,
            speaking_rate=speaking_rate,
            vq_score=vq_single,
            speaker_noised=speaker_noised,
            dnsmos=dnsmos_ovrl,
            skip_speaker=skip_speaker,
            skip_dnsmos=skip_dnsmos,
            skip_vq_score=skip_vqscore,
            skip_fmax=skip_fmax,
            skip_pitch=skip_pitch,
            skip_speaking_rate=skip_speaking_rate,
            skip_emotion=skip_emotion,
            output_format="float",
        )

        return (44100, wav_out.squeeze().numpy()), seed
    finally:
        selected_pipeline.off_progress()

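# Zonos outputs 44.1kHz audio, hence the hardcoded sample rate above; the progress
# callback is detached in `finally` so a failed generation never leaves a stale handler.
# For local testing outside the UI, the pipeline can also be invoked directly with the
# same keyword arguments (a minimal sketch, assuming defaults for everything else):
#     wav = hybrid_pipe(text="Hello world.", language="en-us", seed=42, output_format="float")
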
# Interface

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown(header_markdown)
        gr.Image(
            value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
            container=False,
            interactive=False,
            show_label=False,
            show_share_button=False,
            show_fullscreen_button=False,
            show_download_button=False,
        )

    with gr.Row(equal_height=True):
        pipeline_choice = gr.Dropdown(
            choices=pipeline_names,
            value=pipeline_names[0],
            label="Zonos Model Variant",
        )
        language = gr.Dropdown(
            choices=supported_language_codes,
            value="en-us",
            label="Language",
        )

    with gr.Row():
        if max_characters is None:
            limit_text = "Unlimited"
        else:
            limit_text = f"Up to {max_characters}"
        text = gr.Textbox(
            label=f"Speech Text ({limit_text} Characters)",
            value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
            lines=4,
            max_lines=20,
            max_length=max_characters,
        )

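    # max_length enforces the character cap in the textbox itself; setting
    # max_characters = None near the top of the file lifts the limit entirely.
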
    with gr.Row():
        generate_button = gr.Button("Generate Audio")

    with gr.Row():
        output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)

    with gr.Row():
        gr.Markdown("## Long-Form Parameters")

    with gr.Column(variant="panel"):
        with gr.Row(equal_height=True):
            max_chunk_length = gr.Slider(
                1, 300, 150, 1, label="Max Chunk Length (Characters)",
                info="The maximum number of characters to generate in a single chunk. Zonos itself has a much higher limit than this, but consistency breaks down as you go past ~200 characters or so."
            )
            target_rms = gr.Slider(
                0.0, 1.0, 0.10, 0.01, label="Target RMS",
                info="The target RMS (root-mean-square) amplitude for the generated audio. Each chunk will have its loudness normalized to this value to ensure consistent volume levels."
            )
        with gr.Row(equal_height=True):
            punctuation_pause_duration = gr.Slider(
                0, 1, 0.10, 0.01, label="Punctuation Pause Duration (Seconds)",
                info="Pause duration to add after a chunk that ends with punctuation. Full-stop punctuation (periods) will receive the entire pause duration, while other punctuation will use half of it."
            )
            cross_fade_duration = gr.Slider(
                0, 1, 0.15, 0.01, label="Chunk Cross-Fade Duration (Seconds)",
                info="The duration of the cross-fade between chunks. This helps to smooth out transitions between chunks. In general, this should be set to a value greater than the pause duration."
            )

    with gr.Row():
        gr.Markdown("## Generation Parameters")

    with gr.Row(variant="panel", equal_height=True):
        with gr.Column():
            prefix_audio = gr.Audio(
                label="Optional Prefix Audio (continue from this audio)",
                type="filepath",
            )
        with gr.Column(scale=3):
            cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
            min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
            seed_number = gr.Number(label="Seed", value=6475309, precision=0)
            randomize_seed_toggle = gr.Checkbox(label="Randomize Seed", value=True)

    with gr.Row():
        gr.Markdown(
            "## Conditioning Parameters\nAll of these types of conditioning are optional and can be disabled."
        )

    with gr.Row(variant="panel", equal_height=True) as speaker_row:
        with gr.Column():
            speaker_uncond = gr.Checkbox(label="Skip Speaker")
            speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker", value=False)

        speaker_audio = gr.Audio(
            label="Optional Speaker Audio (for cloning)",
            type="filepath",
            scale=3,
        )

    with gr.Row(variant="panel", equal_height=True) as emotion_row:
        emotion_uncond = gr.Checkbox(label="Skip Emotion")
        with gr.Column(scale=3):
            with gr.Row():
                emotion1 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Happiness")
                emotion2 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Sadness")
                emotion3 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Disgust")
                emotion4 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Fear")
            with gr.Row():
                emotion5 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Surprise")
                emotion6 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Anger")
                emotion7 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Other")
                emotion8 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Neutral")

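    # The eight sliders feed e1..e8 of generate_audio positionally (happiness,
    # sadness, disgust, fear, surprise, anger, other, neutral); the defaults
    # weight happiness and neutral at 0.307 and everything else at 0.025.
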
    with gr.Row(variant="panel", equal_height=True) as dnsmos_row:
        dnsmos_uncond = gr.Checkbox(label="Skip DNSMOS")
        dnsmos_slider = gr.Slider(
            1.0,
            5.0,
            value=4.0,
            step=0.1,
            label="Deep Noise Suppression Mean Opinion Score [arXiv 2010.15258]",
            scale=3,
        )

    with gr.Row(variant="panel", equal_height=True) as vq_score_row:
        vq_uncond = gr.Checkbox(label="Skip VQScore")
        vq_single_slider = gr.Slider(
            0.5, 0.8, 0.78, 0.01, label="VQScore [arXiv 2402.16321]", scale=3
        )

    with gr.Row(variant="panel", equal_height=True) as fmax_row:
        fmax_uncond = gr.Checkbox(label="Skip Fmax")
        fmax_slider = gr.Slider(
            0, 22050, value=22050, step=1, label="Fmax (Hz)", scale=3
        )

    with gr.Row(variant="panel", equal_height=True) as pitch_row:
        pitch_uncond = gr.Checkbox(label="Skip Pitch")
        pitch_std_slider = gr.Slider(
            0.0, 300.0, value=20.0, step=1, label="Pitch Standard Deviation", scale=3
        )

    with gr.Row(variant="panel", equal_height=True) as speaking_rate_row:
        speaking_rate_uncond = gr.Checkbox(label="Skip Speaking Rate")
        speaking_rate_slider = gr.Slider(
            5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", scale=3
        )

    pipeline_choice.change(
        fn=update_ui,
        inputs=[pipeline_choice],
        outputs=[
            vq_score_row,
            emotion_row,
            fmax_row,
            pitch_row,
            speaking_rate_row,
            dnsmos_row,
            speaker_noised_checkbox,
        ],
    )

    # Trigger UI update on load
    demo.load(
        fn=update_ui,
        inputs=[pipeline_choice],
        outputs=[
            vq_score_row,
            emotion_row,
            fmax_row,
            pitch_row,
            speaking_rate_row,
            dnsmos_row,
            speaker_noised_checkbox,
        ],
    )

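    # Since update_ui() unloads whichever pipeline is not selected, the load
    # handler above also releases the memory claimed by loading both models at
    # startup before the first generation runs.
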
    # Generate audio on button click
    generate_button.click(
        fn=generate_audio,
        inputs=[
            pipeline_choice,
            text,
            language,
            speaker_audio,
            prefix_audio,
            emotion1,
            emotion2,
            emotion3,
            emotion4,
            emotion5,
            emotion6,
            emotion7,
            emotion8,
            vq_single_slider,
            fmax_slider,
            pitch_std_slider,
            speaking_rate_slider,
            dnsmos_slider,
            speaker_noised_checkbox,
            cfg_scale_slider,
            min_p_slider,
            seed_number,
            max_chunk_length,
            cross_fade_duration,
            punctuation_pause_duration,
            target_rms,
            randomize_seed_toggle,
            dnsmos_uncond,
            vq_uncond,
            fmax_uncond,
            pitch_uncond,
            speaking_rate_uncond,
            emotion_uncond,
            speaker_uncond,
        ],
        outputs=[output_audio, seed_number],
    )

demo.launch(server_name="0.0.0.0", server_port=7860, share=False, ssr_mode=False)