benjamin-paine committed on
Commit
634a721
·
verified ·
1 Parent(s): 9516a6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -212
app.py CHANGED
@@ -181,225 +181,225 @@ def generate_audio(
181
  selected_pipeline.off_progress()
182
 
183
  # Interface
184
-
185
- with gr.Blocks() as demo:
186
- with gr.Row():
187
- with gr.Column(scale=3):
188
- gr.Markdown(header_markdown)
189
- gr.Image(
190
- value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
191
- container=False,
192
- interactive=False,
193
- show_label=False,
194
- show_share_button=False,
195
- show_fullscreen_button=False,
196
- show_download_button=False,
197
- )
198
-
199
- with gr.Row(equal_height=True):
200
- pipeline_choice = gr.Dropdown(
201
- choices=pipeline_names,
202
- value=pipeline_names[0],
203
- label="Zonos Model Variant",
204
- )
205
- language = gr.Dropdown(
206
- choices=supported_language_codes,
207
- value="en-us",
208
- label="Language",
209
- )
210
-
211
- with gr.Row():
212
- if not is_hf_spaces:
213
- limit_text = "Unlimited"
214
- else:
215
- limit_text = f"Up to {max_characters}"
216
-
217
- text = gr.Textbox(
218
- label=f"Speech Text ({limit_text} Characters)",
219
- value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
220
- lines=4,
221
- max_lines=20,
222
- max_length=max_characters if is_hf_spaces else None,
223
- )
224
-
225
- with gr.Row():
226
- generate_button = gr.Button("Generate Audio")
227
-
228
- with gr.Row():
229
- output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)
230
-
231
- with gr.Row():
232
- gr.Markdown("## Long-Form Parameters")
233
-
234
- with gr.Column(variant="panel"):
235
  with gr.Row(equal_height=True):
236
- max_chunk_length = gr.Slider(
237
- 1, 300, 150, 1, label="Max Chunk Length (Characters)",
238
- info="The maximum number of characters to generate in a single chunk. Zonos itself has a much higher limit than this, but consistency breaks down as you go past ~200 characters or so."
 
239
  )
240
- target_rms = gr.Slider(
241
- 0.0, 1.0, 0.10, 0.01, label="Target RMS",
242
- info="The target RMS (root-mean-square) amplitude for the generated audio. Each chunk will have its loudness normalized to this value to ensure consistent volume levels."
 
243
  )
244
- with gr.Row(equal_height=True):
245
- punctuation_pause_duration = gr.Slider(
246
- 0, 1, 0.10, 0.01, label="Punctuation Pause Duration (Seconds)",
247
- info="Pause duration to add after a chunk that ends with punctuation. Full-stop punctuation (periods) will have the entire length, while shorter pauses will use half of this duration."
 
 
 
 
 
 
 
 
 
248
  )
249
- cross_fade_duration = gr.Slider(
250
- 0, 1, 0.15, 0.01, label="Chunk Cross-Fade Duration (Seconds)",
251
- info="The duration of the cross-fade between chunks. This helps to smooth out transitions between chunks. In general, this should be set to a value greater than the pause duration."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  )
253
-
254
- with gr.Row():
255
- gr.Markdown("## Generation Parameters")
256
-
257
- with gr.Row(variant="panel", equal_height=True):
258
- with gr.Column():
259
- prefix_audio = gr.Audio(
260
- label="Optional Prefix Audio (continue from this audio)",
261
  type="filepath",
 
262
  )
263
- with gr.Column(scale=3):
264
- cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
265
- min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
266
- seed_number = gr.Number(label="Seed", value=6475309, precision=0)
267
- randomize_seed_toggle = gr.Checkbox(label="Randomize Seed", value=True)
268
-
269
- with gr.Row():
270
- gr.Markdown(
271
- "## Conditioning Parameters\nAll of these types of conditioning are optional and can be disabled."
272
- )
273
-
274
- with gr.Row(variant="panel", equal_height=True) as speaker_row:
275
- with gr.Column():
276
- speaker_uncond = gr.Checkbox(label="Skip Speaker")
277
- speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker", value=False)
278
-
279
- speaker_audio = gr.Audio(
280
- label="Optional Speaker Audio (for cloning)",
281
- type="filepath",
282
- scale=3,
283
- )
284
-
285
- with gr.Row(variant="panel", equal_height=True) as emotion_row:
286
- emotion_uncond = gr.Checkbox(label="Skip Emotion")
287
- with gr.Column(scale=3):
288
- with gr.Row():
289
- emotion1 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Happiness")
290
- emotion2 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Sadness")
291
- emotion3 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Disgust")
292
- emotion4 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Fear")
293
- with gr.Row():
294
- emotion5 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Surprise")
295
- emotion6 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Anger")
296
- emotion7 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Other")
297
- emotion8 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Neutral")
298
-
299
- with gr.Row(variant="panel", equal_height=True) as dnsmos_row:
300
- dnsmos_uncond = gr.Checkbox(label="Skip DNSMOS")
301
- dnsmos_slider = gr.Slider(
302
- 1.0,
303
- 5.0,
304
- value=4.0,
305
- step=0.1,
306
- label="Deep Noise Suppression Mean Opinion Score [arXiv 2010.15258]",
307
- scale=3,
308
- )
309
-
310
- with gr.Row(variant="panel", equal_height=True) as vq_score_row:
311
- vq_uncond = gr.Checkbox(label="Skip VQScore")
312
- vq_single_slider = gr.Slider(
313
- 0.5, 0.8, 0.78, 0.01, label="VQScore [arXiv 2402.16321]", scale=3
314
- )
315
-
316
- with gr.Row(variant="panel", equal_height=True) as fmax_row:
317
- fmax_uncond = gr.Checkbox(label="Skip Fmax")
318
- fmax_slider = gr.Slider(
319
- 0, 22050, value=22050, step=1, label="Fmax (Hz)", scale=3
 
 
 
 
 
320
  )
321
-
322
- with gr.Row(variant="panel", equal_height=True) as pitch_row:
323
- pitch_uncond = gr.Checkbox(label="Skip Pitch")
324
- pitch_std_slider = gr.Slider(
325
- 0.0, 300.0, value=20.0, step=1, label="Pitch Standard Deviation", scale=3
 
 
 
 
 
 
 
 
 
326
  )
327
-
328
- with gr.Row(variant="panel", equal_height=True) as speaking_rate_row:
329
- speaking_rate_uncond = gr.Checkbox(label="Skip Speaking Rate")
330
- speaking_rate_slider = gr.Slider(
331
- 5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", scale=3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  )
333
-
334
- pipeline_choice.change(
335
- fn=update_ui,
336
- inputs=[pipeline_choice],
337
- outputs=[
338
- vq_score_row,
339
- emotion_row,
340
- fmax_row,
341
- pitch_row,
342
- speaking_rate_row,
343
- dnsmos_row,
344
- speaker_noised_checkbox,
345
- ],
346
- )
347
-
348
- # Trigger UI update on load
349
- demo.load(
350
- fn=update_ui,
351
- inputs=[pipeline_choice],
352
- outputs=[
353
- vq_score_row,
354
- emotion_row,
355
- fmax_row,
356
- pitch_row,
357
- speaking_rate_row,
358
- dnsmos_row,
359
- speaker_noised_checkbox,
360
- ],
361
- )
362
-
363
- # Generate audio on button click
364
- generate_button.click(
365
- fn=generate_audio,
366
- inputs=[
367
- pipeline_choice,
368
- text,
369
- language,
370
- speaker_audio,
371
- prefix_audio,
372
- emotion1,
373
- emotion2,
374
- emotion3,
375
- emotion4,
376
- emotion5,
377
- emotion6,
378
- emotion7,
379
- emotion8,
380
- vq_single_slider,
381
- fmax_slider,
382
- pitch_std_slider,
383
- speaking_rate_slider,
384
- dnsmos_slider,
385
- speaker_noised_checkbox,
386
- cfg_scale_slider,
387
- min_p_slider,
388
- seed_number,
389
- max_chunk_length,
390
- cross_fade_duration,
391
- punctuation_pause_duration,
392
- target_rms,
393
- randomize_seed_toggle,
394
- dnsmos_uncond,
395
- vq_uncond,
396
- fmax_uncond,
397
- pitch_uncond,
398
- speaking_rate_uncond,
399
- emotion_uncond,
400
- speaker_uncond,
401
- ],
402
- outputs=[output_audio, seed_number],
403
- )
404
-
405
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False, ssr_mode=False)
 
181
  selected_pipeline.off_progress()
182
 
183
  # Interface
184
+ if __name__ == "__main__":
185
+ with gr.Blocks() as demo:
186
+ with gr.Row():
187
+ with gr.Column(scale=3):
188
+ gr.Markdown(header_markdown)
189
+ gr.Image(
190
+ value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
191
+ container=False,
192
+ interactive=False,
193
+ show_label=False,
194
+ show_share_button=False,
195
+ show_fullscreen_button=False,
196
+ show_download_button=False,
197
+ )
198
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  with gr.Row(equal_height=True):
200
+ pipeline_choice = gr.Dropdown(
201
+ choices=pipeline_names,
202
+ value=pipeline_names[0],
203
+ label="Zonos Model Variant",
204
  )
205
+ language = gr.Dropdown(
206
+ choices=supported_language_codes,
207
+ value="en-us",
208
+ label="Language",
209
  )
210
+
211
+ with gr.Row():
212
+ if not is_hf_spaces:
213
+ limit_text = "Unlimited"
214
+ else:
215
+ limit_text = f"Up to {max_characters}"
216
+
217
+ text = gr.Textbox(
218
+ label=f"Speech Text ({limit_text} Characters)",
219
+ value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
220
+ lines=4,
221
+ max_lines=20,
222
+ max_length=max_characters if is_hf_spaces else None,
223
  )
224
+
225
+ with gr.Row():
226
+ generate_button = gr.Button("Generate Audio")
227
+
228
+ with gr.Row():
229
+ output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)
230
+
231
+ with gr.Row():
232
+ gr.Markdown("## Long-Form Parameters")
233
+
234
+ with gr.Column(variant="panel"):
235
+ with gr.Row(equal_height=True):
236
+ max_chunk_length = gr.Slider(
237
+ 1, 300, 150, 1, label="Max Chunk Length (Characters)",
238
+ info="The maximum number of characters to generate in a single chunk. Zonos itself has a much higher limit than this, but consistency breaks down as you go past ~200 characters or so."
239
+ )
240
+ target_rms = gr.Slider(
241
+ 0.0, 1.0, 0.10, 0.01, label="Target RMS",
242
+ info="The target RMS (root-mean-square) amplitude for the generated audio. Each chunk will have its loudness normalized to this value to ensure consistent volume levels."
243
+ )
244
+ with gr.Row(equal_height=True):
245
+ punctuation_pause_duration = gr.Slider(
246
+ 0, 1, 0.10, 0.01, label="Punctuation Pause Duration (Seconds)",
247
+ info="Pause duration to add after a chunk that ends with punctuation. Full-stop punctuation (periods) will have the entire length, while shorter pauses will use half of this duration."
248
+ )
249
+ cross_fade_duration = gr.Slider(
250
+ 0, 1, 0.15, 0.01, label="Chunk Cross-Fade Duration (Seconds)",
251
+ info="The duration of the cross-fade between chunks. This helps to smooth out transitions between chunks. In general, this should be set to a value greater than the pause duration."
252
+ )
253
+
254
+ with gr.Row():
255
+ gr.Markdown("## Generation Parameters")
256
+
257
+ with gr.Row(variant="panel", equal_height=True):
258
+ with gr.Column():
259
+ prefix_audio = gr.Audio(
260
+ label="Optional Prefix Audio (continue from this audio)",
261
+ type="filepath",
262
+ )
263
+ with gr.Column(scale=3):
264
+ cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
265
+ min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
266
+ seed_number = gr.Number(label="Seed", value=6475309, precision=0)
267
+ randomize_seed_toggle = gr.Checkbox(label="Randomize Seed", value=True)
268
+
269
+ with gr.Row():
270
+ gr.Markdown(
271
+ "## Conditioning Parameters\nAll of these types of conditioning are optional and can be disabled."
272
  )
273
+
274
+ with gr.Row(variant="panel", equal_height=True) as speaker_row:
275
+ with gr.Column():
276
+ speaker_uncond = gr.Checkbox(label="Skip Speaker")
277
+ speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker", value=False)
278
+
279
+ speaker_audio = gr.Audio(
280
+ label="Optional Speaker Audio (for cloning)",
281
  type="filepath",
282
+ scale=3,
283
  )
284
+
285
+ with gr.Row(variant="panel", equal_height=True) as emotion_row:
286
+ emotion_uncond = gr.Checkbox(label="Skip Emotion")
287
+ with gr.Column(scale=3):
288
+ with gr.Row():
289
+ emotion1 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Happiness")
290
+ emotion2 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Sadness")
291
+ emotion3 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Disgust")
292
+ emotion4 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Fear")
293
+ with gr.Row():
294
+ emotion5 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Surprise")
295
+ emotion6 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Anger")
296
+ emotion7 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Other")
297
+ emotion8 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Neutral")
298
+
299
+ with gr.Row(variant="panel", equal_height=True) as dnsmos_row:
300
+ dnsmos_uncond = gr.Checkbox(label="Skip DNSMOS")
301
+ dnsmos_slider = gr.Slider(
302
+ 1.0,
303
+ 5.0,
304
+ value=4.0,
305
+ step=0.1,
306
+ label="Deep Noise Suppression Mean Opinion Score [arXiv 2010.15258]",
307
+ scale=3,
308
+ )
309
+
310
+ with gr.Row(variant="panel", equal_height=True) as vq_score_row:
311
+ vq_uncond = gr.Checkbox(label="Skip VQScore")
312
+ vq_single_slider = gr.Slider(
313
+ 0.5, 0.8, 0.78, 0.01, label="VQScore [arXiv 2402.16321]", scale=3
314
+ )
315
+
316
+ with gr.Row(variant="panel", equal_height=True) as fmax_row:
317
+ fmax_uncond = gr.Checkbox(label="Skip Fmax")
318
+ fmax_slider = gr.Slider(
319
+ 0, 22050, value=22050, step=1, label="Fmax (Hz)", scale=3
320
+ )
321
+
322
+ with gr.Row(variant="panel", equal_height=True) as pitch_row:
323
+ pitch_uncond = gr.Checkbox(label="Skip Pitch")
324
+ pitch_std_slider = gr.Slider(
325
+ 0.0, 300.0, value=20.0, step=1, label="Pitch Standard Deviation", scale=3
326
+ )
327
+
328
+ with gr.Row(variant="panel", equal_height=True) as speaking_rate_row:
329
+ speaking_rate_uncond = gr.Checkbox(label="Skip Speaking Rate")
330
+ speaking_rate_slider = gr.Slider(
331
+ 5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", scale=3
332
+ )
333
+
334
+ pipeline_choice.change(
335
+ fn=update_ui,
336
+ inputs=[pipeline_choice],
337
+ outputs=[
338
+ vq_score_row,
339
+ emotion_row,
340
+ fmax_row,
341
+ pitch_row,
342
+ speaking_rate_row,
343
+ dnsmos_row,
344
+ speaker_noised_checkbox,
345
+ ],
346
  )
347
+
348
+ # Trigger UI update on load
349
+ demo.load(
350
+ fn=update_ui,
351
+ inputs=[pipeline_choice],
352
+ outputs=[
353
+ vq_score_row,
354
+ emotion_row,
355
+ fmax_row,
356
+ pitch_row,
357
+ speaking_rate_row,
358
+ dnsmos_row,
359
+ speaker_noised_checkbox,
360
+ ],
361
  )
362
+
363
+ # Generate audio on button click
364
+ generate_button.click(
365
+ fn=generate_audio,
366
+ inputs=[
367
+ pipeline_choice,
368
+ text,
369
+ language,
370
+ speaker_audio,
371
+ prefix_audio,
372
+ emotion1,
373
+ emotion2,
374
+ emotion3,
375
+ emotion4,
376
+ emotion5,
377
+ emotion6,
378
+ emotion7,
379
+ emotion8,
380
+ vq_single_slider,
381
+ fmax_slider,
382
+ pitch_std_slider,
383
+ speaking_rate_slider,
384
+ dnsmos_slider,
385
+ speaker_noised_checkbox,
386
+ cfg_scale_slider,
387
+ min_p_slider,
388
+ seed_number,
389
+ max_chunk_length,
390
+ cross_fade_duration,
391
+ punctuation_pause_duration,
392
+ target_rms,
393
+ randomize_seed_toggle,
394
+ dnsmos_uncond,
395
+ vq_uncond,
396
+ fmax_uncond,
397
+ pitch_uncond,
398
+ speaking_rate_uncond,
399
+ emotion_uncond,
400
+ speaker_uncond,
401
+ ],
402
+ outputs=[output_audio, seed_number],
403
  )
404
+
405
+ demo.launch(server_name="0.0.0.0", server_port=7860, share=False, ssr_mode=False)