benjamin-paine committed on
Commit
9516a6e
·
verified ·
1 Parent(s): 11649ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -23
app.py CHANGED
@@ -15,6 +15,22 @@ import numpy as np
15
  from typing import Tuple, Dict, Any, Optional
16
  from taproot import Task
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # Create pipelines, downloading required files as necessary
19
  hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
20
  hybrid_task.download_required_files(text_callback=print)
@@ -26,40 +42,31 @@ transformer_task = Task.get(
26
  )
27
  transformer_task.download_required_files(text_callback=print)
28
  transformer_pipe = transformer_task()
29
- transformer_pipe.load() # Remove this line if you're running outside of HF spaces to save ~4GB of VRAM
30
 
31
- # Global state and configuration
 
 
 
 
32
  pipelines = {
33
  "Zonos Transformer v0.1": transformer_pipe,
34
  "Zonos Hybrid v0.1": hybrid_pipe,
35
  }
36
  pipeline_names = list(pipelines.keys())
37
  supported_language_codes = hybrid_pipe.supported_languages # Same for both pipes
38
- max_characters = 4500
39
- header_markdown = """
40
- # Zonos v0.1
41
- State of the art text-to-speech model [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f). [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)
42
- ## Unleashed
43
- Use this space to generate long-form speech up to around ~4 minutes in length. To generate an unlimited length, clone this space and run it locally, modifying the `max_characters` parameter to your desired length (or None for unlimited).
44
- ### Tips
45
- - If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
46
- - When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
47
- - The cleaner the speaker audio, the better the speaker conditioning will be - however, speaker audio is only sampled at 16kHz, so you do not need to provide high-bitrate speaker audio. Unlike this, however, prefix audio should be high-quality, as it is sampled at the full 44.1kHz.
48
- - The appropriate range of Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
49
- - Emotion sliders do not completely function intuitively, and require some experimentation to get the desired effect.
50
- """.strip()
51
-
52
 
53
  # Model toggle
54
  def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
55
  """
56
  Dynamically show/hide UI elements based on the model's conditioners.
57
  """
58
- for pipeline_name, pipeline in pipelines.items():
59
- if pipeline_name == pipeline_choice:
60
- pipeline.load()
61
- else:
62
- pipeline.unload()
 
 
63
 
64
  pipe = pipelines[pipeline_choice]
65
  cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]
@@ -202,16 +209,17 @@ with gr.Blocks() as demo:
202
  )
203
 
204
  with gr.Row():
205
- if max_characters is None:
206
  limit_text = "Unlimited"
207
  else:
208
  limit_text = f"Up to {max_characters}"
 
209
  text = gr.Textbox(
210
  label=f"Speech Text ({limit_text} Characters)",
211
  value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
212
  lines=4,
213
  max_lines=20,
214
- max_length=max_characters,
215
  )
216
 
217
  with gr.Row():
 
15
  from typing import Tuple, Dict, Any, Optional
16
  from taproot import Task
17
 
18
+ # Configuration
19
+ is_hf_spaces = True # Set to False when running locally
20
+ max_characters = 4500
21
+ header_markdown = """
22
+ # Zonos v0.1
23
+ State of the art text-to-speech model [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f). [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)
24
+ ## Unleashed
25
+ Use this space to generate long-form speech up to around ~4 minutes in length. To generate an unlimited length, clone this space and run it locally, modifying the `max_characters` parameter to your desired length (or None for unlimited).
26
+ ### Tips
27
+ - If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
28
+ - When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
29
+ - The cleaner the speaker audio, the better the speaker conditioning will be - however, speaker audio is only sampled at 16kHz, so you do not need to provide high-bitrate speaker audio. Unlike this, however, prefix audio should be high-quality, as it is sampled at the full 44.1kHz.
30
+ - The appropriate range of Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
31
+ - Emotion sliders do not completely function intuitively, and require some experimentation to get the desired effect.
32
+ """.strip()
33
+
34
  # Create pipelines, downloading required files as necessary
35
  hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
36
  hybrid_task.download_required_files(text_callback=print)
 
42
  )
43
  transformer_task.download_required_files(text_callback=print)
44
  transformer_pipe = transformer_task()
 
45
 
46
+ if is_hf_spaces:
47
+ # Must load all models on GPU when using ZeroGPU
48
+ transformer_pipe.load()
49
+
50
+ # Global state
51
  pipelines = {
52
  "Zonos Transformer v0.1": transformer_pipe,
53
  "Zonos Hybrid v0.1": hybrid_pipe,
54
  }
55
  pipeline_names = list(pipelines.keys())
56
  supported_language_codes = hybrid_pipe.supported_languages # Same for both pipes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  # Model toggle
59
  def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
60
  """
61
  Dynamically show/hide UI elements based on the model's conditioners.
62
  """
63
+ if not is_hf_spaces:
64
+ # When not using ZeroGPU, we can onload/offload pipes
65
+ for pipeline_name, pipeline in pipelines.items():
66
+ if pipeline_name == pipeline_choice:
67
+ pipeline.load()
68
+ else:
69
+ pipeline.unload()
70
 
71
  pipe = pipelines[pipeline_choice]
72
  cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]
 
209
  )
210
 
211
  with gr.Row():
212
+ if not is_hf_spaces:
213
  limit_text = "Unlimited"
214
  else:
215
  limit_text = f"Up to {max_characters}"
216
+
217
  text = gr.Textbox(
218
  label=f"Speech Text ({limit_text} Characters)",
219
  value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
220
  lines=4,
221
  max_lines=20,
222
+ max_length=max_characters if is_hf_spaces else None,
223
  )
224
 
225
  with gr.Row():