ginipick committed on
Commit fe178db · verified · 1 Parent(s): e7ef00b

Create app-backup.py

Files changed (1)
  1. app-backup.py +370 -0
app-backup.py ADDED
@@ -0,0 +1,370 @@
+import os
+import shlex
+import subprocess
+
+subprocess.run(shlex.split("pip install flash-attn --no-build-isolation"), env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, check=True)
+subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
+subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
+
+import spaces
+import torch
+import torchaudio
+import gradio as gr
+from os import getenv
+
+from zonos.model import Zonos
+from zonos.conditioning import make_cond_dict, supported_language_codes
+
+device = "cuda"
+MODEL_NAMES = ["Zyphra/Zonos-v0.1-transformer", "Zyphra/Zonos-v0.1-hybrid"]
+MODELS = {name: Zonos.from_pretrained(name, device=device) for name in MODEL_NAMES}
+for model in MODELS.values():
+    model.requires_grad_(False).eval()
+
+
+def update_ui(model_choice):
+    """
+    Dynamically show/hide UI elements based on the model's conditioners.
+    We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model.
+    """
+    model = MODELS[model_choice]
+    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
+    print("Conditioners in this model:", cond_names)
+
+    text_update = gr.update(visible=("espeak" in cond_names))
+    language_update = gr.update(visible=("espeak" in cond_names))
+    speaker_audio_update = gr.update(visible=("speaker" in cond_names))
+    prefix_audio_update = gr.update(visible=True)
+    emotion1_update = gr.update(visible=("emotion" in cond_names))
+    emotion2_update = gr.update(visible=("emotion" in cond_names))
+    emotion3_update = gr.update(visible=("emotion" in cond_names))
+    emotion4_update = gr.update(visible=("emotion" in cond_names))
+    emotion5_update = gr.update(visible=("emotion" in cond_names))
+    emotion6_update = gr.update(visible=("emotion" in cond_names))
+    emotion7_update = gr.update(visible=("emotion" in cond_names))
+    emotion8_update = gr.update(visible=("emotion" in cond_names))
+    vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names))
+    fmax_slider_update = gr.update(visible=("fmax" in cond_names))
+    pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names))
+    speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names))
+    dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
+    speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names))
+    unconditional_keys_update = gr.update(
+        choices=[name for name in cond_names if name not in ("espeak", "language_id")]
+    )
+
+    return (
+        text_update,
+        language_update,
+        speaker_audio_update,
+        prefix_audio_update,
+        emotion1_update,
+        emotion2_update,
+        emotion3_update,
+        emotion4_update,
+        emotion5_update,
+        emotion6_update,
+        emotion7_update,
+        emotion8_update,
+        vq_single_slider_update,
+        fmax_slider_update,
+        pitch_std_slider_update,
+        speaking_rate_slider_update,
+        dnsmos_slider_update,
+        speaker_noised_checkbox_update,
+        unconditional_keys_update,
+    )
+
+
+@spaces.GPU(duration=120)
+def generate_audio(
+    model_choice,
+    text,
+    language,
+    speaker_audio,
+    prefix_audio,
+    e1,
+    e2,
+    e3,
+    e4,
+    e5,
+    e6,
+    e7,
+    e8,
+    vq_single,
+    fmax,
+    pitch_std,
+    speaking_rate,
+    dnsmos_ovrl,
+    speaker_noised,
+    cfg_scale,
+    min_p,
+    seed,
+    randomize_seed,
+    unconditional_keys,
+    progress=gr.Progress(),
+):
+    """
+    Generates audio based on the provided UI parameters.
+    We do NOT use language_id or ctc_loss even if the model has them.
+    """
+    selected_model = MODELS[model_choice]
+
+    speaker_noised_bool = bool(speaker_noised)
+    fmax = float(fmax)
+    pitch_std = float(pitch_std)
+    speaking_rate = float(speaking_rate)
+    dnsmos_ovrl = float(dnsmos_ovrl)
+    cfg_scale = float(cfg_scale)
+    min_p = float(min_p)
+    seed = int(seed)
+    max_new_tokens = 86 * 30
+
+    if randomize_seed:
+        seed = torch.randint(0, 2**32 - 1, (1,)).item()
+    torch.manual_seed(seed)
+
+    speaker_embedding = None
+    if speaker_audio is not None and "speaker" not in unconditional_keys:
+        wav, sr = torchaudio.load(speaker_audio)
+        speaker_embedding = selected_model.make_speaker_embedding(wav, sr)
+        speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)
+
+    audio_prefix_codes = None
+    if prefix_audio is not None:
+        wav_prefix, sr_prefix = torchaudio.load(prefix_audio)
+        wav_prefix = wav_prefix.mean(0, keepdim=True)
+        wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate)
+        wav_prefix = wav_prefix.to(device, dtype=torch.float32)
+        with torch.autocast(device, dtype=torch.float32):
+            audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0))
+
+    emotion_tensor = torch.tensor(list(map(float, [e1, e2, e3, e4, e5, e6, e7, e8])), device=device)
+
+    vq_val = float(vq_single)
+    vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0)
+
+    cond_dict = make_cond_dict(
+        text=text,
+        language=language,
+        speaker=speaker_embedding,
+        emotion=emotion_tensor,
+        vqscore_8=vq_tensor,
+        fmax=fmax,
+        pitch_std=pitch_std,
+        speaking_rate=speaking_rate,
+        dnsmos_ovrl=dnsmos_ovrl,
+        speaker_noised=speaker_noised_bool,
+        device=device,
+        unconditional_keys=unconditional_keys,
+    )
+    conditioning = selected_model.prepare_conditioning(cond_dict)
+
+    estimated_generation_duration = 30 * len(text) / 400
+    estimated_total_steps = int(estimated_generation_duration * 86)
+
+    def update_progress(_frame: torch.Tensor, step: int, _total_steps: int) -> bool:
+        progress((step, estimated_total_steps))
+        return True
+
+    codes = selected_model.generate(
+        prefix_conditioning=conditioning,
+        audio_prefix_codes=audio_prefix_codes,
+        max_new_tokens=max_new_tokens,
+        cfg_scale=cfg_scale,
+        batch_size=1,
+        sampling_params=dict(min_p=min_p),
+        callback=update_progress,
+    )
+
+    wav_out = selected_model.autoencoder.decode(codes).cpu().detach()
+    sr_out = selected_model.autoencoder.sampling_rate
+    if wav_out.dim() == 2 and wav_out.size(0) > 1:
+        wav_out = wav_out[0:1, :]
+    return (sr_out, wav_out.squeeze().numpy()), seed
+
+
+def build_interface():
+    with gr.Blocks(theme='ParityError/Interstellar') as demo:
+        with gr.Row():
+            with gr.Column():
+                model_choice = gr.Dropdown(
+                    choices=MODEL_NAMES,
+                    value="Zyphra/Zonos-v0.1-transformer",
+                    label="Zonos Model Type",
+                    info="Select the model variant to use.",
+                )
+                text = gr.Textbox(
+                    label="Text to Synthesize",
+                    value="Zonos uses eSpeak for text to phoneme conversion!",
+                    lines=4,
+                    max_length=500,  # approximately
+                )
+                language = gr.Dropdown(
+                    choices=supported_language_codes,
+                    value="en-us",
+                    label="Language Code",
+                    info="Select a language code.",
+                )
+                prefix_audio = gr.Audio(
+                    value="assets/silence_100ms.wav",
+                    label="Optional Prefix Audio (continue from this audio)",
+                    type="filepath",
+                )
+            with gr.Column():
+                speaker_audio = gr.Audio(
+                    label="Optional Speaker Audio (for cloning)",
+                    type="filepath",
+                )
+                speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)
+
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("## Conditioning Parameters")
+                dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="DNSMOS Overall")
+                fmax_slider = gr.Slider(0, 24000, value=24000, step=1, label="Fmax (Hz)")
+                vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="VQ Score")
+                pitch_std_slider = gr.Slider(0.0, 300.0, value=45.0, step=1, label="Pitch Std")
+                speaking_rate_slider = gr.Slider(5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate")
+
+            with gr.Column():
+                gr.Markdown("## Generation Parameters")
+                cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
+                min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
+                seed_number = gr.Number(label="Seed", value=420, precision=0)
+                randomize_seed_toggle = gr.Checkbox(label="Randomize Seed (before generation)", value=True)
+
+        with gr.Accordion("Advanced Parameters", open=False):
+            gr.Markdown(
+                "### Unconditional Toggles\n"
+                "Checking a box will make the model ignore the corresponding conditioning value and make it unconditional.\n"
+                'Practically this means the given conditioning feature will be unconstrained and "filled in automatically".'
+            )
+            with gr.Row():
+                unconditional_keys = gr.CheckboxGroup(
+                    [
+                        "speaker",
+                        "emotion",
+                        "vqscore_8",
+                        "fmax",
+                        "pitch_std",
+                        "speaking_rate",
+                        "dnsmos_ovrl",
+                        "speaker_noised",
+                    ],
+                    value=["emotion"],
+                    label="Unconditional Keys",
+                )
+
+            gr.Markdown(
+                "### Emotion Sliders\n"
+                "Warning: The way these sliders work is not intuitive and may require some trial and error to get the desired effect.\n"
+                "Certain configurations can cause the model to become unstable. Setting emotion to unconditional may help."
+            )
+            with gr.Row():
+                emotion1 = gr.Slider(0.0, 1.0, 1.0, 0.05, label="Happiness")
+                emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness")
+                emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust")
+                emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear")
+            with gr.Row():
+                emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise")
+                emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger")
+                emotion7 = gr.Slider(0.0, 1.0, 0.1, 0.05, label="Other")
+                emotion8 = gr.Slider(0.0, 1.0, 0.2, 0.05, label="Neutral")
+
+        with gr.Column():
+            generate_button = gr.Button("Generate Audio")
+            output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)
+
+        model_choice.change(
+            fn=update_ui,
+            inputs=[model_choice],
+            outputs=[
+                text,
+                language,
+                speaker_audio,
+                prefix_audio,
+                emotion1,
+                emotion2,
+                emotion3,
+                emotion4,
+                emotion5,
+                emotion6,
+                emotion7,
+                emotion8,
+                vq_single_slider,
+                fmax_slider,
+                pitch_std_slider,
+                speaking_rate_slider,
+                dnsmos_slider,
+                speaker_noised_checkbox,
+                unconditional_keys,
+            ],
+        )
+
+        # On page load, trigger the same UI refresh
+        demo.load(
+            fn=update_ui,
+            inputs=[model_choice],
+            outputs=[
+                text,
+                language,
+                speaker_audio,
+                prefix_audio,
+                emotion1,
+                emotion2,
+                emotion3,
+                emotion4,
+                emotion5,
+                emotion6,
+                emotion7,
+                emotion8,
+                vq_single_slider,
+                fmax_slider,
+                pitch_std_slider,
+                speaking_rate_slider,
+                dnsmos_slider,
+                speaker_noised_checkbox,
+                unconditional_keys,
+            ],
+        )
+
+        # Generate audio on button click
+        generate_button.click(
+            fn=generate_audio,
+            inputs=[
+                model_choice,
+                text,
+                language,
+                speaker_audio,
+                prefix_audio,
+                emotion1,
+                emotion2,
+                emotion3,
+                emotion4,
+                emotion5,
+                emotion6,
+                emotion7,
+                emotion8,
+                vq_single_slider,
+                fmax_slider,
+                pitch_std_slider,
+                speaking_rate_slider,
+                dnsmos_slider,
+                speaker_noised_checkbox,
+                cfg_scale_slider,
+                min_p_slider,
+                seed_number,
+                randomize_seed_toggle,
+                unconditional_keys,
+            ],
+            outputs=[output_audio, seed_number],
+        )
+
+    return demo
+
+
+if __name__ == "__main__":
+    demo = build_interface()
+    share = getenv("GRADIO_SHARE", "False").lower() in ("true", "1", "t")
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=share)