Spaces: Running on Zero

Upload app.py

app.py CHANGED
@@ -100,8 +100,19 @@ phonemizers = dict(
     j=Katsu(),
 )
 
+def resolve_voices(voice, warn=True):
+    if not isinstance(voice, str):
+        return ['af']
+    voices = voice.lower().replace('/', '_').replace(' ', '+').replace(',', '+').split('+')
+    if warn:
+        unks = {v for v in voices if v and v not in VOICES['cpu']}
+        if unks:
+            gr.Warning(f"Unknown voice{'s' if len(unks) > 1 else ''}: {','.join(unks)}")
+    voices = [v for v in voices if v in VOICES['cpu']]
+    return voices if voices else ['af']
+
 def phonemize(text, voice, norm=True):
-    lang = voice[0]
+    lang = resolve_voices(voice)[0][0]
     if norm:
         text = normalize(text)
     ps = phonemizers[lang].phonemize([text])
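The new resolve_voices helper is what lets a single string describe a mix of voices. A minimal standalone sketch of its parsing behavior (VOICES['cpu'] is stubbed with a hypothetical KNOWN set, and the warning path is omitted):

    KNOWN = {'af', 'af_bella', 'af_sarah', 'am_adam'}

    def resolve_voices_sketch(voice):
        if not isinstance(voice, str):
            return ['af']
        # Lowercase, map '/' to '_', and treat spaces and commas as '+' separators.
        voices = voice.lower().replace('/', '_').replace(' ', '+').replace(',', '+').split('+')
        voices = [v for v in voices if v in KNOWN]
        return voices if voices else ['af']

    assert resolve_voices_sketch('af+af_bella') == ['af', 'af_bella']  # explicit mix
    assert resolve_voices_sketch('AF, AM_Adam') == ['af', 'am_adam']   # commas and case folded
    assert resolve_voices_sketch('no_such_voice') == ['af']            # falls back to 'af'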
@@ -182,8 +193,8 @@ VOICES = {device: {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt')
 SAMPLE_RATE = 24000
 
 @torch.no_grad()
-def forward(tokens, voice, speed, device='cpu'):
-    ref_s = VOICES[device][voice][len(tokens)]
+def forward(tokens, voices, speed, device='cpu'):
+    ref_s = torch.mean(torch.stack([VOICES[device][v][len(tokens)] for v in voices]), dim=0)
     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
     text_mask = length_to_mask(input_lengths).to(device)
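With this change, forward's single reference vector becomes an equal-weight average over the selected voices. The torch pattern in isolation, with hypothetical 1x256 style vectors standing in for VOICES[device][v][len(tokens)]:

    import torch

    style_a = torch.randn(1, 256)  # placeholder style vectors; real ones come
    style_b = torch.randn(1, 256)  # from the per-voice packs indexed by token count

    # Stack along a new leading dim, then mean over it: an element-wise blend.
    ref_s = torch.mean(torch.stack([style_a, style_b]), dim=0)
    assert ref_s.shape == (1, 256)
    assert torch.allclose(ref_s, (style_a + style_b) / 2)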
@@ -207,8 +218,8 @@ def forward(tokens, voice, speed, device='cpu'):
     return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
 
 @spaces.GPU(duration=10)
-def forward_gpu(tokens, voice, speed):
-    return forward(tokens, voice, speed, device='cuda')
+def forward_gpu(tokens, voices, speed):
+    return forward(tokens, voices, speed, device='cuda')
 
 def clamp_speed(speed):
     if not isinstance(speed, float) and not isinstance(speed, int):
@@ -221,7 +232,7 @@ def clamp_speed(speed):
 
 # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
 def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'):
-
+    voices = resolve_voices(voice, warn=ps)
     ps = ps or phonemize(text, voice)
     speed = clamp_speed(speed)
     trim = trim if isinstance(trim, int) else 3000
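Note the warn=ps argument: warn is just the truthiness of ps here, so with the default ps=None the unknown-voice warning is suppressed and unrecognized names silently fall back to 'af'. Against the resolve_voices above:

    voices = resolve_voices('af+no_such_voice', warn=None)  # no gr.Warning raised -> ['af']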
@@ -235,14 +246,14 @@ def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'):
     use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
     try:
         if use_gpu:
-            out = forward_gpu(tokens, voice, speed)
+            out = forward_gpu(tokens, voices, speed)
         else:
-            out = forward(tokens, voice, speed)
+            out = forward(tokens, voices, speed)
     except gr.exceptions.Error as e:
         if use_gpu:
             gr.Warning(str(e))
             gr.Info('GPU failover to CPU')
-            out = forward(tokens, voice, speed)
+            out = forward(tokens, voices, speed)
         else:
             raise gr.Error(e)
             return (None, '')
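The try/except implements ZeroGPU failover: a gr.Error from the GPU path (for example, quota exhausted) triggers one retry on CPU instead of failing the request. The shape of the pattern, framework-free (gpu_fn/cpu_fn are hypothetical stand-ins for forward_gpu/forward):

    def run_with_failover(tokens, use_gpu, gpu_fn, cpu_fn, on_warn=print):
        try:
            # Try the requested device first.
            return gpu_fn(tokens) if use_gpu else cpu_fn(tokens)
        except Exception as e:
            if not use_gpu:
                raise  # CPU already failed; nothing to fall back to
            on_warn(f'GPU failover to CPU: {e}')
            return cpu_fn(tokens)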
@@ -265,12 +276,15 @@ USE_GPU_INFOS = {
 def change_use_gpu(value):
     return gr.Dropdown(USE_GPU_CHOICES, value=value, label='Hardware', info=USE_GPU_INFOS[value], interactive=CUDA_AVAILABLE)
 
+def update_voice(voice, btn):
+    return f'{voice}+{btn}' if voice.startswith(btn[:2]) else btn
+
 with gr.Blocks() as basic_tts:
     with gr.Row():
         with gr.Column():
             text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters')
             with gr.Row():
-                voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='Starred voices are more stable')
+                voice = gr.Dropdown(list(CHOICES.items()), value='af', allow_custom_value=True, label='Voice', info='Starred voices are more stable')
                 use_gpu = gr.Dropdown(
                     USE_GPU_CHOICES,
                     value='auto' if CUDA_AVAILABLE else False,
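update_voice drives the mixer buttons added below: clicking a voice whose two-letter prefix (e.g. 'af') matches the current selection appends it to the mix, while any other prefix restarts the mix from the clicked voice. Worked examples:

    def update_voice(voice, btn):
        return f'{voice}+{btn}' if voice.startswith(btn[:2]) else btn

    assert update_voice('af', 'af_bella') == 'af+af_bella'  # same prefix: extend the mix
    assert update_voice('af+af_bella', 'af_sarah') == 'af+af_bella+af_sarah'
    assert update_voice('af', 'am_adam') == 'am_adam'       # different prefix: start over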
@@ -298,12 +312,21 @@ with gr.Blocks() as basic_tts:
            trim = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='✂️ Trim', info='Cut from both ends')
            with gr.Accordion('Output Tokens', open=True):
                out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
+           with gr.Accordion('Voice Mixer', open=False):
+               gr.Markdown('Create a custom voice by mixing and matching other voices. Click an orange button to add one part to your mix, or click a gray button to start over. Free text input also allowed.')
+               for i in range(8):
+                   with gr.Row():
+                       for j in range(4):
+                           with gr.Column():
+                               btn = gr.Button(list(CHOICES.values())[i*4+j], variant='primary' if i*4+j < 10 else 'secondary')
+                               btn.click(update_voice, inputs=[voice, btn], outputs=[voice])
+                               voice.change(lambda v, b: gr.Button(b, variant='primary' if v.startswith(b[:2]) else 'secondary'), inputs=[voice, btn], outputs=[btn])
     text.submit(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
     generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
 
 @torch.no_grad()
-def lf_forward(token_lists, voice, speed, device='cpu'):
-    voicepack = VOICES[device][voice]
+def lf_forward(token_lists, voices, speed, device='cpu'):
+    voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
     outs = []
     for tokens in token_lists:
         ref_s = voicepack[len(tokens)]
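For long-form synthesis, lf_forward blends whole voicepacks once up front, then indexes the blended pack by each segment's token count. The same idea in a toy form, assuming packs shaped [max_len, 1, 256] (the real shapes come from the loaded .pt voicepack files):

    import torch

    max_len = 512  # hypothetical: one style row per possible token count
    pack_a = torch.randn(max_len, 1, 256)
    pack_b = torch.randn(max_len, 1, 256)

    voicepack = torch.mean(torch.stack([pack_a, pack_b]), dim=0)  # blend once
    for tokens in ([1, 2, 3], [4, 5]):
        ref_s = voicepack[len(tokens)]  # per-segment style, as in lf_forward
        assert ref_s.shape == (1, 256)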
@@ -331,8 +354,8 @@ def lf_forward(token_lists, voice, speed, device='cpu'):
     return outs
 
 @spaces.GPU
-def lf_forward_gpu(token_lists, voice, speed):
-    return lf_forward(token_lists, voice, speed, device='cuda')
+def lf_forward_gpu(token_lists, voices, speed):
+    return lf_forward(token_lists, voices, speed, device='cuda')
 
 def resplit_strings(arr):
     # Handle edge cases
@@ -388,6 +411,8 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
 
 def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
     token_lists = list(map(tokenize, segments['Tokens']))
+    voices = resolve_voices(voice)
+    speed = clamp_speed(speed)
     wavs = []
     trim = int(trim / speed)
     pad_between = int(pad_between / speed)
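trim and pad_between are sample counts at the 24 kHz output rate, and both are divided by speed, presumably so the requested cut and padding track the shortened audio at higher speeds. For example:

    # Mirrors lf_generate's scaling: at 2x speed, a 3000-sample trim request
    # becomes 1500 samples (1500 / 24000 = 62.5 ms of audio).
    speed = 2
    trim = int(3000 / speed)         # -> 1500
    pad_between = int(5000 / speed)  # -> 2500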
@@ -438,7 +463,7 @@ with gr.Blocks() as lf_tts:
            text = gr.Textbox(label='Input Text', info='Generate speech in batches of 100 text segments and automatically join them together')
            file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
            with gr.Row():
-               voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='Starred voices are more stable')
+               voice = gr.Dropdown(list(CHOICES.items()), value='af', allow_custom_value=True, label='Voice', info='Starred voices are more stable')
                use_gpu = gr.Dropdown(
                    [('ZeroGPU 🚀', True), ('CPU 🐌', False)],
                    value=CUDA_AVAILABLE,
@@ -515,20 +540,26 @@ Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/
 
 with gr.Blocks() as changelog:
     gr.Markdown("""
-
+**25 Nov 2024**<br/>
+🎨 Voice Mixer added
+
+**24 Nov 2024**<br/>
+🛑 Model training halted, v0.19 is the current stable version
+
+**23 Nov 2024**<br/>
 🔀 Hardware switching between CPU and GPU<br/>
 🗣️ Restored old voices, back up to 32 total
 
-
+**22 Nov 2024**<br/>
 🚀 Model v0.19<br/>
 🧪 Validation losses: 0.261 mel, 0.627 dur, 1.897 f0<br/>
 📄 https://hf.co/blog/hexgrad/kokoro-short-burst-upgrade
 
-
+**15 Nov 2024**<br/>
 🚀 Model v0.16<br/>
 🧪 Validation losses: 0.263 mel, 0.646 dur, 1.934 f0
 
-
+**12 Nov 2024**<br/>
 🚀 Model v0.14<br/>
 🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
 """)