Spaces:
				
			
			
	
			
			
		Running
		
			on 
			
			Zero
	
	
	
			
			
	
	
	
	
		
		
		Running
		
			on 
			
			Zero
	Upload app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -100,8 +100,19 @@ phonemizers = dict( | |
| 100 | 
             
                j=Katsu(),
         | 
| 101 | 
             
            )
         | 
| 102 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 103 | 
             
            def phonemize(text, voice, norm=True):
         | 
| 104 | 
            -
                lang = voice[0]
         | 
| 105 | 
             
                if norm:
         | 
| 106 | 
             
                    text = normalize(text)
         | 
| 107 | 
             
                ps = phonemizers[lang].phonemize([text])
         | 
| @@ -182,8 +193,8 @@ VOICES = {device: {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt') | |
| 182 | 
             
            SAMPLE_RATE = 24000
         | 
| 183 |  | 
| 184 | 
             
            @torch.no_grad()
         | 
| 185 | 
            -
            def forward(tokens,  | 
| 186 | 
            -
                ref_s = VOICES[device][ | 
| 187 | 
             
                tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
         | 
| 188 | 
             
                input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
         | 
| 189 | 
             
                text_mask = length_to_mask(input_lengths).to(device)
         | 
| @@ -207,8 +218,8 @@ def forward(tokens, voice, speed, device='cpu'): | |
| 207 | 
             
                return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
         | 
| 208 |  | 
| 209 | 
             
            @spaces.GPU(duration=10)
         | 
| 210 | 
            -
            def forward_gpu(tokens,  | 
| 211 | 
            -
                return forward(tokens,  | 
| 212 |  | 
| 213 | 
             
            def clamp_speed(speed):
         | 
| 214 | 
             
                if not isinstance(speed, float) and not isinstance(speed, int):
         | 
| @@ -221,7 +232,7 @@ def clamp_speed(speed): | |
| 221 |  | 
| 222 | 
             
            # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
         | 
| 223 | 
             
            def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'):
         | 
| 224 | 
            -
                 | 
| 225 | 
             
                ps = ps or phonemize(text, voice)
         | 
| 226 | 
             
                speed = clamp_speed(speed)
         | 
| 227 | 
             
                trim = trim if isinstance(trim, int) else 3000
         | 
| @@ -235,14 +246,14 @@ def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'): | |
| 235 | 
             
                use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
         | 
| 236 | 
             
                try:
         | 
| 237 | 
             
                    if use_gpu:
         | 
| 238 | 
            -
                        out = forward_gpu(tokens,  | 
| 239 | 
             
                    else:
         | 
| 240 | 
            -
                        out = forward(tokens,  | 
| 241 | 
             
                except gr.exceptions.Error as e:
         | 
| 242 | 
             
                    if use_gpu:
         | 
| 243 | 
             
                        gr.Warning(str(e))
         | 
| 244 | 
             
                        gr.Info('GPU failover to CPU')
         | 
| 245 | 
            -
                        out = forward(tokens,  | 
| 246 | 
             
                    else:
         | 
| 247 | 
             
                        raise gr.Error(e)
         | 
| 248 | 
             
                        return (None, '')
         | 
| @@ -265,12 +276,15 @@ USE_GPU_INFOS = { | |
| 265 | 
             
            def change_use_gpu(value):
         | 
| 266 | 
             
                return gr.Dropdown(USE_GPU_CHOICES, value=value, label='Hardware', info=USE_GPU_INFOS[value], interactive=CUDA_AVAILABLE)
         | 
| 267 |  | 
|  | |
|  | |
|  | |
| 268 | 
             
            with gr.Blocks() as basic_tts:
         | 
| 269 | 
             
                with gr.Row():
         | 
| 270 | 
             
                    with gr.Column():
         | 
| 271 | 
             
                        text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters')
         | 
| 272 | 
             
                        with gr.Row():
         | 
| 273 | 
            -
                            voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='Starred voices are more stable')
         | 
| 274 | 
             
                            use_gpu = gr.Dropdown(
         | 
| 275 | 
             
                                USE_GPU_CHOICES,
         | 
| 276 | 
             
                                value='auto' if CUDA_AVAILABLE else False,
         | 
| @@ -298,12 +312,21 @@ with gr.Blocks() as basic_tts: | |
| 298 | 
             
                            trim = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='✂️ Trim', info='Cut from both ends')
         | 
| 299 | 
             
                        with gr.Accordion('Output Tokens', open=True):
         | 
| 300 | 
             
                            out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 301 | 
             
                text.submit(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
         | 
| 302 | 
             
                generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
         | 
| 303 |  | 
| 304 | 
             
            @torch.no_grad()
         | 
| 305 | 
            -
            def lf_forward(token_lists,  | 
| 306 | 
            -
                voicepack = VOICES[device][ | 
| 307 | 
             
                outs = []
         | 
| 308 | 
             
                for tokens in token_lists:
         | 
| 309 | 
             
                    ref_s = voicepack[len(tokens)]
         | 
| @@ -331,8 +354,8 @@ def lf_forward(token_lists, voice, speed, device='cpu'): | |
| 331 | 
             
                return outs
         | 
| 332 |  | 
| 333 | 
             
            @spaces.GPU
         | 
| 334 | 
            -
            def lf_forward_gpu(token_lists,  | 
| 335 | 
            -
                return lf_forward(token_lists,  | 
| 336 |  | 
| 337 | 
             
            def resplit_strings(arr):
         | 
| 338 | 
             
                # Handle edge cases
         | 
| @@ -388,6 +411,8 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2 | |
| 388 |  | 
| 389 | 
             
            def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
         | 
| 390 | 
             
                token_lists = list(map(tokenize, segments['Tokens']))
         | 
|  | |
|  | |
| 391 | 
             
                wavs = []
         | 
| 392 | 
             
                trim = int(trim / speed)
         | 
| 393 | 
             
                pad_between = int(pad_between / speed)
         | 
| @@ -438,7 +463,7 @@ with gr.Blocks() as lf_tts: | |
| 438 | 
             
                        text = gr.Textbox(label='Input Text', info='Generate speech in batches of 100 text segments and automatically join them together')
         | 
| 439 | 
             
                        file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
         | 
| 440 | 
             
                        with gr.Row():
         | 
| 441 | 
            -
                            voice = gr.Dropdown(list(CHOICES.items()), value='af', label='Voice', info='Starred voices are more stable')
         | 
| 442 | 
             
                            use_gpu = gr.Dropdown(
         | 
| 443 | 
             
                                [('ZeroGPU 🚀', True), ('CPU 🐌', False)],
         | 
| 444 | 
             
                                value=CUDA_AVAILABLE,
         | 
| @@ -515,20 +540,26 @@ Random Japanese texts: CC0 public domain from [Common Voice](https://github.com/ | |
| 515 |  | 
| 516 | 
             
            with gr.Blocks() as changelog:
         | 
| 517 | 
             
                gr.Markdown("""
         | 
| 518 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 519 | 
             
            🔀 Hardware switching between CPU and GPU<br/>
         | 
| 520 | 
             
            🗣️ Restored old voices, back up to 32 total
         | 
| 521 |  | 
| 522 | 
            -
             | 
| 523 | 
             
            🚀 Model v0.19<br/>
         | 
| 524 | 
             
            🧪 Validation losses: 0.261 mel, 0.627 dur, 1.897 f0<br/>
         | 
| 525 | 
             
            📄 https://hf.co/blog/hexgrad/kokoro-short-burst-upgrade
         | 
| 526 |  | 
| 527 | 
            -
             | 
| 528 | 
             
            🚀 Model v0.16<br/>
         | 
| 529 | 
             
            🧪 Validation losses: 0.263 mel, 0.646 dur, 1.934 f0
         | 
| 530 |  | 
| 531 | 
            -
             | 
| 532 | 
             
            🚀 Model v0.14<br/>
         | 
| 533 | 
             
            🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
         | 
| 534 | 
             
            """)
         | 
|  | |
| 100 | 
             
                j=Katsu(),
         | 
| 101 | 
             
            )
         | 
| 102 |  | 
| 103 | 
            +
            def resolve_voices(voice, warn=True):
         | 
| 104 | 
            +
                if not isinstance(voice, str):
         | 
| 105 | 
            +
                    return ['af']
         | 
| 106 | 
            +
                voices = voice.lower().replace('/', '_').replace(' ', '+').replace(',', '+').split('+')
         | 
| 107 | 
            +
                if warn:
         | 
| 108 | 
            +
                    unks = {v for v in voices if v and v not in VOICES['cpu']}
         | 
| 109 | 
            +
                    if unks:
         | 
| 110 | 
            +
                        gr.Warning(f"Unknown voice{'s' if len(unks) > 1 else ''}: {','.join(unks)}")
         | 
| 111 | 
            +
                voices = [v for v in voices if v in VOICES['cpu']]
         | 
| 112 | 
            +
                return voices if voices else ['af']
         | 
| 113 | 
            +
             | 
| 114 | 
             
            def phonemize(text, voice, norm=True):
         | 
| 115 | 
            +
                lang = resolve_voices(voice)[0][0]
         | 
| 116 | 
             
                if norm:
         | 
| 117 | 
             
                    text = normalize(text)
         | 
| 118 | 
             
                ps = phonemizers[lang].phonemize([text])
         | 
|  | |
| 193 | 
             
            SAMPLE_RATE = 24000
         | 
| 194 |  | 
| 195 | 
             
            @torch.no_grad()
         | 
| 196 | 
            +
            def forward(tokens, voices, speed, device='cpu'):
         | 
| 197 | 
            +
                ref_s = torch.mean(torch.stack([VOICES[device][v][len(tokens)] for v in voices]), dim=0)
         | 
| 198 | 
             
                tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
         | 
| 199 | 
             
                input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
         | 
| 200 | 
             
                text_mask = length_to_mask(input_lengths).to(device)
         | 
|  | |
| 218 | 
             
                return models[device].decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
         | 
| 219 |  | 
| 220 | 
             
            @spaces.GPU(duration=10)
         | 
| 221 | 
            +
            def forward_gpu(tokens, voices, speed):
         | 
| 222 | 
            +
                return forward(tokens, voices, speed, device='cuda')
         | 
| 223 |  | 
| 224 | 
             
            def clamp_speed(speed):
         | 
| 225 | 
             
                if not isinstance(speed, float) and not isinstance(speed, int):
         | 
|  | |
| 232 |  | 
| 233 | 
             
            # Must be backwards compatible with https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
         | 
| 234 | 
             
            def generate(text, voice='af', ps=None, speed=1, trim=3000, use_gpu='auto'):
         | 
| 235 | 
            +
                voices = resolve_voices(voice, warn=ps)
         | 
| 236 | 
             
                ps = ps or phonemize(text, voice)
         | 
| 237 | 
             
                speed = clamp_speed(speed)
         | 
| 238 | 
             
                trim = trim if isinstance(trim, int) else 3000
         | 
|  | |
| 246 | 
             
                use_gpu = len(ps) > 99 if use_gpu == 'auto' else use_gpu
         | 
| 247 | 
             
                try:
         | 
| 248 | 
             
                    if use_gpu:
         | 
| 249 | 
            +
                        out = forward_gpu(tokens, voices, speed)
         | 
| 250 | 
             
                    else:
         | 
| 251 | 
            +
                        out = forward(tokens, voices, speed)
         | 
| 252 | 
             
                except gr.exceptions.Error as e:
         | 
| 253 | 
             
                    if use_gpu:
         | 
| 254 | 
             
                        gr.Warning(str(e))
         | 
| 255 | 
             
                        gr.Info('GPU failover to CPU')
         | 
| 256 | 
            +
                        out = forward(tokens, voices, speed)
         | 
| 257 | 
             
                    else:
         | 
| 258 | 
             
                        raise gr.Error(e)
         | 
| 259 | 
             
                        return (None, '')
         | 
|  | |
| 276 | 
             
            def change_use_gpu(value):
         | 
| 277 | 
             
                return gr.Dropdown(USE_GPU_CHOICES, value=value, label='Hardware', info=USE_GPU_INFOS[value], interactive=CUDA_AVAILABLE)
         | 
| 278 |  | 
| 279 | 
            +
            def update_voice(voice, btn):
         | 
| 280 | 
            +
                return f'{voice}+{btn}' if voice.startswith(btn[:2]) else btn
         | 
| 281 | 
            +
             | 
| 282 | 
             
            with gr.Blocks() as basic_tts:
         | 
| 283 | 
             
                with gr.Row():
         | 
| 284 | 
             
                    with gr.Column():
         | 
| 285 | 
             
                        text = gr.Textbox(label='Input Text', info='Generate speech for one segment of text using Kokoro, a TTS model with 80 million parameters')
         | 
| 286 | 
             
                        with gr.Row():
         | 
| 287 | 
            +
                            voice = gr.Dropdown(list(CHOICES.items()), value='af', allow_custom_value=True, label='Voice', info='Starred voices are more stable')
         | 
| 288 | 
             
                            use_gpu = gr.Dropdown(
         | 
| 289 | 
             
                                USE_GPU_CHOICES,
         | 
| 290 | 
             
                                value='auto' if CUDA_AVAILABLE else False,
         | 
|  | |
| 312 | 
             
                            trim = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='✂️ Trim', info='Cut from both ends')
         | 
| 313 | 
             
                        with gr.Accordion('Output Tokens', open=True):
         | 
| 314 | 
             
                            out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
         | 
| 315 | 
            +
                with gr.Accordion('Voice Mixer', open=False):
         | 
| 316 | 
            +
                    gr.Markdown('Create a custom voice by mixing and matching other voices. Click an orange button to add one part to your mix, or click a gray button to start over. Free text input also allowed.')
         | 
| 317 | 
            +
                    for i in range(8):
         | 
| 318 | 
            +
                        with gr.Row():
         | 
| 319 | 
            +
                            for j in range(4):
         | 
| 320 | 
            +
                                with gr.Column():
         | 
| 321 | 
            +
                                    btn = gr.Button(list(CHOICES.values())[i*4+j], variant='primary' if i*4+j < 10 else 'secondary')
         | 
| 322 | 
            +
                                    btn.click(update_voice, inputs=[voice, btn], outputs=[voice])
         | 
| 323 | 
            +
                                    voice.change(lambda v, b: gr.Button(b, variant='primary' if v.startswith(b[:2]) else 'secondary'), inputs=[voice, btn], outputs=[btn])
         | 
| 324 | 
             
                text.submit(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
         | 
| 325 | 
             
                generate_btn.click(generate, inputs=[text, voice, in_ps, speed, trim, use_gpu], outputs=[audio, out_ps])
         | 
| 326 |  | 
| 327 | 
             
            @torch.no_grad()
         | 
| 328 | 
            +
            def lf_forward(token_lists, voices, speed, device='cpu'):
         | 
| 329 | 
            +
                voicepack = torch.mean(torch.stack([VOICES[device][v] for v in voices]), dim=0)
         | 
| 330 | 
             
                outs = []
         | 
| 331 | 
             
                for tokens in token_lists:
         | 
| 332 | 
             
                    ref_s = voicepack[len(tokens)]
         | 
|  | |
| 354 | 
             
                return outs
         | 
| 355 |  | 
| 356 | 
             
            @spaces.GPU
         | 
| 357 | 
            +
            def lf_forward_gpu(token_lists, voices, speed):
         | 
| 358 | 
            +
                return lf_forward(token_lists, voices, speed, device='cuda')
         | 
| 359 |  | 
| 360 | 
             
            def resplit_strings(arr):
         | 
| 361 | 
             
                # Handle edge cases
         | 
|  | |
| 411 |  | 
| 412 | 
             
            def lf_generate(segments, voice, speed=1, trim=0, pad_between=0, use_gpu=True):
         | 
| 413 | 
             
                token_lists = list(map(tokenize, segments['Tokens']))
         | 
| 414 | 
            +
                voices = resolve_voices(voice)
         | 
| 415 | 
            +
                speed = clamp_speed(speed)
         | 
| 416 | 
             
                wavs = []
         | 
| 417 | 
             
                trim = int(trim / speed)
         | 
| 418 | 
             
                pad_between = int(pad_between / speed)
         | 
|  | |
| 463 | 
             
                        text = gr.Textbox(label='Input Text', info='Generate speech in batches of 100 text segments and automatically join them together')
         | 
| 464 | 
             
                        file_input.upload(fn=extract_text, inputs=[file_input], outputs=[text])
         | 
| 465 | 
             
                        with gr.Row():
         | 
| 466 | 
            +
                            voice = gr.Dropdown(list(CHOICES.items()), value='af', allow_custom_value=True, label='Voice', info='Starred voices are more stable')
         | 
| 467 | 
             
                            use_gpu = gr.Dropdown(
         | 
| 468 | 
             
                                [('ZeroGPU 🚀', True), ('CPU 🐌', False)],
         | 
| 469 | 
             
                                value=CUDA_AVAILABLE,
         | 
|  | |
| 540 |  | 
| 541 | 
             
            with gr.Blocks() as changelog:
         | 
| 542 | 
             
                gr.Markdown("""
         | 
| 543 | 
            +
            **25 Nov 2024**<br/>
         | 
| 544 | 
            +
            🎨 Voice Mixer added
         | 
| 545 | 
            +
             | 
| 546 | 
            +
            **24 Nov 2024**<br/>
         | 
| 547 | 
            +
            🛑 Model training halted, v0.19 is the current stable version
         | 
| 548 | 
            +
             | 
| 549 | 
            +
            **23 Nov 2024**<br/>
         | 
| 550 | 
             
            🔀 Hardware switching between CPU and GPU<br/>
         | 
| 551 | 
             
            🗣️ Restored old voices, back up to 32 total
         | 
| 552 |  | 
| 553 | 
            +
            **22 Nov 2024**<br/>
         | 
| 554 | 
             
            🚀 Model v0.19<br/>
         | 
| 555 | 
             
            🧪 Validation losses: 0.261 mel, 0.627 dur, 1.897 f0<br/>
         | 
| 556 | 
             
            📄 https://hf.co/blog/hexgrad/kokoro-short-burst-upgrade
         | 
| 557 |  | 
| 558 | 
            +
            **15 Nov 2024**<br/>
         | 
| 559 | 
             
            🚀 Model v0.16<br/>
         | 
| 560 | 
             
            🧪 Validation losses: 0.263 mel, 0.646 dur, 1.934 f0
         | 
| 561 |  | 
| 562 | 
            +
            **12 Nov 2024**<br/>
         | 
| 563 | 
             
            🚀 Model v0.14<br/>
         | 
| 564 | 
             
            🧪 Validation losses: 0.262 mel, 0.642 dur, 1.889 f0
         | 
| 565 | 
             
            """)
         | 
 
			
