Spaces: Running on Zero
Upload 3 files
app.py CHANGED
@@ -7,6 +7,7 @@ import numpy as np
 import os
 import phonemizer
 import random
+import re
 import spaces
 import torch
 import yaml
@@ -32,17 +33,23 @@ def normalize(text):
     text = text.replace('Mr.', 'Mister')
     text = text.replace('Ms.', 'Miss')
     text = text.replace('Mrs.', 'Mrs')
-
+    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
+    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
+    text = re.sub(r'[^\S \n]', ' ', text)
+    text = re.sub(r' +', ' ', text)
+    text = re.sub(r'(?<=\n) +(?=\n)', '', text)
+    return parens_to_angles(text).strip()

 phonemizers = dict(
     a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
     b=phonemizer.backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
-    j=Katsu()
+    j=Katsu(),
 )

-def phonemize(text, voice):
+def phonemize(text, voice, norm=True):
     lang = voice[0]
-    text = normalize(text)
+    if norm:
+        text = normalize(text)
     ps = phonemizers[lang].phonemize([text])
     ps = ps[0] if ps else ''
     # TODO: Custom phonemization rules?
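Note: the lines added to normalize() canonicalize curly quotes to their ASCII equivalents and collapse stray whitespace before phonemization. A standalone sketch of just these new rules (parens_to_angles, defined elsewhere in app.py, is omitted here; the function name is hypothetical):

import re

def normalize_whitespace_and_quotes(text):
    text = text.replace(chr(8216), "'").replace(chr(8217), "'")   # curly single quotes -> '
    text = text.replace(chr(8220), '"').replace(chr(8221), '"')   # curly double quotes -> "
    text = re.sub(r'[^\S \n]', ' ', text)        # tabs and odd whitespace -> space, keep \n
    text = re.sub(r' +', ' ', text)              # collapse runs of spaces
    text = re.sub(r'(?<=\n) +(?=\n)', '', text)  # empty out space-only lines
    return text.strip()

print(normalize_whitespace_and_quotes('\u201cHi\u201d\tthere\n   \nMr. Smith'))
# "Hi" there
#
# Mr. Smith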
@@ -50,6 +57,8 @@ def phonemize(text, voice):
     # https://en.wiktionary.org/wiki/kokoro#English
     ps = ps.replace('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ').replace('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ')
     ps = ''.join(filter(lambda p: p in VOCAB, ps))
+    if lang == 'j' and any(p in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' for p in ps):
+        gr.Warning('Japanese tokenizer does not handle English letters.')
     return ps.strip()

 def length_to_mask(lengths):
@@ -69,11 +78,19 @@ def get_vocab():
     return dicts

 VOCAB = get_vocab()
+
+def tokenize(ps):
+    return [i for i in map(VOCAB.get, ps) if i is not None]
+
 device = 'cuda' if torch.cuda.is_available() else 'cpu'

 snapshot = snapshot_download(repo_id='hexgrad/kokoro', allow_patterns=['*.pt', '*.pth', '*.yml'], use_auth_token=os.environ['TOKEN'])
 config = yaml.safe_load(open(os.path.join(snapshot, 'config.yml')))
 model = build_model(config['model_params'])
+for key, value in model.items():
+    for module in value.children():
+        if isinstance(module, torch.nn.RNNBase):
+            module.flatten_parameters()
 _ = [model[key].eval() for key in model]
 _ = [model[key].to(device) for key in model]
 for key, state_dict in torch.load(os.path.join(snapshot, 'net.pth'), map_location='cpu', weights_only=True)['net'].items():
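Two things happen in this hunk: RNN weights are flattened into contiguous memory (flatten_parameters() avoids the non-contiguous-weights warning and extra copies on CUDA), and a tokenize() helper maps each phoneme character to a vocabulary id, silently dropping symbols outside VOCAB. A toy illustration of the tokenize behavior (the dict below is a hypothetical stand-in for the real VOCAB built by get_vocab()):

# Toy symbol -> id vocabulary; the real one comes from get_vocab()
VOCAB = {'h': 1, 'ə': 2, 'l': 3, 'o': 4, 'ʊ': 5}

def tokenize(ps):
    # VOCAB.get yields None for unknown symbols; those are dropped
    return [i for i in map(VOCAB.get, ps) if i is not None]

print(tokenize('həlo!'))  # -> [1, 2, 3, 4]; '!' is not in the toy vocab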
@@ -113,42 +130,45 @@ def s_curve(p):

 SAMPLE_RATE = 24000

-@spaces.GPU(duration=
+@spaces.GPU(duration=1)
 @torch.no_grad()
 def forward(tokens, voice, speed):
+    ref_s = VOICES[voice]
     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
     text_mask = length_to_mask(input_lengths).to(device)
     bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
     d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
-    ref_s = VOICES[voice]
     s = ref_s[:, 128:]
     d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
     x, _ = model.predictor.lstm(d)
     duration = model.predictor.duration_proj(x)
     duration = torch.sigmoid(duration).sum(axis=-1) / speed
-    pred_dur = torch.round(duration
-    pred_aln_trg = torch.zeros(input_lengths,
+    pred_dur = torch.round(duration).clamp(min=1).long()
+    pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
     c_frame = 0
     for i in range(pred_aln_trg.size(0)):
-        pred_aln_trg[i, c_frame:c_frame +
-        c_frame +=
-    en =
+        pred_aln_trg[i, c_frame:c_frame + pred_dur[0,i].item()] = 1
+        c_frame += pred_dur[0,i].item()
+    en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
     F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
     t_en = model.text_encoder(tokens, input_lengths, text_mask)
-    asr =
-    out = model.decoder(asr, F0_pred, N_pred, ref_s[:, :128])
-    return out.squeeze().cpu().numpy()
+    asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
+    return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()

-def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=5000,
+def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000):
     ps = ps or phonemize(text, voice)
-    tokens =
+    tokens = tokenize(ps)
     if not tokens:
         return (None, '')
     elif len(tokens) > 510:
         tokens = tokens[:510]
         ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
-    out = forward(tokens, voice, speed)
+    try:
+        out = forward(tokens, voice, speed)
+    except gr.exceptions.Error as e:
+        raise gr.Error(e)
+        return (None, '')
     if reduce_noise > 0:
         out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
     opening_cut = max(0, int(opening_cut / speed))
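The rewritten forward() turns per-token durations into a hard alignment matrix: token i claims a contiguous run of pred_dur[0, i] frames, and the added .clamp(min=1) guarantees every token gets at least one frame, so no row is all zeros. A small numeric check of that construction, with hypothetical durations:

import torch

# Hypothetical per-token frame counts, standing in for pred_dur above
pred_dur = torch.tensor([[2, 1, 3, 2]])
n_tokens = pred_dur.shape[-1]
n_frames = int(pred_dur.sum())
pred_aln_trg = torch.zeros(n_tokens, n_frames)
c_frame = 0
for i in range(n_tokens):
    # Token i owns the next pred_dur[0, i] frames, and only those
    pred_aln_trg[i, c_frame:c_frame + int(pred_dur[0, i])] = 1
    c_frame += int(pred_dur[0, i])
print(pred_aln_trg)
# tensor([[1., 1., 0., 0., 0., 0., 0., 0.],
#         [0., 0., 1., 0., 0., 0., 0., 0.],
#         [0., 0., 0., 1., 1., 1., 0., 0.],
#         [0., 0., 0., 0., 0., 0., 1., 1.]])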
@@ -156,7 +176,7 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=5000
         out[:opening_cut] = 0
     closing_cut = max(0, int(closing_cut / speed))
     if closing_cut > 0:
-        out
+        out[-closing_cut:] = 0
     ease_in = min(int(ease_in / speed), len(out)//2 - opening_cut)
     for i in range(ease_in):
         out[i+opening_cut] *= s_curve(i / ease_in)
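The context lines here apply s_curve() to fade samples in after the opening cut (and, symmetrically, out before the closing cut). s_curve itself is defined earlier in app.py and is not part of this diff; the sketch below assumes a smoothstep-style ramp, which may differ from the actual definition:

import numpy as np

def s_curve(p):
    # Assumed smoothstep ramp (0 -> 1 with zero slope at both ends);
    # the real s_curve lives above this diff and may differ.
    return 3 * p**2 - 2 * p**3

def fade_in(out, ease_in, opening_cut=0):
    # Mirrors the loop in generate(): scale each sample after the opening
    # cut by a ramp rising from 0 toward 1 over ease_in samples.
    for i in range(ease_in):
        out[i + opening_cut] *= s_curve(i / ease_in)
    return out

audio = np.ones(24000, dtype=np.float32)
fade_in(audio, ease_in=3000)  # first 3000 samples now ramp up smoothly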
@@ -165,7 +185,7 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=5000
         out[-i-1-closing_cut] *= s_curve(i / ease_out)
     return ((SAMPLE_RATE, out), ps)

-with gr.Blocks() as demo:
+with gr.Blocks() as basic_tts:
     with gr.Row():
         with gr.Column():
             text = gr.Textbox(label='Input Text')
@@ -174,32 +194,196 @@ with gr.Blocks() as demo:
             random_btn = gr.Button('Random Text', variant='secondary')
             generate_btn = gr.Button('Generate', variant='primary')
             random_btn.click(get_random_text, inputs=[voice], outputs=[text])
-            with gr.Accordion('Input
-                in_ps = gr.Textbox(show_label=False, info='Override the input text with custom
+            with gr.Accordion('Input Tokens', open=False):
+                in_ps = gr.Textbox(show_label=False, info='Override the input text with custom phonemes. Leave this blank to automatically tokenize the input text instead.')
                 with gr.Row():
                     clear_btn = gr.ClearButton(in_ps)
-                    phonemize_btn = gr.Button('
+                    phonemize_btn = gr.Button('Tokenize Input Text', variant='primary')
                 phonemize_btn.click(phonemize, inputs=[text, voice], outputs=[in_ps])
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio')
-            with gr.Accordion('Tokens', open=True):
-                out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio. Same as input
-            with gr.Accordion('
+            with gr.Accordion('Output Tokens', open=True):
+                out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
+            with gr.Accordion('Audio Settings', open=False):
                 with gr.Row():
                     reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
                 with gr.Row():
-                    speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The
+                    speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
                 with gr.Row():
                     with gr.Column():
-                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=
+                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Zero out this many samples at the start.')
                     with gr.Column():
-                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=
+                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Zero out this many samples at the end.')
                 with gr.Row():
                     with gr.Column():
                         ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
                     with gr.Column():
-                        ease_out = gr.Slider(minimum=0, maximum=24000, value=
+                        ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
     generate_btn.click(generate, inputs=[text, voice, in_ps, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out], outputs=[audio, out_ps])

+@spaces.GPU
+@torch.no_grad()
+def lf_forward(token_lists, voice, speed):
+    ref_s = VOICES[voice]
+    s = ref_s[:, 128:]
+    outs = []
+    for tokens in token_lists:
+        tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
+        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
+        text_mask = length_to_mask(input_lengths).to(device)
+        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
+        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
+        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+        x, _ = model.predictor.lstm(d)
+        duration = model.predictor.duration_proj(x)
+        duration = torch.sigmoid(duration).sum(axis=-1) / speed
+        pred_dur = torch.round(duration).clamp(min=1).long()
+        pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
+        c_frame = 0
+        for i in range(pred_aln_trg.size(0)):
+            pred_aln_trg[i, c_frame:c_frame + pred_dur[0,i].item()] = 1
+            c_frame += pred_dur[0,i].item()
+        en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
+        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+        t_en = model.text_encoder(tokens, input_lengths, text_mask)
+        asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
+        outs.append(model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy())
+    return outs
+
+def resplit_strings(arr):
+    # Handle edge cases
+    if not arr:
+        return '', ''
+    if len(arr) == 1:
+        return arr[0], ''
+    # Try each possible split point
+    min_diff = float('inf')
+    best_split = 0
+    # Calculate lengths when joined with spaces
+    lengths = [len(s) for s in arr]
+    spaces = len(arr) - 1  # Total spaces needed
+    # Try each split point
+    left_len = 0
+    right_len = sum(lengths) + spaces
+    for i in range(1, len(arr)):
+        # Add current word and space to left side
+        left_len += lengths[i-1] + (1 if i > 1 else 0)
+        # Remove current word and space from right side
+        right_len -= lengths[i-1] + 1
+        diff = abs(left_len - right_len)
+        if diff < min_diff:
+            min_diff = diff
+            best_split = i
+    # Join the strings with the best split point
+    return ' '.join(arr[:best_split]), ' '.join(arr[best_split:])
+
+def recursive_split(text, voice):
+    if not text:
+        return []
+    tokens = phonemize(text, voice, norm=False)
+    if len(tokens) < 511:
+        return [(text, tokens, len(tokens))] if tokens else []
+    if ' ' not in text:
+        return []
+    for punctuation in ['!.?…', ':;', ',—']:
+        splits = re.split(f'(?:(?<=[{punctuation}])|(?<=[{punctuation}]["\'»])|(?<=[{punctuation}]["\'»]["\'»])) ', text)
+        if len(splits) > 1:
+            break
+    else:
+        splits = None
+    splits = splits or text.split(' ')
+    a, b = resplit_strings(splits)
+    return recursive_split(a, voice) + recursive_split(b, voice)
+
+def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
+    if skip_square_brackets:
+        text = re.sub(r'\[.*?\]', '', text)
+    texts = [t.strip() for t in re.split('\n{'+str(newline_split)+',}', normalize(text))] if newline_split > 0 else [normalize(text)]
+    segments = [row for t in texts for row in recursive_split(t, voice)]
+    return [(i, *row) for i, row in enumerate(segments)]
+
+def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad=5000):
+    token_lists = list(map(tokenize, segments['Tokens']))
+    wavs = []
+    opening_cut = max(0, int(opening_cut / speed))
+    closing_cut = max(0, int(closing_cut / speed))
+    pad = max(0, int(pad / speed))
+    batch_size = 100
+    for i in range(0, len(token_lists), batch_size):
+        try:
+            outs = lf_forward(token_lists[i:i+batch_size], voice, speed)
+        except gr.exceptions.Error as e:
+            if wavs:
+                gr.Warning(e)
+            else:
+                raise gr.Error(e)
+            break
+        for out in outs:
+            if reduce_noise > 0:
+                out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
+            if opening_cut > 0:
+                out[:opening_cut] = 0
+            if closing_cut > 0:
+                out[-closing_cut:] = 0
+            ease_in = min(int(ease_in / speed), len(out)//2 - opening_cut)
+            for i in range(ease_in):
+                out[i+opening_cut] *= s_curve(i / ease_in)
+            ease_out = min(int(ease_out / speed), len(out)//2 - closing_cut)
+            for i in range(ease_out):
+                out[-i-1-closing_cut] *= s_curve(i / ease_out)
+            if wavs and pad > 0:
+                wavs.append(np.zeros(pad))
+            wavs.append(out)
+    return (SAMPLE_RATE, np.concatenate(wavs)) if wavs else None
+
+def did_change_segments(segments):
+    x = len(segments) if segments['Length'].any() else 0
+    return [
+        gr.Button('Tokenize', variant='secondary' if x else 'primary'),
+        gr.Button(f'Generate x{x}', variant='primary' if x else 'secondary', interactive=x > 0),
+    ]
+
+with gr.Blocks() as lf_tts:
+    with gr.Row():
+        with gr.Column():
+            text = gr.Textbox(label='Input Text')
+            voice = gr.Dropdown(list(CHOICES.items()), label='Voice')
+            with gr.Accordion('Text Settings', open=False):
+                skip_square_brackets = gr.Checkbox(True, label='Skip [Square Brackets]', info='Recommended for academic papers, Wikipedia articles, or texts with citations.')
+                newline_split = gr.Number(2, label='Newline Split', info='Split the input text on this many newlines. Affects how the text is segmented.', precision=0, minimum=0)
+            with gr.Row():
+                segment_btn = gr.Button('Tokenize', variant='primary')
+                generate_btn = gr.Button('Generate x0', variant='secondary', interactive=False)
+        with gr.Column():
+            audio = gr.Audio(interactive=False, label='Output Audio')
+            with gr.Accordion('Audio Settings', open=False):
+                with gr.Row():
+                    reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
+                with gr.Row():
+                    speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
+                with gr.Row():
+                    with gr.Column():
+                        opening_cut = gr.Slider(minimum=0, maximum=24000, value=4000, step=1000, label='Opening Cut', info='✂️ Zero out this many samples at the start.')
+                    with gr.Column():
+                        closing_cut = gr.Slider(minimum=0, maximum=24000, value=2000, step=1000, label='Closing Cut', info='✂️ Zero out this many samples at the end.')
+                with gr.Row():
+                    with gr.Column():
+                        ease_in = gr.Slider(minimum=0, maximum=24000, value=3000, step=1000, label='Ease In', info='🚀 Ease in for this many samples, after opening cut.')
+                    with gr.Column():
+                        ease_out = gr.Slider(minimum=0, maximum=24000, value=1000, step=1000, label='Ease Out', info='📐 Ease out for this many samples, before closing cut.')
+                with gr.Row():
+                    pad = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad', info='🔇 How many samples of silence to insert between segments.')
+    with gr.Row():
+        segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
+    segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
+    segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
+    generate_btn.click(lf_generate, inputs=[segments, voice, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad], outputs=[audio])
+
+with gr.Blocks() as app:
+    gr.TabbedInterface(
+        [basic_tts, lf_tts],
+        ['Basic TTS', 'Long-Form'],
+    )
+
 if __name__ == '__main__':
-    demo.launch()
+    app.launch()
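For the long-form path, recursive_split() keeps a segment whenever it phonemizes to fewer than 511 tokens, and otherwise splits on the strongest available punctuation, using resplit_strings() to pick the split index that leaves the two joined halves closest in character length. A quick check of that balancing, with resplit_strings copied from the diff above and a made-up input:

def resplit_strings(arr):
    # Pick the split index that makes the two space-joined halves
    # closest in character length
    if not arr:
        return '', ''
    if len(arr) == 1:
        return arr[0], ''
    min_diff, best_split = float('inf'), 0
    lengths = [len(s) for s in arr]
    left_len, right_len = 0, sum(lengths) + len(arr) - 1
    for i in range(1, len(arr)):
        left_len += lengths[i-1] + (1 if i > 1 else 0)
        right_len -= lengths[i-1] + 1
        diff = abs(left_len - right_len)
        if diff < min_diff:
            min_diff, best_split = diff, i
    return ' '.join(arr[:best_split]), ' '.join(arr[best_split:])

print(resplit_strings(['One sentence.', 'Two.', 'A third, longer sentence.']))
# -> ('One sentence. Two.', 'A third, longer sentence.')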