Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,6 +22,8 @@ from mel_processing import spectrogram_torch
|
|
| 22 |
import translators.server as tss
|
| 23 |
import psutil
|
| 24 |
from datetime import datetime
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def audio_postprocess(self, y):
|
| 27 |
if y is None:
|
|
@@ -44,7 +46,7 @@ def audio_postprocess(self, y):
|
|
| 44 |
gr.Audio.postprocess = audio_postprocess
|
| 45 |
|
| 46 |
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
|
| 47 |
-
languages = ['日本語', '简体中文', 'English']
|
| 48 |
characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
|
| 49 |
'4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
|
| 50 |
'8:大和赤骥', '9:大树快车', '10:草上飞', '11:菱亚马逊',
|
|
@@ -126,19 +128,73 @@ def infer(text_raw, character, language, duration, noise_scale, noise_scale_w, i
|
|
| 126 |
text = tss.google(text_raw, from_language='zh', to_language='ja')
|
| 127 |
elif language == 'English':
|
| 128 |
text = tss.google(text_raw, from_language='en', to_language='ja')
|
|
|
|
|
|
|
| 129 |
char_id = int(character.split(':')[0])
|
| 130 |
stn_tst = get_text(text, hps, is_symbol)
|
| 131 |
with torch.no_grad():
|
| 132 |
x_tst = stn_tst.unsqueeze(0)
|
| 133 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
| 134 |
sid = torch.LongTensor([char_id])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
|
| 136 |
currentDateAndTime = datetime.now()
|
| 137 |
print(f"Character {character} inference successful: {text}\n")
|
| 138 |
if language != '日本語':
|
| 139 |
print(f"translate from {language}: {text_raw}")
|
| 140 |
show_memory_info(str(currentDateAndTime) + " infer调用后")
|
| 141 |
-
return (text,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
download_audio_js = """
|
| 144 |
() =>{{
|
|
@@ -173,7 +229,8 @@ if __name__ == "__main__":
|
|
| 173 |
"您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
|
| 174 |
"This model has been integrated to the model collections of [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts).\n\n"
|
| 175 |
"现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
|
| 176 |
-
"
|
|
|
|
| 177 |
"If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
|
| 178 |
"如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
|
| 179 |
)
|
|
@@ -181,7 +238,7 @@ if __name__ == "__main__":
|
|
| 181 |
with gr.Column():
|
| 182 |
# We instantiate the Textbox class
|
| 183 |
textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id=f"tts-input")
|
| 184 |
-
with gr.Accordion(label="
|
| 185 |
temp_text_var = gr.Variable()
|
| 186 |
symbol_input = gr.Checkbox(value=False, label="Symbol input")
|
| 187 |
symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
|
|
@@ -226,9 +283,23 @@ if __name__ == "__main__":
|
|
| 226 |
text_output = gr.Textbox(label="Output Text")
|
| 227 |
audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
|
| 228 |
btn = gr.Button("Generate!")
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
download = gr.Button("Download Audio")
|
| 233 |
download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
|
| 234 |
examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
|
|
@@ -246,16 +317,24 @@ if __name__ == "__main__":
|
|
| 246 |
fn=infer
|
| 247 |
)
|
| 248 |
gr.Markdown("# Updates Logs 更新日志:\n\n"
|
|
|
|
|
|
|
|
|
|
| 249 |
"2023/1/13:\n\n"
|
| 250 |
"增加了音素输入的example(米浴喘气)\n\n"
|
|
|
|
| 251 |
"2023/1/12:\n\n"
|
| 252 |
"增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
|
|
|
|
| 253 |
"调整了UI的布局。\n\n"
|
|
|
|
| 254 |
"2023/1/10:\n\n"
|
| 255 |
"数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
|
|
|
|
| 256 |
"2023/1/9:\n\n"
|
| 257 |
-
"人物全是特别周的bug已修复,对此带来的不便感到十分抱歉。\n\n"
|
| 258 |
"模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
|
|
|
|
| 259 |
"现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
|
|
|
|
| 260 |
)
|
| 261 |
app.queue(concurrency_count=3).launch(show_api=False, share=args.share)
|
|
|
|
| 22 |
import translators.server as tss
|
| 23 |
import psutil
|
| 24 |
from datetime import datetime
|
| 25 |
+
import romajitable
|
| 26 |
+
from text.cleaners import japanese_cleaners
|
| 27 |
|
| 28 |
def audio_postprocess(self, y):
|
| 29 |
if y is None:
|
|
|
|
| 46 |
gr.Audio.postprocess = audio_postprocess
|
| 47 |
|
| 48 |
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
|
| 49 |
+
languages = ['日本語', '简体中文', 'English', 'English2Katakana']
|
| 50 |
characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
|
| 51 |
'4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
|
| 52 |
'8:大和赤骥', '9:大树快车', '10:草上飞', '11:菱亚马逊',
|
|
|
|
| 128 |
text = tss.google(text_raw, from_language='zh', to_language='ja')
|
| 129 |
elif language == 'English':
|
| 130 |
text = tss.google(text_raw, from_language='en', to_language='ja')
|
| 131 |
+
elif language == "English2Katakana":
|
| 132 |
+
text = romajitable.to_kana(text_raw).katakana
|
| 133 |
char_id = int(character.split(':')[0])
|
| 134 |
stn_tst = get_text(text, hps, is_symbol)
|
| 135 |
with torch.no_grad():
|
| 136 |
x_tst = stn_tst.unsqueeze(0)
|
| 137 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
| 138 |
sid = torch.LongTensor([char_id])
|
| 139 |
+
jp2phoneme = japanese_cleaners(text)
|
| 140 |
+
durations = net_g.predict_duration(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
| 141 |
+
noise_scale_w=noise_scale_w, length_scale=duration)
|
| 142 |
+
char_dur_list = []
|
| 143 |
+
for i, char in enumerate(jp2phoneme):
|
| 144 |
+
char_pos = i * 2 + 1
|
| 145 |
+
char_dur = durations[char_pos]
|
| 146 |
+
char_dur_list.append(char_dur)
|
| 147 |
+
char_spacing_dur_list = []
|
| 148 |
+
char_spacings = []
|
| 149 |
+
for i in range(len(durations)):
|
| 150 |
+
if i % 2 == 0: # spacing
|
| 151 |
+
char_spacings.append("spacing")
|
| 152 |
+
elif i % 2 == 1: # char
|
| 153 |
+
char_spacings.append(jp2phoneme[int((i - 1) / 2)])
|
| 154 |
+
char_spacing_dur_list.append(int(durations[i]))
|
| 155 |
+
# convert duration information to string
|
| 156 |
+
duration_info_str = ""
|
| 157 |
+
for i in range(len(char_spacings)):
|
| 158 |
+
if char_spacings[i] == "spacing":
|
| 159 |
+
duration_info_str += str(char_spacing_dur_list[i])
|
| 160 |
+
else:
|
| 161 |
+
duration_info_str += "{" + char_spacings[i] + ":" + str(char_spacing_dur_list[i]) + "}"
|
| 162 |
+
if i != len(char_spacings)-1:
|
| 163 |
+
duration_info_str += ", "
|
| 164 |
audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
|
| 165 |
currentDateAndTime = datetime.now()
|
| 166 |
print(f"Character {character} inference successful: {text}\n")
|
| 167 |
if language != '日本語':
|
| 168 |
print(f"translate from {language}: {text_raw}")
|
| 169 |
show_memory_info(str(currentDateAndTime) + " infer调用后")
|
| 170 |
+
return (text,(22050, audio), jp2phoneme, duration_info_str)
|
| 171 |
+
|
| 172 |
+
def _parse_phoneme_durations(duration_info_str):
    """Split an edited duration string back into phonemes and durations.

    The expected layout is entries joined by ", ": a bare integer is the
    duration of a spacing slot, and "{phoneme:N}" is a phoneme together with
    its integer duration.

    Args:
        duration_info_str: The (possibly user-edited) duration text.

    Returns:
        A ``(phonemes, durations)`` tuple: the phonemes concatenated in order
        into one string, and a list with one int per entry (spacings and
        phonemes alike, in their original order).

    Raises:
        ValueError: If the input is not a string, a duration is not an
            integer, or a "{...}" entry has no ":" separator.
    """
    if not isinstance(duration_info_str, str):
        raise ValueError("duration info must be a string")

    phoneme_parts = []
    durations = []
    for item in duration_info_str.split(", "):
        if "{" not in item:  # spacing entry: just a number
            durations.append(int(item))
            continue
        # rpartition splits on the LAST ':' so a phoneme that is itself ':'
        # (or contains one) is kept intact; only the trailing number is the
        # duration.
        phoneme, sep, dur_text = item.strip("{}").rpartition(":")
        if not sep:
            # "{a}" style entry: previously crashed with an IndexError.
            raise ValueError(f"missing ':' in phoneme entry {item!r}")
        phoneme_parts.append(phoneme)
        durations.append(int(dur_text))
    return "".join(phoneme_parts), durations


def infer_from_phoneme_dur(duration_info_str, character, duration, noise_scale, noise_scale_w):
    """Re-synthesize audio from user-edited per-phoneme durations.

    Args:
        duration_info_str: Duration text in the "N, {phoneme:N}, N, ..."
            layout (see ``_parse_phoneme_durations``).
        character: Speaker label of the form "<id>:<name>"; the integer
            before the first ':' is used as the speaker id.
        duration: Passed to the model as ``length_scale``.
        noise_scale: Passed through to the model unchanged.
        noise_scale_w: Passed through to the model unchanged.

    Returns:
        ``(phonemes, (22050, audio))`` on success, where ``audio`` is a numpy
        array (22050 is presumably the sample rate -- confirm against the
        model config). If the duration text cannot be parsed, returns
        ``("Error: Format must not be changed!", None)`` instead of raising.
    """
    try:
        recons_phonemes, recons_durs = _parse_phoneme_durations(duration_info_str)
    except ValueError:
        # Any malformed entry is reported to the UI rather than crashing
        # the request.
        return ("Error: Format must not be changed!", None)

    char_id = int(character.split(':')[0])
    stn_tst = get_text(recons_phonemes, hps, is_symbol=True)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([char_id])
        # Diagnostic output: number of parsed durations vs. model input
        # length. NOTE(review): these presumably must match for
        # infer_with_duration to work -- confirm and consider validating.
        print(len(recons_durs))
        print(x_tst.shape[1])
        audio = net_g.infer_with_duration(
            x_tst, x_tst_lengths, w_ceil=recons_durs, sid=sid,
            noise_scale=noise_scale, noise_scale_w=noise_scale_w,
            length_scale=duration)[0][0, 0].data.cpu().float().numpy()
    return (recons_phonemes, (22050, audio))
|
| 198 |
|
| 199 |
download_audio_js = """
|
| 200 |
() =>{{
|
|
|
|
| 229 |
"您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
|
| 230 |
"This model has been integrated to the model collections of [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts).\n\n"
|
| 231 |
"现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
|
| 232 |
+
"If you have any suggestions or bug reports, feel free to open discussion in Community.\n\n"
|
| 233 |
+
"若有bug反馈或建议,请在Community下开启一个新的Discussion。 \n\n"
|
| 234 |
"If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
|
| 235 |
"如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
|
| 236 |
)
|
|
|
|
| 238 |
with gr.Column():
|
| 239 |
# We instantiate the Textbox class
|
| 240 |
textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id=f"tts-input")
|
| 241 |
+
with gr.Accordion(label="Phoneme Input", open=False):
|
| 242 |
temp_text_var = gr.Variable()
|
| 243 |
symbol_input = gr.Checkbox(value=False, label="Symbol input")
|
| 244 |
symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
|
|
|
|
| 283 |
text_output = gr.Textbox(label="Output Text")
|
| 284 |
audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
|
| 285 |
btn = gr.Button("Generate!")
|
| 286 |
+
with gr.Accordion(label="Speaking Pace Control", open=True):
|
| 287 |
+
phoneme_output = gr.Textbox(label="Output Phonemes", interactive=False)
|
| 288 |
+
duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here. You can edit phoneme durations here and click regenerate for more precise control.",
|
| 289 |
+
interactive = True)
|
| 290 |
+
gr.Markdown(
|
| 291 |
+
"\{ \}内的数字代表每个音素在生成的音频中的长度,\{ \}外的数字代表音素之间间隔的长度。"
|
| 292 |
+
"您可以手动修改这些数字来控制每个音素以及间隔的长度,从而完全控制合成音频的说话节奏。"
|
| 293 |
+
"注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
|
| 294 |
+
"The numbers inside \{ \} represent the length for each phoneme in the generated audio, while the numbers out of \{ \} represent the length of spacings between phonemes."
|
| 295 |
+
"You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled."
|
| 296 |
+
"Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
|
| 297 |
+
)
|
| 298 |
+
cus_dur_gn_btn = gr.Button("Regenerate with custom phoneme durations")
|
| 299 |
+
btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
|
| 300 |
+
outputs=[text_output, audio_output, phoneme_output, duration_output])
|
| 301 |
+
cus_dur_gn_btn.click(infer_from_phoneme_dur, inputs=[duration_output, char_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider],
|
| 302 |
+
outputs=[phoneme_output, audio_output])
|
| 303 |
download = gr.Button("Download Audio")
|
| 304 |
download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
|
| 305 |
examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
|
|
|
|
| 317 |
fn=infer
|
| 318 |
)
|
| 319 |
gr.Markdown("# Updates Logs 更新日志:\n\n"
|
| 320 |
+
"2023/1/24:\n\n"
|
| 321 |
+
"增加了对说话节奏的音素级控制。\n\n"
|
| 322 |
+
"Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
|
| 323 |
"2023/1/13:\n\n"
|
| 324 |
"增加了音素输入的example(米浴喘气)\n\n"
|
| 325 |
+
"Added one example of phoneme input.\n\n"
|
| 326 |
"2023/1/12:\n\n"
|
| 327 |
"增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
|
| 328 |
+
"Added phoneme input, which enables more precise control on output audio.\n\n"
|
| 329 |
"调整了UI的布局。\n\n"
|
| 330 |
+
"Adjusted UI arrangements.\n\n"
|
| 331 |
"2023/1/10:\n\n"
|
| 332 |
"数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
|
| 333 |
+
"Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
|
| 334 |
"2023/1/9:\n\n"
|
|
|
|
| 335 |
"模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
|
| 336 |
+
"Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
|
| 337 |
"现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
|
| 338 |
+
"Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
|
| 339 |
)
|
| 340 |
app.queue(concurrency_count=3).launch(show_api=False, share=args.share)
|