voicegen

Running

App Files Files Community

Plachta committed on Jan 24, 2023

Commit

39c9cf7

1 Parent(s): e7b849e

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -8

app.py CHANGED Viewed

@@ -22,6 +22,8 @@ from mel_processing import spectrogram_torch
 import translators.server as tss
 import psutil
 from datetime import datetime
 def audio_postprocess(self, y):
     if y is None:
@@ -44,7 +46,7 @@ def audio_postprocess(self, y):
 gr.Audio.postprocess = audio_postprocess
 limitation = os.getenv("SYSTEM") == "spaces"  # limit text and audio length in huggingface spaces
-languages = ['日本語', '简体中文', 'English']
 characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
               '4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
               '8:大和赤骥', '9:大树快车', '10:草上飞', '11:菱亚马逊',
@@ -126,19 +128,73 @@ def infer(text_raw, character, language, duration, noise_scale, noise_scale_w, i
         text = tss.google(text_raw, from_language='zh', to_language='ja')
     elif language == 'English':
         text = tss.google(text_raw, from_language='en', to_language='ja')
     char_id = int(character.split(':')[0])
     stn_tst = get_text(text, hps, is_symbol)
     with torch.no_grad():
         x_tst = stn_tst.unsqueeze(0)
         x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
         sid = torch.LongTensor([char_id])
         audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
     currentDateAndTime = datetime.now()
     print(f"Character {character} inference successful: {text}\n")
     if language != '日本語':
         print(f"translate from {language}: {text_raw}")
     show_memory_info(str(currentDateAndTime) + " infer调用后")
-    return (text, (22050, audio))
 download_audio_js = """
 () =>{{
@@ -173,7 +229,8 @@ if __name__ == "__main__":
                     "您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
                     "This model has been integrated to the model collections of [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts).\n\n"
                     "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
-                    "! ! ! 若有bug欢迎及时反馈 ! ! ! QQ:1925208426 \n\n"
                     "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
                     "如果您的输入语言不是日语，则会由谷歌翻译自动翻译为日语，但是准确性不能保证。\n\n"
                     )
@@ -181,7 +238,7 @@ if __name__ == "__main__":
             with gr.Column():
                 # We instantiate the Textbox class
                 textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id=f"tts-input")
-                with gr.Accordion(label="Advanced Options", open=False):
                     temp_text_var = gr.Variable()
                     symbol_input = gr.Checkbox(value=False, label="Symbol input")
                     symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
@@ -226,9 +283,23 @@ if __name__ == "__main__":
                 text_output = gr.Textbox(label="Output Text")
                 audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
                 btn = gr.Button("Generate!")
-                btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown,
-                                         duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
-                          outputs=[text_output, audio_output])
                 download = gr.Button("Download Audio")
                 download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
         examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
@@ -246,16 +317,24 @@ if __name__ == "__main__":
             fn=infer
         )
         gr.Markdown("# Updates Logs 更新日志：\n\n"
                    "2023/1/13：\n\n"
                    "增加了音素输入的example（米浴喘气）\n\n"
                    "2023/1/12：\n\n"
                    "增加了音素输入的功能，可以对语气和语调做到一定程度的精细控制。\n\n"
                    "调整了UI的布局。\n\n"
                    "2023/1/10：\n\n"
                    "数据集已上传，您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
                    "2023/1/9：\n\n"
-                   "人物全是特别周的bug已修复，对此带来的不便感到十分抱歉。\n\n"
                    "模型推理已全面转为onnxruntime，现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
                    "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
                    )
     app.queue(concurrency_count=3).launch(show_api=False, share=args.share)

 import translators.server as tss
 import psutil
 from datetime import datetime
+import romajitable
+from text.cleaners import japanese_cleaners
 def audio_postprocess(self, y):
     if y is None:
 gr.Audio.postprocess = audio_postprocess
 limitation = os.getenv("SYSTEM") == "spaces"  # limit text and audio length in huggingface spaces
+languages = ['日本語', '简体中文', 'English', 'English2Katakana']
 characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
               '4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
               '8:大和赤骥', '9:大树快车', '10:草上飞', '11:菱亚马逊',
         text = tss.google(text_raw, from_language='zh', to_language='ja')
     elif language == 'English':
         text = tss.google(text_raw, from_language='en', to_language='ja')
+    elif language == "English2Katakana":
+        text = romajitable.to_kana(text_raw).katakana
     char_id = int(character.split(':')[0])
     stn_tst = get_text(text, hps, is_symbol)
     with torch.no_grad():
         x_tst = stn_tst.unsqueeze(0)
         x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
         sid = torch.LongTensor([char_id])
+        jp2phoneme = japanese_cleaners(text)
+        durations = net_g.predict_duration(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
+                                           noise_scale_w=noise_scale_w, length_scale=duration)
+        char_dur_list = []
+        for i, char in enumerate(jp2phoneme):
+            char_pos = i * 2 + 1
+            char_dur = durations[char_pos]
+            char_dur_list.append(char_dur)
+        char_spacing_dur_list = []
+        char_spacings = []
+        for i in range(len(durations)):
+            if i % 2 == 0:  # spacing
+                char_spacings.append("spacing")
+            elif i % 2 == 1:  # char
+                char_spacings.append(jp2phoneme[int((i - 1) / 2)])
+            char_spacing_dur_list.append(int(durations[i]))
+        # convert duration information to string
+        duration_info_str = ""
+        for i in range(len(char_spacings)):
+            if char_spacings[i] == "spacing":
+                duration_info_str += str(char_spacing_dur_list[i])
+            else:
+                duration_info_str += "{" + char_spacings[i] + ":" + str(char_spacing_dur_list[i]) + "}"
+            if i != len(char_spacings)-1:
+                duration_info_str += ", "
         audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
     currentDateAndTime = datetime.now()
     print(f"Character {character} inference successful: {text}\n")
     if language != '日本語':
         print(f"translate from {language}: {text_raw}")
     show_memory_info(str(currentDateAndTime) + " infer调用后")
+    return (text,(22050, audio), jp2phoneme, duration_info_str)
+def infer_from_phoneme_dur(duration_info_str, character, duration, noise_scale, noise_scale_w):
+    """Re-synthesize audio from a user-edited phoneme/duration string.
+
+    ``duration_info_str`` is a ", "-separated list of items: a bare integer is
+    the duration of a spacing, and ``{phoneme:duration}`` is the duration of a
+    phoneme. ``character`` is an "<id>:<name>" entry; only the numeric id
+    before the colon is used as the speaker id.
+
+    Returns ``(phonemes, (22050, audio))`` on success, or
+    ``("Error: Format must not be changed!", None)`` when the string cannot be
+    parsed.
+    """
+    try:
+        recons_durs = []
+        phoneme_parts = []
+        for item in duration_info_str.split(", "):
+            if "{" not in item:  # spacing
+                recons_durs.append(int(item))
+            else:
+                fields = item.strip("{}").split(":")
+                phoneme_parts.append(fields[0])
+                # fields[1] is missing when the user deleted the ":" -> IndexError
+                recons_durs.append(int(fields[1]))
+    except (ValueError, IndexError, AttributeError):
+        # ValueError: non-integer duration; IndexError: "{...}" item without ":";
+        # AttributeError: input was not a string (e.g. None).
+        return ("Error: Format must not be changed!", None)
+    recons_phonemes = "".join(phoneme_parts)
+    char_id = int(character.split(':')[0])
+    stn_tst = get_text(recons_phonemes, hps, is_symbol=True)
+    with torch.no_grad():
+        x_tst = stn_tst.unsqueeze(0)
+        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
+        sid = torch.LongTensor([char_id])
+        # NOTE(review): debug output; presumably these two lengths must match
+        # for infer_with_duration to succeed -- confirm against the model code.
+        print(len(recons_durs))
+        print(x_tst.shape[1])
+        audio = net_g.infer_with_duration(x_tst, x_tst_lengths, w_ceil=recons_durs, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
+                            length_scale=duration)[0][0, 0].data.cpu().float().numpy()
+    return (recons_phonemes, (22050, audio))
 download_audio_js = """
 () =>{{
                     "您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
                     "This model has been integrated to the model collections of [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts).\n\n"
                     "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
+                    "If you have any suggestions or bug reports, feel free to open discussion in Community.\n\n"
+                    "若有bug反馈或建议，请在Community下开启一个新的Discussion。 \n\n"
                     "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
                     "如果您的输入语言不是日语，则会由谷歌翻译自动翻译为日语，但是准确性不能保证。\n\n"
                     )
             with gr.Column():
                 # We instantiate the Textbox class
                 textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id=f"tts-input")
+                with gr.Accordion(label="Phoneme Input", open=False):
                     temp_text_var = gr.Variable()
                     symbol_input = gr.Checkbox(value=False, label="Symbol input")
                     symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
                 text_output = gr.Textbox(label="Output Text")
                 audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
                 btn = gr.Button("Generate!")
+                with gr.Accordion(label="Speaking Pace Control", open=True):
+                    phoneme_output = gr.Textbox(label="Output Phonemes", interactive=False)
+                    duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here. You can edit phoneme durations here and click regenerate for more precise control.",
+                                                interactive = True)
+                    gr.Markdown(
+                        "\{ \}内的数字代表每个音素在生成的音频中的长度，\{ \}外的数字代表音素之间间隔的长度。"
+                        "您可以手动修改这些数字来控制每个音素以及间隔的长度，从而完全控制合成音频的说话节奏。"
+                        "注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
+                        "The numbers inside \{ \} represent the length for each phoneme in the generated audio, while the numbers out of \{ \} represent the length of spacings between phonemes."
+                        "You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled."
+                        "Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
+                    )
+                cus_dur_gn_btn = gr.Button("Regenerate with custom phoneme durations")
+                btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
+                  outputs=[text_output, audio_output, phoneme_output, duration_output])
+                cus_dur_gn_btn.click(infer_from_phoneme_dur, inputs=[duration_output, char_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider],
+                          outputs=[phoneme_output, audio_output])
                 download = gr.Button("Download Audio")
                 download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
         examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
             fn=infer
         )
         gr.Markdown("# Updates Logs 更新日志：\n\n"
+                   "2023/1/24：\n\n"
+                   "增加了对说话节奏的音素级控制。\n\n"
+                   "Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
                    "2023/1/13：\n\n"
                    "增加了音素输入的example（米浴喘气）\n\n"
+                   "Added one example of phoneme input.\n\n"
                    "2023/1/12：\n\n"
                    "增加了音素输入的功能，可以对语气和语调做到一定程度的精细控制。\n\n"
+                   "Added phoneme input, which enables more precise control on output audio.\n\n"
                    "调整了UI的布局。\n\n"
+                   "Adjusted UI arrangements.\n\n"
                    "2023/1/10：\n\n"
                    "数据集已上传，您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
+                   "Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
                    "2023/1/9：\n\n"
                    "模型推理已全面转为onnxruntime，现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
+                   "Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
                    "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
+                   "Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
                    )
     app.queue(concurrency_count=3).launch(show_api=False, share=args.share)