Spaces:
Sleeping
Sleeping
Commit
·
d476512
1
Parent(s):
787f71f
feat: enhance UI/UX and fix custom_speaker_ref_text_input empty bug
Browse files
DEMO.md
CHANGED
|
@@ -3,5 +3,6 @@
|
|
| 3 |
ILRDF Formosan Text-To-Speech System
|
| 4 |
|
| 5 |
\
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
| 3 |
ILRDF Formosan Text-To-Speech System
|
| 4 |
|
| 5 |
\
|
| 6 |
+
這是「將文字轉換為聲音」的系統,請按照下方步驟操作,或查看操作手冊及操作影片。
|
| 7 |
+
本系統為初步開發成果的測試版,**合成結果可能於拼寫、斷句處有不盡理想之處,甚至可能出現錯誤**。
|
| 8 |
+
試用時請務必**謹慎檢視合成結果**,切勿直接作為正式或關鍵資訊使用。感謝您的理解與支持,並請不吝留下系統回報與建議。
|
app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import tempfile
|
| 2 |
from importlib.resources import files
|
| 3 |
|
|
@@ -123,6 +124,8 @@ examples_config = OmegaConf.to_object(OmegaConf.load("configs/examples.yaml"))
|
|
| 123 |
|
| 124 |
DEFAULT_MODEL_ID = list(models_config.keys())[0]
|
| 125 |
|
|
|
|
|
|
|
| 126 |
|
| 127 |
@gpu_decorator
|
| 128 |
def infer(
|
|
@@ -219,19 +222,21 @@ with demo:
|
|
| 219 |
with gr.Tab("預設配音員"):
|
| 220 |
with gr.Row():
|
| 221 |
with gr.Column():
|
| 222 |
-
|
| 223 |
-
choices=
|
| 224 |
-
label="
|
| 225 |
-
value="阿美
|
|
|
|
| 226 |
)
|
| 227 |
|
| 228 |
-
def
|
| 229 |
-
return [r for r in refs_config.keys() if r.startswith(
|
| 230 |
|
| 231 |
default_speaker_refs = gr.Dropdown(
|
| 232 |
-
choices=
|
| 233 |
label="步驟二:選擇配音員",
|
| 234 |
-
value=
|
|
|
|
| 235 |
)
|
| 236 |
|
| 237 |
default_speaker_gen_text_input = gr.Textbox(
|
|
@@ -244,26 +249,46 @@ with demo:
|
|
| 244 |
)
|
| 245 |
|
| 246 |
with gr.Column():
|
| 247 |
-
default_speaker_audio_output = gr.Audio(
|
|
|
|
|
|
|
| 248 |
|
| 249 |
with gr.Tab("自己當配音員"):
|
| 250 |
with gr.Row():
|
| 251 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
custom_speaker_language = gr.Dropdown(
|
| 253 |
-
choices=
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
)
|
| 257 |
|
| 258 |
custom_speaker_ref_text_input = gr.Textbox(
|
| 259 |
-
value=refs_config
|
| 260 |
-
|
| 261 |
-
|
|
|
|
| 262 |
label="步驟二:點選🎙️錄製下方句子,或上傳與句子相符的音檔",
|
| 263 |
)
|
| 264 |
|
| 265 |
custom_speaker_audio_input = gr.Audio(
|
| 266 |
type="filepath",
|
|
|
|
| 267 |
waveform_options=gr.WaveformOptions(
|
| 268 |
sample_rate=24000,
|
| 269 |
),
|
|
@@ -280,23 +305,25 @@ with demo:
|
|
| 280 |
)
|
| 281 |
|
| 282 |
with gr.Column():
|
| 283 |
-
custom_speaker_audio_output = gr.Audio(
|
|
|
|
|
|
|
| 284 |
|
| 285 |
-
|
| 286 |
-
lambda
|
| 287 |
-
choices=
|
| 288 |
-
value=
|
| 289 |
),
|
| 290 |
-
inputs=[
|
| 291 |
outputs=[default_speaker_refs],
|
| 292 |
)
|
| 293 |
|
| 294 |
@gpu_decorator
|
| 295 |
def default_speaker_tts(
|
| 296 |
-
language: str,
|
| 297 |
ref: str,
|
| 298 |
gen_text_input: str,
|
| 299 |
):
|
|
|
|
| 300 |
ref_text_input = refs_config[ref]["text"]
|
| 301 |
ref_audio_input = refs_config[ref]["wav"]
|
| 302 |
|
|
@@ -304,6 +331,9 @@ with demo:
|
|
| 304 |
if len(gen_text_input) == 0:
|
| 305 |
raise gr.Error("請勿輸入空字串。")
|
| 306 |
|
|
|
|
|
|
|
|
|
|
| 307 |
ignore_punctuation = False
|
| 308 |
ipa_with_ng = False
|
| 309 |
|
|
@@ -325,16 +355,25 @@ with demo:
|
|
| 325 |
default_speaker_generate_btn.click(
|
| 326 |
default_speaker_tts,
|
| 327 |
inputs=[
|
| 328 |
-
default_speaker_language,
|
| 329 |
default_speaker_refs,
|
| 330 |
default_speaker_gen_text_input,
|
| 331 |
],
|
| 332 |
outputs=[default_speaker_audio_output],
|
| 333 |
)
|
| 334 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
custom_speaker_language.change(
|
| 336 |
lambda lang: gr.Textbox(
|
| 337 |
-
value=refs_config
|
| 338 |
),
|
| 339 |
inputs=[custom_speaker_language],
|
| 340 |
outputs=[custom_speaker_ref_text_input],
|
|
@@ -358,6 +397,9 @@ with demo:
|
|
| 358 |
ignore_punctuation = False
|
| 359 |
ipa_with_ng = False
|
| 360 |
|
|
|
|
|
|
|
|
|
|
| 361 |
ref_text_input = text_to_ipa(
|
| 362 |
ref_text_input, language, ignore_punctuation, ipa_with_ng
|
| 363 |
)
|
|
|
|
| 1 |
+
import re
|
| 2 |
import tempfile
|
| 3 |
from importlib.resources import files
|
| 4 |
|
|
|
|
| 124 |
|
| 125 |
DEFAULT_MODEL_ID = list(models_config.keys())[0]
|
| 126 |
|
| 127 |
+
# Ethnicity names are the part of each g2p_object key before the first "_".
# De-duplicate with dict.fromkeys() rather than set(): iteration order of a
# set of strings changes between interpreter runs (hash randomization), which
# would reshuffle the dropdown choices on every restart. dict.fromkeys() keeps
# the first-seen order of g2p_object's keys.
ETHNICITIES = list(dict.fromkeys(k.split("_")[0] for k in g2p_object.keys()))
|
| 128 |
+
|
| 129 |
|
| 130 |
@gpu_decorator
|
| 131 |
def infer(
|
|
|
|
| 222 |
with gr.Tab("預設配音員"):
|
| 223 |
with gr.Row():
|
| 224 |
with gr.Column():
|
| 225 |
+
default_speaker_ethnicity = gr.Dropdown(
|
| 226 |
+
choices=ETHNICITIES,
|
| 227 |
+
label="步驟一:選擇族別",
|
| 228 |
+
value="阿美",
|
| 229 |
+
filterable=False,
|
| 230 |
)
|
| 231 |
|
| 232 |
+
def get_refs_by_perfix(prefix: str):
    """Return the keys of ``refs_config`` that start with ``prefix``.

    Keys are returned in the iteration order of ``refs_config``; the result
    is an empty list when no key matches.
    """
    # NOTE(review): "perfix" is a misspelling of "prefix"; the name is kept
    # because the call sites in this file use this spelling.
    matching_refs = []
    for ref_name in refs_config.keys():
        if ref_name.startswith(prefix):
            matching_refs.append(ref_name)
    return matching_refs
|
| 234 |
|
| 235 |
default_speaker_refs = gr.Dropdown(
|
| 236 |
+
choices=get_refs_by_perfix(default_speaker_ethnicity.value),
|
| 237 |
label="步驟二:選擇配音員",
|
| 238 |
+
value=get_refs_by_perfix(default_speaker_ethnicity.value)[0],
|
| 239 |
+
filterable=False,
|
| 240 |
)
|
| 241 |
|
| 242 |
default_speaker_gen_text_input = gr.Textbox(
|
|
|
|
| 249 |
)
|
| 250 |
|
| 251 |
with gr.Column():
|
| 252 |
+
default_speaker_audio_output = gr.Audio(
|
| 253 |
+
label="合成結果", show_share_button=False, show_download_button=True
|
| 254 |
+
)
|
| 255 |
|
| 256 |
with gr.Tab("自己當配音員"):
|
| 257 |
with gr.Row():
|
| 258 |
with gr.Column():
|
| 259 |
+
custom_speaker_ethnicity = gr.Dropdown(
|
| 260 |
+
choices=ETHNICITIES,
|
| 261 |
+
label="步驟一:選擇族別與語別",
|
| 262 |
+
value="阿美",
|
| 263 |
+
filterable=False,
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
custom_speaker_language = gr.Dropdown(
|
| 267 |
+
choices=[
|
| 268 |
+
k
|
| 269 |
+
for k in g2p_object.keys()
|
| 270 |
+
if k.startswith(custom_speaker_ethnicity.value)
|
| 271 |
+
],
|
| 272 |
+
value=[
|
| 273 |
+
k
|
| 274 |
+
for k in g2p_object.keys()
|
| 275 |
+
if k.startswith(custom_speaker_ethnicity.value)
|
| 276 |
+
][0],
|
| 277 |
+
filterable=False,
|
| 278 |
+
show_label=False,
|
| 279 |
)
|
| 280 |
|
| 281 |
custom_speaker_ref_text_input = gr.Textbox(
|
| 282 |
+
value=refs_config[
|
| 283 |
+
get_refs_by_perfix(custom_speaker_language.value)[0]
|
| 284 |
+
]["text"],
|
| 285 |
+
interactive=False,
|
| 286 |
label="步驟二:點選🎙️錄製下方句子,或上傳與句子相符的音檔",
|
| 287 |
)
|
| 288 |
|
| 289 |
custom_speaker_audio_input = gr.Audio(
|
| 290 |
type="filepath",
|
| 291 |
+
sources=["microphone", "upload"],
|
| 292 |
waveform_options=gr.WaveformOptions(
|
| 293 |
sample_rate=24000,
|
| 294 |
),
|
|
|
|
| 305 |
)
|
| 306 |
|
| 307 |
with gr.Column():
|
| 308 |
+
custom_speaker_audio_output = gr.Audio(
|
| 309 |
+
label="合成結果", show_share_button=False, show_download_button=True
|
| 310 |
+
)
|
| 311 |
|
| 312 |
+
def _update_default_speaker_refs(ethnicity: str):
    """Rebuild the speaker dropdown for the newly selected ethnicity.

    Returns a ``gr.Dropdown`` update whose choices are the reference keys
    matching ``ethnicity`` and whose value is the first of them.
    """
    refs = get_refs_by_perfix(ethnicity)
    # The ethnicity choices are derived from g2p_object while the speakers come
    # from refs_config, so a match is not guaranteed. Fall back to an empty
    # selection instead of raising IndexError on refs[0], which would leave the
    # previous ethnicity's speakers in the dropdown.
    return gr.Dropdown(
        choices=refs,
        value=refs[0] if refs else None,
    )


# Keep the speaker list in sync with the selected ethnicity.
default_speaker_ethnicity.change(
    _update_default_speaker_refs,
    inputs=[default_speaker_ethnicity],
    outputs=[default_speaker_refs],
)
|
| 320 |
|
| 321 |
@gpu_decorator
|
| 322 |
def default_speaker_tts(
|
|
|
|
| 323 |
ref: str,
|
| 324 |
gen_text_input: str,
|
| 325 |
):
|
| 326 |
+
language = re.sub(r"_[男女]聲[12]", "", ref)
|
| 327 |
ref_text_input = refs_config[ref]["text"]
|
| 328 |
ref_audio_input = refs_config[ref]["wav"]
|
| 329 |
|
|
|
|
| 331 |
if len(gen_text_input) == 0:
|
| 332 |
raise gr.Error("請勿輸入空字串。")
|
| 333 |
|
| 334 |
+
if gen_text_input[-1] not in [".", "?", "!", ",", ";", ":"]:
|
| 335 |
+
gen_text_input += "."
|
| 336 |
+
|
| 337 |
ignore_punctuation = False
|
| 338 |
ipa_with_ng = False
|
| 339 |
|
|
|
|
| 355 |
default_speaker_generate_btn.click(
|
| 356 |
default_speaker_tts,
|
| 357 |
inputs=[
|
|
|
|
| 358 |
default_speaker_refs,
|
| 359 |
default_speaker_gen_text_input,
|
| 360 |
],
|
| 361 |
outputs=[default_speaker_audio_output],
|
| 362 |
)
|
| 363 |
|
| 364 |
+
def _update_custom_speaker_language(ethnicity: str):
    """Rebuild the language dropdown for the newly selected ethnicity.

    The choices are the ``g2p_object`` keys that start with ``ethnicity``; the
    first one becomes the selected value. The dropdown is hidden when there is
    only a single choice.
    """
    # Compute the matching keys once instead of once per keyword argument.
    languages = [k for k in g2p_object.keys() if k.startswith(ethnicity)]
    return gr.Dropdown(
        choices=languages,
        value=languages[0],
        visible=len(languages) > 1,
    )


# Keep the language list in sync with the selected ethnicity.
custom_speaker_ethnicity.change(
    _update_custom_speaker_language,
    inputs=[custom_speaker_ethnicity],
    outputs=[custom_speaker_language],
)
|
| 373 |
+
|
| 374 |
custom_speaker_language.change(
|
| 375 |
lambda lang: gr.Textbox(
|
| 376 |
+
value=refs_config[get_refs_by_perfix(lang)[0]]["text"],
|
| 377 |
),
|
| 378 |
inputs=[custom_speaker_language],
|
| 379 |
outputs=[custom_speaker_ref_text_input],
|
|
|
|
| 397 |
ignore_punctuation = False
|
| 398 |
ipa_with_ng = False
|
| 399 |
|
| 400 |
+
if gen_text_input[-1] not in [".", "?", "!", ",", ";", ":"]:
|
| 401 |
+
gen_text_input += "."
|
| 402 |
+
|
| 403 |
ref_text_input = text_to_ipa(
|
| 404 |
ref_text_input, language, ignore_punctuation, ipa_with_ng
|
| 405 |
)
|