show TTS into Expression tab
app.py CHANGED
@@ -466,10 +466,11 @@ def audionar_tts(text=None,
     }
 
     if text and text.strip():
-
-
-
-
+
+        if lang not in language_names:
+
+            speech_audio = _styletts2(text=text, # Eng.
+                                      ref_s='wav/' + lang + '.wav')
 
         else: # VITS
 
@@ -555,7 +556,7 @@ def audionar_tts(text=None,
     wavfile = '_vits_.wav'
     audiofile.write(wavfile, final_audio, 16000)
 
-    return wavfile
+    return wavfile, wavfile # 2x file for [audio out & state to pass to the Emotion reco tAB]
 
 
 # -- EXPRESSIO
@@ -1338,10 +1339,6 @@ def _stylett2(text='Hallov worlds Far over the',
 
     return speech_audio
 
-def update_selected_voice(voice_filename):
-    return 'wav/' + voice_filename + '.wav'
-
-
 description = (
     "Estimate **age**, **gender**, and **expression** "
     "of the speaker contained in an audio file or microphone recording. \n"
@@ -1353,8 +1350,21 @@ description = (
     "recognises the expression dimensions arousal, dominance, and valence. "
 )
 
+# =============
 
 with gr.Blocks(theme='huggingface') as demo:
+    # This state will be used to hold the generated TTS file path
+    tts_file = gr.State(value=None)
+    # This state will hold the list of examples, including the generated one
+    audio_examples_state = gr.State(
+        value=[
+            ["wav/female-46-neutral.wav"],
+            ["wav/female-20-happy.wav"],
+            ["wav/male-60-angry.wav"],
+            ["wav/male-27-sad.wav"],
+        ]
+    )
+
     with gr.Tab(label="TTS"):
         with gr.Row():
             text_input = gr.Textbox(
@@ -1363,13 +1373,10 @@ with gr.Blocks(theme='huggingface') as demo:
                 lines=4,
                 value="Farover the misty mountains cold too dungeons deep and caverns old.",
             )
-            # Unified dropdown for both voices and languages
-            # You'll need to handle the logic to determine if it's a voice or a language
-            # based on the selection. A single list of choices is used here.
             choice_dropdown = gr.Dropdown(
                 choices=language_names + VOICES,
                 label="Select Voice or Language",
-                value=VOICES[0]
+                value=VOICES[0]
             )
             soundscape_input = gr.Textbox(
                 lines=1,
@@ -1384,31 +1391,45 @@ with gr.Blocks(theme='huggingface') as demo:
 
             output_audio = gr.Audio(label="TTS Output")
 
+            def generate_and_update_state(text, choice, soundscape, kv, current_examples):
+                # This function calls the TTS and updates the state
+                audio_path = audionar_tts(text, choice, soundscape, kv)
+
+                # Append the new audio path to the existing list of examples
+                updated_examples = current_examples + [[audio_path]]
+
+                # Return the generated audio path for the output and the updated list for the state
+                return audio_path, updated_examples
+
             generate_button.click(
-                fn=
-                inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
-                outputs=output_audio
+                fn=generate_and_update_state,
+                inputs=[text_input, choice_dropdown, soundscape_input, kv_input, audio_examples_state],
+                outputs=[output_audio, audio_examples_state]
            )
+
    with gr.Tab(label="Speech Analysis"):
        with gr.Row():
            with gr.Column():
                gr.Markdown(description)
-
+                input_audio_analysis = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Audio input",
-                    min_length=0.025,
+                    min_length=0.025,
                )
-
-
-
-
-                    "wav/
-                    "wav/
+
+                # The gr.Examples component that will be dynamically updated
+                audio_examples = gr.Examples(
+                    examples=[
+                        ["wav/female-46-neutral.wav"],
+                        ["wav/female-20-happy.wav"],
+                        ["wav/male-60-angry.wav"],
+                        ["wav/male-27-sad.wav"],
                    ],
-                    [
+                    inputs=[input_audio_analysis],
                    label="Examples from CREMA-D, ODbL v1.0 license",
                )
+
                gr.Markdown("Only the first two seconds of the audio will be processed.")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
@@ -1417,6 +1438,19 @@ with gr.Blocks(theme='huggingface') as demo:
                output_expression = gr.Image(label="Expression")
 
                outputs = [output_age, output_gender, output_expression]
-                submit_btn.click(recognize, input, outputs)
 
-
+    # Function to update the examples from the state
+    def load_examples_from_state(examples_list):
+        return gr.Examples.update(examples=examples_list)
+
+    # This is the key: an event listener that triggers when the tab is selected
+    demo.load(
+        fn=load_examples_from_state,
+        inputs=audio_examples_state,
+        outputs=audio_examples,
+        queue=False,
+    )
+
+    submit_btn.click(recognize, input_audio_analysis, outputs)
+
+demo.launch(debug=True)
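Taken together, the change hands the generated TTS file over to shared state so the Speech Analysis tab can reuse it: audionar_tts returns the wav path a second time for that purpose, the Generate click writes both an audio output and a gr.State list, and demo.load tries to refresh the gr.Examples from that state. Below is a minimal, standalone sketch of the same gr.State hand-off pattern; it is not the app.py code. synthesize() and analyze() are placeholders for audionar_tts() and recognize(), numpy/soundfile are used only so Gradio has a real file to serve, and the sources= keyword on gr.Audio assumes Gradio 4.x. The gr.Examples refresh from the diff is not reproduced here.

import gradio as gr
import numpy as np
import soundfile as sf


def synthesize(text, history):
    # Placeholder "TTS": write one second of a 440 Hz tone so there is a real file to serve.
    sr = 16000
    tone = 0.1 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr)
    wav_path = "_sketch_.wav"
    sf.write(wav_path, tone, sr)
    # Return the path twice: once for the audio player, once appended to the shared state.
    return wav_path, history + [wav_path]


def analyze(wav_path, history):
    # Placeholder analysis: just report what would be analysed.
    return f"would analyse {wav_path}; {len(history)} file(s) generated so far"


with gr.Blocks() as demo:
    history = gr.State(value=[])  # shared across tabs

    with gr.Tab("TTS"):
        text = gr.Textbox(lines=2, value="Far over the misty mountains cold")
        out = gr.Audio(label="TTS Output", type="filepath")
        gr.Button("Generate").click(synthesize, inputs=[text, history], outputs=[out, history])

    with gr.Tab("Speech Analysis"):
        audio_in = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio input")
        result = gr.Textbox(label="Result")
        gr.Button("Submit").click(analyze, inputs=[audio_in, history], outputs=result)

demo.launch()

The point of the pattern is that the same gr.State instance appears in both the inputs and outputs of the Generate click, so every generation updates the player and also leaves a record the analysis tab can read.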