Dionyssos committed on
Commit 9b2426f · 1 Parent(s): 6c9a684

Show TTS output in the Expression tab

Files changed (1)
app.py  +61 -27
app.py CHANGED
@@ -466,10 +466,11 @@ def audionar_tts(text=None,
    }

    if text and text.strip():
-        if 'wav/' in lang:
-            # call StyleTTS2
-            speech_audio = _styletts2(text=text,
-                                      ref_s=lang)
+
+        if lang not in language_names:
+
+            speech_audio = _styletts2(text=text,  # Eng.
+                                      ref_s='wav/' + lang + '.wav')

        else:  # VITS

@@ -555,7 +556,7 @@ def audionar_tts(text=None,
    wavfile = '_vits_.wav'
    audiofile.write(wavfile, final_audio, 16000)

-    return wavfile
+    return wavfile, wavfile  # 2x file for [audio out & state to pass to the Emotion reco tAB]


# -- EXPRESSIO
@@ -1338,10 +1339,6 @@ def _stylett2(text='Hallov worlds Far over the',

    return speech_audio

-def update_selected_voice(voice_filename):
-    return 'wav/' + voice_filename + '.wav'
-
-
description = (
    "Estimate **age**, **gender**, and **expression** "
    "of the speaker contained in an audio file or microphone recording. \n"
@@ -1353,8 +1350,21 @@ description = (
    "recognises the expression dimensions arousal, dominance, and valence. "
)

+# =============

with gr.Blocks(theme='huggingface') as demo:
+    # This state will be used to hold the generated TTS file path
+    tts_file = gr.State(value=None)
+    # This state will hold the list of examples, including the generated one
+    audio_examples_state = gr.State(
+        value=[
+            ["wav/female-46-neutral.wav"],
+            ["wav/female-20-happy.wav"],
+            ["wav/male-60-angry.wav"],
+            ["wav/male-27-sad.wav"],
+        ]
+    )
+
    with gr.Tab(label="TTS"):
        with gr.Row():
            text_input = gr.Textbox(
@@ -1363,13 +1373,10 @@ with gr.Blocks(theme='huggingface') as demo:
                lines=4,
                value="Farover the misty mountains cold too dungeons deep and caverns old.",
            )
-            # Unified dropdown for both voices and languages
-            # You'll need to handle the logic to determine if it's a voice or a language
-            # based on the selection. A single list of choices is used here.
            choice_dropdown = gr.Dropdown(
                choices=language_names + VOICES,
                label="Select Voice or Language",
-                value=VOICES[0]  # Set a default value
+                value=VOICES[0]
            )
            soundscape_input = gr.Textbox(
                lines=1,
@@ -1384,31 +1391,45 @@ with gr.Blocks(theme='huggingface') as demo:

            output_audio = gr.Audio(label="TTS Output")

+            def generate_and_update_state(text, choice, soundscape, kv, current_examples):
+                # This function calls the TTS and updates the state
+                audio_path = audionar_tts(text, choice, soundscape, kv)
+
+                # Append the new audio path to the existing list of examples
+                updated_examples = current_examples + [[audio_path]]
+
+                # Return the generated audio path for the output and the updated list for the state
+                return audio_path, updated_examples
+
            generate_button.click(
-                fn=audionar_tts,
-                inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
-                outputs=output_audio
+                fn=generate_and_update_state,
+                inputs=[text_input, choice_dropdown, soundscape_input, kv_input, audio_examples_state],
+                outputs=[output_audio, audio_examples_state]
            )
+
    with gr.Tab(label="Speech Analysis"):
        with gr.Row():
            with gr.Column():
                gr.Markdown(description)
-                input = gr.Audio(
+                input_audio_analysis = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Audio input",
-                    min_length=0.025,  # seconds
+                    min_length=0.025,
                )
-                gr.Examples(
-                    [
-                        "wav/female-46-neutral.wav",
-                        "wav/female-20-happy.wav",
-                        "wav/male-60-angry.wav",
-                        "wav/male-27-sad.wav",
+
+                # The gr.Examples component that will be dynamically updated
+                audio_examples = gr.Examples(
+                    examples=[
+                        ["wav/female-46-neutral.wav"],
+                        ["wav/female-20-happy.wav"],
+                        ["wav/male-60-angry.wav"],
+                        ["wav/male-27-sad.wav"],
                    ],
-                    [input],
+                    inputs=[input_audio_analysis],
                    label="Examples from CREMA-D, ODbL v1.0 license",
                )
+
                gr.Markdown("Only the first two seconds of the audio will be processed.")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
@@ -1417,6 +1438,19 @@ with gr.Blocks(theme='huggingface') as demo:
                output_expression = gr.Image(label="Expression")

        outputs = [output_age, output_gender, output_expression]
-        submit_btn.click(recognize, input, outputs)

-demo.launch(debug=True)
+        # Function to update the examples from the state
+        def load_examples_from_state(examples_list):
+            return gr.Examples.update(examples=examples_list)
+
+        # This is the key: an event listener that triggers when the tab is selected
+        demo.load(
+            fn=load_examples_from_state,
+            inputs=audio_examples_state,
+            outputs=audio_examples,
+            queue=False,
+        )
+
+        submit_btn.click(recognize, input_audio_analysis, outputs)
+
+demo.launch(debug=True)
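
For reference, below is a minimal, self-contained sketch of the pattern this commit introduces: the TTS click handler returns both the synthesized file and an updated example list kept in gr.State, and the Speech Analysis tab's example table is then refreshed from that state. It is a sketch under assumptions, not the committed code: fake_tts, generate, refresh, rows_state and the two-row seed list are illustrative stand-ins (app.py uses audionar_tts and four CREMA-D clips), and it assumes a Gradio 4.x install in which gr.Examples exposes its underlying gr.Dataset and a callback can replace that dataset's rows by returning gr.Dataset(samples=...).

import os
import shutil
import tempfile

import gradio as gr

SEED_EXAMPLES = [["wav/female-46-neutral.wav"], ["wav/male-27-sad.wav"]]


def fake_tts(text):
    """Stand-in for audionar_tts(): copies a bundled clip instead of synthesising."""
    fd, out = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    shutil.copy(SEED_EXAMPLES[0][0], out)
    return out


with gr.Blocks() as demo:
    # Example rows for the analysis tab, kept in per-session state so that
    # every generated clip can be appended to them.
    rows_state = gr.State(value=list(SEED_EXAMPLES))

    with gr.Tab("TTS"):
        text_in = gr.Textbox(lines=2, label="Text")
        generate_btn = gr.Button("Generate")
        tts_out = gr.Audio(label="TTS Output", type="filepath")

    with gr.Tab("Speech Analysis"):
        analysis_in = gr.Audio(sources=["upload"], type="filepath", label="Audio input")
        examples = gr.Examples(examples=list(SEED_EXAMPLES), inputs=[analysis_in])

    def generate(text, rows):
        # Synthesise, then append the new file as one more example row.
        path = fake_tts(text)
        return path, rows + [[path]]

    def refresh(rows):
        # Assumption: replacing the samples of the gr.Dataset behind gr.Examples
        # is how the example table is updated in Gradio 4.x.
        return gr.Dataset(samples=rows)

    generate_btn.click(
        generate, inputs=[text_in, rows_state], outputs=[tts_out, rows_state]
    ).then(refresh, inputs=rows_state, outputs=examples.dataset)

demo.launch()

Chaining the refresh with .then() shows the new clip right after generation; the committed version drives the refresh from demo.load instead, so the appended example appears the next time the page is loaded.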