Spaces:
Running
Running
only tts
Browse files- app.py +35 -48
- female-20-happy.wav +0 -0
- female-46-neutral.wav +0 -0
- male-27-sad.wav +0 -0
- male-60-angry.wav +0 -0
- requirements.txt +0 -1
app.py
CHANGED
|
@@ -62,7 +62,7 @@ def audionar_tts(text=None,
|
|
| 62 |
text = 'No Audio or Txt Input'
|
| 63 |
|
| 64 |
|
| 65 |
-
|
| 66 |
|
| 67 |
if lang not in language_names: # StyleTTS2
|
| 68 |
|
|
@@ -126,26 +126,26 @@ def audionar_tts(text=None,
|
|
| 126 |
|
| 127 |
# PAD
|
| 128 |
|
| 129 |
-
len_speech = len(
|
| 130 |
len_background = len(background_audio)
|
| 131 |
|
| 132 |
if len_background > len_speech:
|
| 133 |
padding = np.zeros(len_background - len_speech,
|
| 134 |
dtype=np.float32)
|
| 135 |
-
|
| 136 |
elif len_speech > len_background:
|
| 137 |
padding = np.zeros(len_speech - len_background,
|
| 138 |
dtype=np.float32)
|
| 139 |
background_audio = np.concatenate([background_audio, padding])
|
| 140 |
|
| 141 |
|
| 142 |
-
|
| 143 |
background_audio = background_audio[None, :]
|
| 144 |
|
| 145 |
|
| 146 |
final_audio = np.concatenate([
|
| 147 |
-
0.49 *
|
| 148 |
-
0.51 * background_audio + 0.49 *
|
| 149 |
], 0)
|
| 150 |
|
| 151 |
else:
|
|
@@ -155,7 +155,7 @@ def audionar_tts(text=None,
|
|
| 155 |
|
| 156 |
wavfile = '_vits_.wav'
|
| 157 |
audiofile.write(wavfile, final_audio, 16000)
|
| 158 |
-
return wavfile
|
| 159 |
|
| 160 |
|
| 161 |
|
|
@@ -495,46 +495,33 @@ VOICES = [t[:-4] for t in VOICES] # crop .wav for visuals in gr.DropDown
|
|
| 495 |
|
| 496 |
_tts = StyleTTS2().to('cpu')
|
| 497 |
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
value=24,
|
| 521 |
-
)
|
| 522 |
-
generate_button = gr.Button("Generate Audio", variant="primary")
|
| 523 |
-
|
| 524 |
-
output_audio = gr.Audio(label="TTS Output")
|
| 525 |
-
|
| 526 |
-
generate_button.click(
|
| 527 |
-
fn=audionar_tts,
|
| 528 |
-
inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
|
| 529 |
-
outputs=[output_audio]
|
| 530 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
|
| 532 |
-
|
| 533 |
-
with gr.Row():
|
| 534 |
-
with gr.Column():
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
gr.Markdown("Only the first two seconds of the audio will be processed.")
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
demo.launch(debug=True)
|
|
|
|
| 62 |
text = 'No Audio or Txt Input'
|
| 63 |
|
| 64 |
|
| 65 |
+
|
| 66 |
|
| 67 |
if lang not in language_names: # StyleTTS2
|
| 68 |
|
|
|
|
| 126 |
|
| 127 |
# PAD
|
| 128 |
|
| 129 |
+
len_speech = len(x)
|
| 130 |
len_background = len(background_audio)
|
| 131 |
|
| 132 |
if len_background > len_speech:
|
| 133 |
padding = np.zeros(len_background - len_speech,
|
| 134 |
dtype=np.float32)
|
| 135 |
+
x = np.concatenate([x, padding])
|
| 136 |
elif len_speech > len_background:
|
| 137 |
padding = np.zeros(len_speech - len_background,
|
| 138 |
dtype=np.float32)
|
| 139 |
background_audio = np.concatenate([background_audio, padding])
|
| 140 |
|
| 141 |
|
| 142 |
+
x = x[None, :]
|
| 143 |
background_audio = background_audio[None, :]
|
| 144 |
|
| 145 |
|
| 146 |
final_audio = np.concatenate([
|
| 147 |
+
0.49 * x + 0.51 * background_audio,
|
| 148 |
+
0.51 * background_audio + 0.49 * x
|
| 149 |
], 0)
|
| 150 |
|
| 151 |
else:
|
|
|
|
| 155 |
|
| 156 |
wavfile = '_vits_.wav'
|
| 157 |
audiofile.write(wavfile, final_audio, 16000)
|
| 158 |
+
return wavfile # 2x file for [audio out & state to pass to the Emotion reco tAB]
|
| 159 |
|
| 160 |
|
| 161 |
|
|
|
|
| 495 |
|
| 496 |
_tts = StyleTTS2().to('cpu')
|
| 497 |
|
| 498 |
+
demo = gr.Interface(
|
| 499 |
+
fn=audionar_tts,
|
| 500 |
+
inputs=[
|
| 501 |
+
gr.Textbox(
|
| 502 |
+
label="Type text for TTS:",
|
| 503 |
+
placeholder="Type Text for TTS",
|
| 504 |
+
lines=4,
|
| 505 |
+
value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
|
| 506 |
+
),
|
| 507 |
+
gr.Dropdown(
|
| 508 |
+
choices=language_names + VOICES,
|
| 509 |
+
label="TTS lang",
|
| 510 |
+
value=language_names[0]
|
| 511 |
+
),
|
| 512 |
+
gr.Textbox(
|
| 513 |
+
lines=1,
|
| 514 |
+
value="frogs",
|
| 515 |
+
label="AudioGen Txt"
|
| 516 |
+
),
|
| 517 |
+
gr.Number(
|
| 518 |
+
label="kv cache",
|
| 519 |
+
value=24,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
)
|
| 521 |
+
],
|
| 522 |
+
outputs=gr.Audio(label="TTS Output"),
|
| 523 |
+
title="TTS with a Simple Interface",
|
| 524 |
+
theme="huggingface"
|
| 525 |
+
)
|
| 526 |
|
| 527 |
+
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
female-20-happy.wav
DELETED
|
Binary file (51 kB)
|
|
|
female-46-neutral.wav
DELETED
|
Binary file (37.6 kB)
|
|
|
male-27-sad.wav
DELETED
|
Binary file (50.4 kB)
|
|
|
male-60-angry.wav
DELETED
|
Binary file (60.5 kB)
|
|
|
requirements.txt
CHANGED
|
@@ -3,7 +3,6 @@ nltk
|
|
| 3 |
librosa
|
| 4 |
phonemizer
|
| 5 |
audiofile
|
| 6 |
-
matplotlib
|
| 7 |
audresample
|
| 8 |
num2words
|
| 9 |
numpy<2.0.0
|
|
|
|
| 3 |
librosa
|
| 4 |
phonemizer
|
| 5 |
audiofile
|
|
|
|
| 6 |
audresample
|
| 7 |
num2words
|
| 8 |
numpy<2.0.0
|