Spaces:
Running
Running
only tts
Browse files- app.py +35 -48
- female-20-happy.wav +0 -0
- female-46-neutral.wav +0 -0
- male-27-sad.wav +0 -0
- male-60-angry.wav +0 -0
- requirements.txt +0 -1
app.py
CHANGED
@@ -62,7 +62,7 @@ def audionar_tts(text=None,
|
|
62 |
text = 'No Audio or Txt Input'
|
63 |
|
64 |
|
65 |
-
|
66 |
|
67 |
if lang not in language_names: # StyleTTS2
|
68 |
|
@@ -126,26 +126,26 @@ def audionar_tts(text=None,
|
|
126 |
|
127 |
# PAD
|
128 |
|
129 |
-
len_speech = len(
|
130 |
len_background = len(background_audio)
|
131 |
|
132 |
if len_background > len_speech:
|
133 |
padding = np.zeros(len_background - len_speech,
|
134 |
dtype=np.float32)
|
135 |
-
|
136 |
elif len_speech > len_background:
|
137 |
padding = np.zeros(len_speech - len_background,
|
138 |
dtype=np.float32)
|
139 |
background_audio = np.concatenate([background_audio, padding])
|
140 |
|
141 |
|
142 |
-
|
143 |
background_audio = background_audio[None, :]
|
144 |
|
145 |
|
146 |
final_audio = np.concatenate([
|
147 |
-
0.49 *
|
148 |
-
0.51 * background_audio + 0.49 *
|
149 |
], 0)
|
150 |
|
151 |
else:
|
@@ -155,7 +155,7 @@ def audionar_tts(text=None,
|
|
155 |
|
156 |
wavfile = '_vits_.wav'
|
157 |
audiofile.write(wavfile, final_audio, 16000)
|
158 |
-
return wavfile
|
159 |
|
160 |
|
161 |
|
@@ -495,46 +495,33 @@ VOICES = [t[:-4] for t in VOICES] # crop .wav for visuals in gr.DropDown
|
|
495 |
|
496 |
_tts = StyleTTS2().to('cpu')
|
497 |
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
value=24,
|
521 |
-
)
|
522 |
-
generate_button = gr.Button("Generate Audio", variant="primary")
|
523 |
-
|
524 |
-
output_audio = gr.Audio(label="TTS Output")
|
525 |
-
|
526 |
-
generate_button.click(
|
527 |
-
fn=audionar_tts,
|
528 |
-
inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
|
529 |
-
outputs=[output_audio]
|
530 |
)
|
|
|
|
|
|
|
|
|
|
|
531 |
|
532 |
-
|
533 |
-
with gr.Row():
|
534 |
-
with gr.Column():
|
535 |
-
|
536 |
-
|
537 |
-
gr.Markdown("Only the first two seconds of the audio will be processed.")
|
538 |
-
|
539 |
-
|
540 |
-
demo.launch(debug=True)
|
|
|
62 |
text = 'No Audio or Txt Input'
|
63 |
|
64 |
|
65 |
+
|
66 |
|
67 |
if lang not in language_names: # StyleTTS2
|
68 |
|
|
|
126 |
|
127 |
# PAD
|
128 |
|
129 |
+
len_speech = len(x)
|
130 |
len_background = len(background_audio)
|
131 |
|
132 |
if len_background > len_speech:
|
133 |
padding = np.zeros(len_background - len_speech,
|
134 |
dtype=np.float32)
|
135 |
+
x = np.concatenate([x, padding])
|
136 |
elif len_speech > len_background:
|
137 |
padding = np.zeros(len_speech - len_background,
|
138 |
dtype=np.float32)
|
139 |
background_audio = np.concatenate([background_audio, padding])
|
140 |
|
141 |
|
142 |
+
x = x[None, :]
|
143 |
background_audio = background_audio[None, :]
|
144 |
|
145 |
|
146 |
final_audio = np.concatenate([
|
147 |
+
0.49 * x + 0.51 * background_audio,
|
148 |
+
0.51 * background_audio + 0.49 * x
|
149 |
], 0)
|
150 |
|
151 |
else:
|
|
|
155 |
|
156 |
wavfile = '_vits_.wav'
|
157 |
audiofile.write(wavfile, final_audio, 16000)
|
158 |
+
return wavfile # 2x file for [audio out & state to pass to the Emotion reco tAB]
|
159 |
|
160 |
|
161 |
|
|
|
495 |
|
496 |
_tts = StyleTTS2().to('cpu')
|
497 |
|
498 |
+
demo = gr.Interface(
|
499 |
+
fn=audionar_tts,
|
500 |
+
inputs=[
|
501 |
+
gr.Textbox(
|
502 |
+
label="Type text for TTS:",
|
503 |
+
placeholder="Type Text for TTS",
|
504 |
+
lines=4,
|
505 |
+
value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
|
506 |
+
),
|
507 |
+
gr.Dropdown(
|
508 |
+
choices=language_names + VOICES,
|
509 |
+
label="TTS lang",
|
510 |
+
value=language_names[0]
|
511 |
+
),
|
512 |
+
gr.Textbox(
|
513 |
+
lines=1,
|
514 |
+
value="frogs",
|
515 |
+
label="AudioGen Txt"
|
516 |
+
),
|
517 |
+
gr.Number(
|
518 |
+
label="kv cache",
|
519 |
+
value=24,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
520 |
)
|
521 |
+
],
|
522 |
+
outputs=gr.Audio(label="TTS Output"),
|
523 |
+
title="TTS with a Simple Interface",
|
524 |
+
theme="huggingface"
|
525 |
+
)
|
526 |
|
527 |
+
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
female-20-happy.wav
DELETED
Binary file (51 kB)
|
|
female-46-neutral.wav
DELETED
Binary file (37.6 kB)
|
|
male-27-sad.wav
DELETED
Binary file (50.4 kB)
|
|
male-60-angry.wav
DELETED
Binary file (60.5 kB)
|
|
requirements.txt
CHANGED
@@ -3,7 +3,6 @@ nltk
|
|
3 |
librosa
|
4 |
phonemizer
|
5 |
audiofile
|
6 |
-
matplotlib
|
7 |
audresample
|
8 |
num2words
|
9 |
numpy<2.0.0
|
|
|
3 |
librosa
|
4 |
phonemizer
|
5 |
audiofile
|
|
|
6 |
audresample
|
7 |
num2words
|
8 |
numpy<2.0.0
|