Spaces:

Dionyssos
/

speech-analysis2

Running

App Files Files Community

Dionyssos committed on Sep 14

Commit

a183432

1 Parent(s): c63ff96

only tts

Browse files

Files changed (6) hide show

app.py +35 -48
female-20-happy.wav +0 -0
female-46-neutral.wav +0 -0
male-27-sad.wav +0 -0
male-60-angry.wav +0 -0
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -62,7 +62,7 @@ def audionar_tts(text=None,
         text = 'No Audio or Txt Input'
-    print(lang, lang in language_names)
     if lang not in language_names:  # StyleTTS2
@@ -126,26 +126,26 @@ def audionar_tts(text=None,
         # PAD
-        len_speech = len(speech_audio)
         len_background = len(background_audio)
         if len_background > len_speech:
             padding = np.zeros(len_background - len_speech,
                                 dtype=np.float32)
-            speech_audio = np.concatenate([speech_audio, padding])
         elif len_speech > len_background:
             padding = np.zeros(len_speech - len_background,
                                 dtype=np.float32)
             background_audio = np.concatenate([background_audio, padding])
-        speech_audio = speech_audio[None, :]
         background_audio = background_audio[None, :]
         final_audio = np.concatenate([
-            0.49 * speech_audio + 0.51 * background_audio,
-            0.51 * background_audio + 0.49 * speech_audio
         ], 0)
     else:
@@ -155,7 +155,7 @@ def audionar_tts(text=None,
     wavfile = '_vits_.wav'
     audiofile.write(wavfile, final_audio, 16000)
-    return wavfile, wavfile  # 2x file for [audio out & state to pass to the Emotion reco tAB]
@@ -495,46 +495,33 @@ VOICES = [t[:-4] for t in VOICES]  # crop .wav for visuals in gr.DropDown
 _tts = StyleTTS2().to('cpu')
-with gr.Blocks(theme='huggingface') as demo:
-    with gr.Tab(label="TTS"):
-        with gr.Row():
-            text_input = gr.Textbox(
-                label="Type text for TTS:",
-                placeholder="Type Text for TTS",
-                lines=4,
-                value="Farover the misty mountains cold too dungeons deep and caverns old.",
-            )
-            choice_dropdown = gr.Dropdown(
-                choices=language_names + VOICES,
-                label="Select Voice or Language",
-                value=VOICES[0]
-            )
-            soundscape_input = gr.Textbox(
-                lines=1,
-                value="frogs",
-                label="AudioGen Txt"
-            )
-            kv_input = gr.Number(
-                label="kv Period",
-                value=24,
-            )
-            generate_button = gr.Button("Generate Audio", variant="primary")
-        output_audio = gr.Audio(label="TTS Output")
-        generate_button.click(
-            fn=audionar_tts,
-            inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
-            outputs=[output_audio]
         )
-    with gr.Tab(label="API"):
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown("Only the first two seconds of the audio will be processed.")
-demo.launch(debug=True)

         text = 'No Audio or Txt Input'
     if lang not in language_names:  # StyleTTS2
         # PAD
+        len_speech = len(x)
         len_background = len(background_audio)
         if len_background > len_speech:
             padding = np.zeros(len_background - len_speech,
                                 dtype=np.float32)
+            x = np.concatenate([x, padding])
         elif len_speech > len_background:
             padding = np.zeros(len_speech - len_background,
                                 dtype=np.float32)
             background_audio = np.concatenate([background_audio, padding])
+        x = x[None, :]
         background_audio = background_audio[None, :]
         final_audio = np.concatenate([
+            0.49 * x + 0.51 * background_audio,
+            0.51 * background_audio + 0.49 * x
         ], 0)
     else:
     wavfile = '_vits_.wav'
     audiofile.write(wavfile, final_audio, 16000)
+    return wavfile  # 2x file for [audio out & state to pass to the Emotion reco tAB]
 _tts = StyleTTS2().to('cpu')
+demo = gr.Interface(
+    fn=audionar_tts,
+    inputs=[
+        gr.Textbox(
+            label="Type text for TTS:",
+            placeholder="Type Text for TTS",
+            lines=4,
+            value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
+        ),
+        gr.Dropdown(
+            choices=language_names + VOICES,
+            label="TTS lang",
+            value=language_names[0]
+        ),
+        gr.Textbox(
+            lines=1,
+            value="frogs",
+            label="AudioGen Txt"
+        ),
+        gr.Number(
+            label="kv cache",
+            value=24,
         )
+    ],
+    outputs=gr.Audio(label="TTS Output"),
+    title="TTS with a Simple Interface",
+    theme="huggingface"
+)
+demo.launch()

female-20-happy.wav DELETED Viewed

Binary file (51 kB)

female-46-neutral.wav DELETED Viewed

Binary file (37.6 kB)

male-27-sad.wav DELETED Viewed

Binary file (50.4 kB)

male-60-angry.wav DELETED Viewed

Binary file (60.5 kB)

requirements.txt CHANGED Viewed

@@ -3,7 +3,6 @@ nltk
 librosa
 phonemizer
 audiofile
-matplotlib
 audresample
 num2words
 numpy<2.0.0

 librosa
 phonemizer
 audiofile
 audresample
 num2words
 numpy<2.0.0