Spaces:
Running
Running
Gabriel Vidal-Ayrinhac
committed on
Commit
·
9d076e3
1
Parent(s):
50d4732
process audio in memory
Browse files
- src/hackathon/server/server.py +3 -15
- src/hackathon/speech/speech.py +7 -11
src/hackathon/server/server.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
import os
|
| 2 |
from typing import Annotated, Dict, List
|
| 3 |
|
| 4 |
from dotenv import load_dotenv
|
|
@@ -22,11 +21,7 @@ from hackathon.server.schemas import (
|
|
| 22 |
StartRequest,
|
| 23 |
StartResponse,
|
| 24 |
)
|
| 25 |
-
from hackathon.speech.speech import
|
| 26 |
-
read_audio_config,
|
| 27 |
-
read_audio_file,
|
| 28 |
-
text_to_speech_file,
|
| 29 |
-
)
|
| 30 |
|
| 31 |
load_dotenv()
|
| 32 |
|
|
@@ -149,7 +144,7 @@ async def infer(
|
|
| 149 |
current_speaker.update_emotions(input_text)
|
| 150 |
msg = current_speaker.respond(input_text)
|
| 151 |
|
| 152 |
-
|
| 153 |
text=msg,
|
| 154 |
voice_id=current_audio_config["voice_id"],
|
| 155 |
stability=current_audio_config["stability"],
|
|
@@ -158,9 +153,6 @@ async def infer(
|
|
| 158 |
base_path=str(data_folder),
|
| 159 |
)
|
| 160 |
|
| 161 |
-
audio_signal = read_audio_file(audio_file_path) # base64
|
| 162 |
-
os.remove(audio_file_path)
|
| 163 |
-
|
| 164 |
return {
|
| 165 |
"generated_text": msg,
|
| 166 |
"anger": current_speaker.emotions["anger"],
|
|
@@ -247,7 +239,7 @@ async def cards(
|
|
| 247 |
|
| 248 |
data_folder = game_engine.data_folder
|
| 249 |
|
| 250 |
-
|
| 251 |
text=msg,
|
| 252 |
voice_id=current_audio_config["voice_id"],
|
| 253 |
stability=current_audio_config["stability"],
|
|
@@ -256,10 +248,6 @@ async def cards(
|
|
| 256 |
base_path=str(data_folder),
|
| 257 |
)
|
| 258 |
|
| 259 |
-
audio_signal = read_audio_file(audio_file_path) # base64
|
| 260 |
-
|
| 261 |
-
os.remove(audio_file_path)
|
| 262 |
-
|
| 263 |
return {"presenter_question": msg, "audio": audio_signal}
|
| 264 |
|
| 265 |
|
|
|
|
|
|
|
| 1 |
from typing import Annotated, Dict, List
|
| 2 |
|
| 3 |
from dotenv import load_dotenv
|
|
|
|
| 21 |
StartRequest,
|
| 22 |
StartResponse,
|
| 23 |
)
|
| 24 |
+
from hackathon.speech.speech import read_audio_config, text_to_speech_file
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
load_dotenv()
|
| 27 |
|
|
|
|
| 144 |
current_speaker.update_emotions(input_text)
|
| 145 |
msg = current_speaker.respond(input_text)
|
| 146 |
|
| 147 |
+
audio_signal = text_to_speech_file(
|
| 148 |
text=msg,
|
| 149 |
voice_id=current_audio_config["voice_id"],
|
| 150 |
stability=current_audio_config["stability"],
|
|
|
|
| 153 |
base_path=str(data_folder),
|
| 154 |
)
|
| 155 |
|
|
|
|
|
|
|
|
|
|
| 156 |
return {
|
| 157 |
"generated_text": msg,
|
| 158 |
"anger": current_speaker.emotions["anger"],
|
|
|
|
| 239 |
|
| 240 |
data_folder = game_engine.data_folder
|
| 241 |
|
| 242 |
+
audio_signal = text_to_speech_file(
|
| 243 |
text=msg,
|
| 244 |
voice_id=current_audio_config["voice_id"],
|
| 245 |
stability=current_audio_config["stability"],
|
|
|
|
| 248 |
base_path=str(data_folder),
|
| 249 |
)
|
| 250 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
return {"presenter_question": msg, "audio": audio_signal}
|
| 252 |
|
| 253 |
|
src/hackathon/speech/speech.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
# Code copied from https://elevenlabs.io/docs/cookbooks/text-to-speech/streaming
|
| 2 |
|
| 3 |
import base64
|
| 4 |
-
import uuid
|
| 5 |
from io import BytesIO
|
| 6 |
from typing import IO
|
| 7 |
|
|
@@ -57,16 +56,13 @@ def text_to_speech_file(
|
|
| 57 |
),
|
| 58 |
)
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
print(f"{save_file_path}: audio file successfully saved !")
|
| 68 |
-
|
| 69 |
-
return save_file_path
|
| 70 |
|
| 71 |
|
| 72 |
def text_to_speech_stream(
|
|
|
|
| 1 |
# Code copied from https://elevenlabs.io/docs/cookbooks/text-to-speech/streaming
|
| 2 |
|
| 3 |
import base64
|
|
|
|
| 4 |
from io import BytesIO
|
| 5 |
from typing import IO
|
| 6 |
|
|
|
|
| 56 |
),
|
| 57 |
)
|
| 58 |
|
| 59 |
+
audio_data = BytesIO()
|
| 60 |
+
for chunk in response:
|
| 61 |
+
if chunk:
|
| 62 |
+
audio_data.write(chunk)
|
| 63 |
+
audio_data.seek(0)
|
| 64 |
+
audio_base64 = base64.b64encode(audio_data.read()).decode("utf-8")
|
| 65 |
+
return audio_base64
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
|
| 68 |
def text_to_speech_stream(
|