Spaces:

Mistral-AI-Game-Jam
/

Team15

Running

Gabriel Vidal-Ayrinhac committed on Jan 28

Commit

9d076e3

1 Parent(s): 50d4732

process audio in memory

Files changed (2) hide show

src/hackathon/server/server.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import os
 from typing import Annotated, Dict, List
 from dotenv import load_dotenv
@@ -22,11 +21,7 @@ from hackathon.server.schemas import (
     StartRequest,
     StartResponse,
 )
-from hackathon.speech.speech import (
-    read_audio_config,
-    read_audio_file,
-    text_to_speech_file,
-)
 load_dotenv()
@@ -149,7 +144,7 @@ async def infer(
     current_speaker.update_emotions(input_text)
     msg = current_speaker.respond(input_text)
-    audio_file_path = text_to_speech_file(
         text=msg,
         voice_id=current_audio_config["voice_id"],
         stability=current_audio_config["stability"],
@@ -158,9 +153,6 @@ async def infer(
         base_path=str(data_folder),
     )
-    audio_signal = read_audio_file(audio_file_path)  # base64
-    os.remove(audio_file_path)
     return {
         "generated_text": msg,
         "anger": current_speaker.emotions["anger"],
@@ -247,7 +239,7 @@ async def cards(
     data_folder = game_engine.data_folder
-    audio_file_path = text_to_speech_file(
         text=msg,
         voice_id=current_audio_config["voice_id"],
         stability=current_audio_config["stability"],
@@ -256,10 +248,6 @@ async def cards(
         base_path=str(data_folder),
     )
-    audio_signal = read_audio_file(audio_file_path)  # base64
-    os.remove(audio_file_path)
     return {"presenter_question": msg, "audio": audio_signal}

 from typing import Annotated, Dict, List
 from dotenv import load_dotenv
     StartRequest,
     StartResponse,
 )
+from hackathon.speech.speech import read_audio_config, text_to_speech_file
 load_dotenv()
     current_speaker.update_emotions(input_text)
     msg = current_speaker.respond(input_text)
+    audio_signal = text_to_speech_file(
         text=msg,
         voice_id=current_audio_config["voice_id"],
         stability=current_audio_config["stability"],
         base_path=str(data_folder),
     )
     return {
         "generated_text": msg,
         "anger": current_speaker.emotions["anger"],
     data_folder = game_engine.data_folder
+    audio_signal = text_to_speech_file(
         text=msg,
         voice_id=current_audio_config["voice_id"],
         stability=current_audio_config["stability"],
         base_path=str(data_folder),
     )
     return {"presenter_question": msg, "audio": audio_signal}

src/hackathon/speech/speech.py CHANGED Viewed

@@ -1,7 +1,6 @@
 # Code copied from https://elevenlabs.io/docs/cookbooks/text-to-speech/streaming
 import base64
-import uuid
 from io import BytesIO
 from typing import IO
@@ -57,16 +56,13 @@ def text_to_speech_file(
         ),
     )
-    save_file_path = f"./{uuid.uuid4()}.mp3"
-    with open(save_file_path, "wb") as f:
-        for chunk in response:
-            if chunk:
-                f.write(chunk)
-    print(f"{save_file_path}: audio file successfully saved !")
-    return save_file_path
 def text_to_speech_stream(

 # Code copied from https://elevenlabs.io/docs/cookbooks/text-to-speech/streaming
 import base64
 from io import BytesIO
 from typing import IO
         ),
     )
+    audio_data = BytesIO()
+    for chunk in response:
+        if chunk:
+            audio_data.write(chunk)
+    audio_data.seek(0)
+    audio_base64 = base64.b64encode(audio_data.read()).decode("utf-8")
+    return audio_base64
 def text_to_speech_stream(