Spaces: Running on L40S
Streaming agent #2
by SpicyqSama007 - opened

app.py CHANGED
@@ -2,6 +2,8 @@ import re
 import gradio as gr
 import numpy as np
 import os
+import io
+import wave
 import threading
 import subprocess
 import sys
@@ -52,6 +54,17 @@ class ChatState:
 def clear_fn():
     return [], ChatState(), None, None, None
 
+def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
+    buffer = io.BytesIO()
+
+    with wave.open(buffer, "wb") as wav_file:
+        wav_file.setnchannels(channels)
+        wav_file.setsampwidth(bit_depth // 8)
+        wav_file.setframerate(sample_rate)
+
+    wav_header_bytes = buffer.getvalue()
+    buffer.close()
+    return wav_header_bytes
 
 async def process_audio_input(
     sys_audio_input, sys_text_input, audio_input, state: ChatState, text_input: str
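A note on the new helper: `wav_chunk_header` writes a zero-frame WAV file into an in-memory buffer purely to capture the 44-byte RIFF header that the `wave` module emits on close. The size fields in that header read as zero, which players generally tolerate for streamed audio since they keep decoding until the stream ends. A standalone sketch of the idea; the asserts and test tone are illustrative additions, not part of the Space:

```python
import io
import wave

import numpy as np


def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
    # Closing a zero-frame wave file still flushes a complete header;
    # its data-size fields read as zero, which streaming players tolerate.
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(bit_depth // 8)
        wav_file.setframerate(sample_rate)
    return buffer.getvalue()


header = wav_chunk_header()
assert header[:4] == b"RIFF" and header[8:12] == b"WAVE"
assert len(header) == 44  # standard PCM WAV header size

# Prepending the header turns a bare PCM chunk into a playable mini-WAV,
# which is exactly what the SPEECH_SEGMENT handler below does per frame.
tone = (np.sin(np.linspace(0, 2 * np.pi * 440, 4410)) * 3000).astype(np.int16)
chunk = header + tone.tobytes()
```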
@@ -72,11 +85,9 @@ async def process_audio_input(
 
     if isinstance(sys_audio_input, tuple):
        sr, sys_audio_data = sys_audio_input
-
+    else:
        sr = 44100
        sys_audio_data = None
-    else:
-        raise gr.Error("Invalid audio format")
 
    def append_to_chat_ctx(
        part: ServeTextPart | ServeVQPart, role: str = "assistant"
@@ -106,22 +117,16 @@ async def process_audio_input(
     ):
        if event.type == FishE2EEventType.USER_CODES:
            append_to_chat_ctx(ServeVQPart(codes=event.vq_codes), role="user")
+
        elif event.type == FishE2EEventType.SPEECH_SEGMENT:
-            result_audio += event.frame.data
-            np_audio = np.frombuffer(result_audio, dtype=np.int16)
            append_to_chat_ctx(ServeVQPart(codes=event.vq_codes))
-
-
+            yield state.get_history(), wav_chunk_header() + event.frame.data, None, None
+
        elif event.type == FishE2EEventType.TEXT_SEGMENT:
            append_to_chat_ctx(ServeTextPart(text=event.text))
-
-            np_audio = np.frombuffer(result_audio, dtype=np.int16)
-            yield state.get_history(), (44100, np_audio), None, None
-        else:
-            yield state.get_history(), None, None, None
 
-
-    yield state.get_history(), (44100, np_audio), None, None
+        yield state.get_history(), None, None, None
+
+    yield state.get_history(), None, None, None
 
 
 async def process_text_input(
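The behavioural change in the two hunks above: the old handler accumulated every frame into `result_audio` and yielded a single `(44100, np_audio)` tuple, while the new one yields each speech segment the moment it arrives, prefixed with a WAV header so the front end can decode it standalone. A stripped-down sketch of that pattern; `fake_segments` is a hypothetical stand-in for the agent's `SPEECH_SEGMENT` events:

```python
import asyncio
import io
import wave

import numpy as np


def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
    # Same helper the diff adds: capture a zero-frame WAV header.
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(bit_depth // 8)
        wav_file.setframerate(sample_rate)
    return buffer.getvalue()


async def fake_segments():
    # Hypothetical stand-in for FishE2EEventType.SPEECH_SEGMENT frames:
    # three bursts of 16-bit mono PCM at 44.1 kHz (silence, for brevity).
    for _ in range(3):
        await asyncio.sleep(0.1)
        yield np.zeros(4410, dtype=np.int16).tobytes()


async def stream_audio():
    # New pattern: emit header + PCM per segment instead of buffering
    # everything and yielding one array at the end.
    async for pcm in fake_segments():
        yield wav_chunk_header() + pcm


async def main():
    async for chunk in stream_audio():
        print(f"streamed {len(chunk)} bytes")


asyncio.run(main())
```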
@@ -179,7 +184,12 @@ def create_demo():
 
     text_input = gr.Textbox(label="Or type your message", type="text",value="Can you give a brief introduction of yourself?")
 
-    output_audio = gr.Audio(
+    output_audio = gr.Audio(
+        label="Assistant's Voice",
+        streaming=True,
+        autoplay=True,
+        interactive=False,
+    )
 
     send_button = gr.Button("Send", variant="primary")
     clear_button = gr.Button("Clear")
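On the UI side, the rebuilt `output_audio` is what makes the byte chunks playable: `streaming=True` lets the component consume chunks as the handler yields them, `autoplay=True` starts playback without a click, and `interactive=False` keeps it output-only. A minimal app with the same component settings, assuming a Gradio version that (like this Space) accepts raw WAV bytes for streaming audio output; the tone generator and Play button are illustrative only:

```python
import io
import wave

import gradio as gr
import numpy as np


def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
    # Same helper the diff adds: capture a zero-frame WAV header.
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(bit_depth // 8)
        wav_file.setframerate(sample_rate)
    return buffer.getvalue()


def tone_chunks():
    # Five half-second 440 Hz bursts, each a self-contained WAV blob.
    sr = 44100
    t = np.linspace(0, 0.5, sr // 2, endpoint=False)
    pcm = (np.sin(2 * np.pi * 440 * t) * 3000).astype(np.int16).tobytes()
    for _ in range(5):
        yield wav_chunk_header(sr) + pcm


with gr.Blocks() as demo:
    output_audio = gr.Audio(
        label="Assistant's Voice",
        streaming=True,     # play chunks as the generator yields them
        autoplay=True,      # begin playback without user interaction
        interactive=False,  # output-only component
    )
    gr.Button("Play").click(tone_chunks, inputs=None, outputs=output_audio)

demo.launch()
```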