Update app.py
app.py CHANGED
@@ -30,7 +30,7 @@ import gradio as gr
 import librosa
 import numpy as np
 import requests
-from gradio_webrtc import StreamHandler, WebRTC
+from gradio_webrtc import ReplyOnPause, WebRTC
 from huggingface_hub import snapshot_download
 from pydub import AudioSegment
 from twilio.rest import Client
@@ -67,102 +67,13 @@ if account_sid and auth_token:
 else:
     rtc_configuration = None
 
-# recording parameters
-IN_CHANNELS = 1
-IN_RATE = 24000
-IN_CHUNK = 1024
-IN_SAMPLE_WIDTH = 2
-VAD_STRIDE = 0.5
-
-# playing parameters
 OUT_CHANNELS = 1
 OUT_RATE = 24000
 OUT_SAMPLE_WIDTH = 2
 OUT_CHUNK = 20 * 4096
 
 
-def run_vad(ori_audio, sr):
-    _st = time.time()
-    try:
-        audio = ori_audio
-        audio = audio.astype(np.float32) / 32768.0
-        sampling_rate = 16000
-        if sr != sampling_rate:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
-
-        vad_parameters = {}
-        vad_parameters = VadOptions(**vad_parameters)
-        speech_chunks = get_speech_timestamps(audio, vad_parameters)
-        audio = collect_chunks(audio, speech_chunks)
-        duration_after_vad = audio.shape[0] / sampling_rate
-
-        if sr != sampling_rate:
-            # resample to original sampling rate
-            vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
-        else:
-            vad_audio = audio
-        vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
-        vad_audio_bytes = vad_audio.tobytes()
-
-        return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
-    except Exception as e:
-        msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {traceback.format_exc()}"
-        print(msg)
-        return -1, ori_audio, round(time.time() - _st, 4)
-
-
-def warm_up():
-    frames = np.zeros((1, 1600)) # 1024 frames of 2 bytes each
-    _, frames, tcost = run_vad(frames, 16000)
-    print(f"warm up done, time_cost: {tcost:.3f} s")
-
-
-# warm_up()
-
-
-@dataclass
-class AppState:
-    stream: np.ndarray | None = None
-    sampling_rate: int = 0
-    pause_detected: bool = False
-    started_talking: bool = False
-    responding: bool = False
-    stopped: bool = False
-    buffer: np.ndarray | None = None
-
-
-def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
-    """Take in the stream, determine if a pause happened"""
-    duration = len(audio) / sampling_rate
-
-    dur_vad, _, _ = run_vad(audio, sampling_rate)
-
-    if duration >= 0.60:
-        if dur_vad > 0.2 and not state.started_talking:
-            print("started talking")
-            state.started_talking = True
-        if state.started_talking:
-            if state.stream is None:
-                state.stream = audio
-            else:
-                state.stream = np.concatenate((state.stream, audio))
-            state.buffer = None
-        if dur_vad < 0.1 and state.started_talking:
-            segment = AudioSegment(
-                state.stream.tobytes(),
-                frame_rate=sampling_rate,
-                sample_width=audio.dtype.itemsize,
-                channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
-            )
-
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-                segment.export(f.name, format="wav")
-                print("input file written", f.name)
-            return True
-    return False
-
-
-def speaking(audio_bytes: str):
+def speaking(audio_bytes: bytes):
     base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
     files = {"audio": base64_encoded}
     byte_buffer = b""
@@ -194,73 +105,24 @@ def speaking(audio_bytes: str):
         raise gr.Error(f"Error during audio streaming: {e}")
 
 
-def process_audio(audio: tuple, state: AppState) -> None:
-    frame_rate, array = audio
-    array = np.squeeze(array)
-    if not state.sampling_rate:
-        state.sampling_rate = frame_rate
-    if state.buffer is None:
-        state.buffer = array
-    else:
-        state.buffer = np.concatenate((state.buffer, array))
-
-    pause_detected = determine_pause(state.buffer, state.sampling_rate, state)
-    state.pause_detected = pause_detected
-
 
-def response(state: AppState):
-
-
+def response(audio: tuple[int, np.ndarray]):
+    sampling_rate, audio_np = audio
+    audio_np = audio_np.squeeze()
 
     audio_buffer = io.BytesIO()
     segment = AudioSegment(
-        state.stream.tobytes(),
-        frame_rate=state.sampling_rate,
-        sample_width=state.stream.dtype.itemsize,
-        channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
-    )
+        audio_np.tobytes(),
+        frame_rate=sampling_rate,
+        sample_width=audio_np.dtype.itemsize,
+        channels=1)
+
     segment.export(audio_buffer, format="wav")
 
     for numpy_array in speaking(audio_buffer.getvalue()):
         yield (OUT_RATE, numpy_array, "mono")
 
 
-class OmniHandler(StreamHandler):
-    def __init__(self) -> None:
-        super().__init__(
-            expected_layout="mono", output_sample_rate=OUT_RATE, output_frame_size=480
-        )
-        self.event = Event()
-        self.state = AppState()
-        self.generator = None
-        self.duration = 0
-
-    def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        if self.state.responding:
-            return
-        process_audio(frame, self.state)
-        if self.state.pause_detected:
-            self.event.set()
-
-    def reset(self):
-        self.generator = None
-        self.event.clear()
-        self.state = AppState()
-        self.duration = 0
-
-    def emit(self):
-        if not self.event.is_set():
-            return None
-        else:
-            if not self.generator:
-                self.generator = response(self.state)
-                self.state.responding = True
-            try:
-                return next(self.generator)
-            except StopIteration:
-                self.reset()
-
-
 with gr.Blocks() as demo:
     gr.HTML(
         """
@@ -277,7 +139,7 @@ with gr.Blocks() as demo:
             mode="send-receive",
             modality="audio",
         )
-        audio.stream(fn=OmniHandler(), inputs=[audio], outputs=[audio], time_limit=60)
+        audio.stream(fn=ReplyOnPause(response), inputs=[audio], outputs=[audio], time_limit=60)
 
 
 demo.launch(ssr_mode=False)
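
For reference, the ReplyOnPause pattern this commit adopts can be exercised on its own. The sketch below is illustrative, not code from this Space: it assumes gradio_webrtc's documented WebRTC/ReplyOnPause API, and the hypothetical echo handler stands in for the Space's response():

import gradio as gr
import numpy as np
from gradio_webrtc import ReplyOnPause, WebRTC


def echo(audio: tuple[int, np.ndarray]):
    # ReplyOnPause runs voice-activity detection itself: this generator is only
    # invoked once the caller stops talking, with the utterance as (rate, samples).
    sampling_rate, audio_np = audio
    # Echo the utterance straight back; a real handler would yield model audio.
    yield (sampling_rate, audio_np)


with gr.Blocks() as demo:
    # rtc_configuration omitted here; the Space passes Twilio TURN credentials.
    audio = WebRTC(mode="send-receive", modality="audio")
    audio.stream(fn=ReplyOnPause(echo), inputs=[audio], outputs=[audio], time_limit=60)

demo.launch()

Because ReplyOnPause owns pause detection and turn-taking state, the hand-rolled run_vad/AppState/StreamHandler machinery removed above becomes redundant.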
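
The numpy-to-WAV step inside the new response() also stands alone. A minimal sketch, assuming only numpy and pydub; the 24 kHz mono int16 format mirrors the OUT_* constants, and the sine tone is invented for illustration:

import io

import numpy as np
from pydub import AudioSegment

# One second of int16 PCM at 24 kHz (a 440 Hz tone as stand-in input).
samples = (np.sin(2 * np.pi * 440 * np.arange(24000) / 24000) * 32767).astype(np.int16)

# AudioSegment wraps raw PCM bytes; export() accepts a file-like object,
# so the WAV container is built entirely in memory.
buffer = io.BytesIO()
AudioSegment(
    samples.tobytes(),
    frame_rate=24000,
    sample_width=samples.dtype.itemsize,  # 2 bytes per int16 sample
    channels=1,
).export(buffer, format="wav")

wav_bytes = buffer.getvalue()  # ready to base64-encode and POST, as speaking() does

Writing to a BytesIO rather than a temp file keeps the round-trip in memory, which matters since every detected utterance is re-encoded before being sent upstream.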