salomonsky committed on
Commit
504ccf4
verified
1 Parent(s): d2c7775

Upload vad.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. vad.py +195 -0
vad.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import speech_recognition as sr
2
+ import threading
3
+ import time
4
+ import pygame
5
+ from response_handler import ResponseHandler
6
+
7
class VoiceDetector:
    """Background microphone listener with wake-phrase gating and barge-in.

    A daemon thread (see ``start``) repeatedly captures a phrase from the
    microphone and then does one of two things:

    * While ``audio_utils.is_speaking`` is true, the captured audio is only
      inspected as a possible interruption of the playback.
    * Otherwise the audio is transcribed with the Google recognizer
      (``es-ES``) and routed either to the activation check or to the
      ``on_speech`` callback.

    Text recently registered through ``update_last_audio_output`` is used to
    filter out the system hearing itself (see ``is_echo``).
    """

    def __init__(self, on_activation=None, on_speech=None, on_timeout=None):
        """Configure the recognizer and store the callbacks.

        Args:
            on_activation: Called with no arguments when
                ``ResponseHandler.is_activation_phrase`` accepts a phrase
                while the detector is waiting for activation.
            on_speech: Called with the lower-cased transcription of every
                non-echo phrase heard after activation.
            on_timeout: Stored on the instance; it is not invoked anywhere
                in this class.
        """
        self.recognizer = sr.Recognizer()
        self.is_active = True
        self.is_listening = True
        # NOTE(review): last_interaction / TIMEOUT_SECONDS are initialised
        # here but never read in this class -- presumably consumed by the
        # owner of the detector; confirm against callers.
        self.last_interaction = time.time()
        self.TIMEOUT_SECONDS = 20
        self.clock = pygame.time.Clock()
        self.waiting_for_activation = True
        # Expected to expose ``is_speaking`` and ``stop_speaking()``;
        # injected later through set_audio_utils().
        self.audio_utils = None
        # NOTE(review): last_interrupt_time is written on every interruption,
        # but INTERRUPT_COOLDOWN is never compared against it in this class.
        self.last_interrupt_time = 0
        self.INTERRUPT_COOLDOWN = 1.0

        # Threshold configuration
        self.BASE_ENERGY_THRESHOLD = 300
        self.HIGH_ENERGY_THRESHOLD = 600
        self.current_energy_threshold = self.BASE_ENERGY_THRESHOLD

        # Recognizer configuration
        self.recognizer.energy_threshold = self.current_energy_threshold
        self.recognizer.dynamic_energy_threshold = True
        self.recognizer.dynamic_energy_adjustment_damping = 0.15
        self.recognizer.dynamic_energy_ratio = 1.5
        self.recognizer.pause_threshold = 0.8
        self.recognizer.non_speaking_duration = 0.5
        self.recognizer.phrase_threshold = 0.3

        # Interruption thresholds. Only INTERRUPT_ENERGY_MULTIPLIER is read
        # by this class (in check_for_interruption); the other three are kept
        # as public attributes.
        self.INTERRUPT_ENERGY_MULTIPLIER = 2.0
        self.INTERRUPT_DURATION = 0.3
        self.INTERRUPT_SAMPLES = 3
        self.INTERRUPT_SUCCESS_THRESHOLD = 2

        self.on_activation = on_activation
        self.on_speech = on_speech
        self.on_timeout = on_timeout

        # Echo and self-activation control
        self.last_audio_output_time = 0
        self.AUDIO_OUTPUT_COOLDOWN = 0.3  # Reduced to 0.3 seconds
        self.is_high_threshold_mode = False

        # Rolling buffer of recently played texts, used for echo detection
        self.audio_buffer = []
        self.BUFFER_SIZE = 5
        self.last_played_audio = None

    def set_audio_utils(self, audio_utils):
        """Attach the playback helper queried for ``is_speaking``."""
        self.audio_utils = audio_utils

    def set_high_threshold_mode(self, enabled):
        """Enable or disable the high-threshold mode used while audio plays.

        Switches ``current_energy_threshold`` between
        ``HIGH_ENERGY_THRESHOLD`` and ``BASE_ENERGY_THRESHOLD`` and pushes
        the new value into the recognizer.
        """
        self.is_high_threshold_mode = enabled
        if enabled:
            self.current_energy_threshold = self.HIGH_ENERGY_THRESHOLD
        else:
            self.current_energy_threshold = self.BASE_ENERGY_THRESHOLD
        self.recognizer.energy_threshold = self.current_energy_threshold
        print(f"Umbral de energía ajustado a: {self.current_energy_threshold}")

    def start(self):
        """Raise the run flags and launch the listener on a daemon thread."""
        self.is_active = True
        self.is_listening = True
        threading.Thread(target=self.listen_continuously, daemon=True).start()

    def stop(self):
        """Lower the run flags; the listener loop exits on its next check."""
        self.is_active = False
        self.is_listening = False

    def listen_continuously(self):
        """Capture phrases until ``stop`` is called and dispatch each one.

        Each iteration opens the microphone, optionally recalibrates for
        ambient noise, waits up to 1 s for speech to start (phrases are
        capped at 5 s) and hands the captured audio to the playback or the
        normal handler. Any unexpected error is printed and followed by a
        1 s back-off so a broken device does not spin the loop.
        """
        while self.is_active and self.is_listening:
            try:
                # NOTE(review): the microphone is reopened on every
                # iteration; kept as-is to preserve the existing behaviour.
                with sr.Microphone() as source:
                    # Recalibrate for ambient noise only outside
                    # high-threshold mode, so the raised threshold is not
                    # overwritten during playback.
                    if not self.is_high_threshold_mode:
                        self.recognizer.adjust_for_ambient_noise(source, duration=0.2)

                    try:
                        audio = self.recognizer.listen(
                            source,
                            timeout=1,
                            phrase_time_limit=5,
                        )

                        # stop() may have been called while blocked in listen()
                        if not self.is_active or not self.is_listening:
                            break

                        # While audio is playing, captured sound is only
                        # considered as a possible interruption.
                        if self.audio_utils and self.audio_utils.is_speaking:
                            self._handle_audio_during_playback(audio)
                            continue

                        # Normal (non-interruption) processing
                        self._handle_user_speech(audio)

                    except sr.WaitTimeoutError:
                        # Nobody started speaking within the timeout
                        continue
                    except sr.UnknownValueError:
                        # Speech could not be transcribed
                        continue

            except Exception as e:
                print(f"Error en reconocimiento continuo: {e}")
                time.sleep(1)

            self.clock.tick(30)

    def _handle_audio_during_playback(self, audio):
        """Stop playback if ``audio`` is a valid interruption of it."""
        current_time = time.time()

        # Ignore anything captured right after our own output (echo cooldown)
        if current_time - self.last_audio_output_time < self.AUDIO_OUTPUT_COOLDOWN:
            return

        # Require sustained high energy before spending a recognition call
        if not self.check_for_interruption(audio.frame_data):
            return

        try:
            # Try to recognise an explicit stop command
            text = self.recognizer.recognize_google(
                audio,
                language="es-ES",
            ).lower()

            # Make sure it is not an echo of what is being played
            if not self.is_echo(text):
                if ResponseHandler.is_stop_command(text):
                    print(f"Comando de interrupción detectado: {text}")
                    self.audio_utils.stop_speaking()
                    self.last_interrupt_time = current_time
        except sr.UnknownValueError:
            # No text recognised but the energy was high: in high-threshold
            # mode that alone is treated as an interruption.
            if self.is_high_threshold_mode:
                self.audio_utils.stop_speaking()
                self.last_interrupt_time = current_time

    def _handle_user_speech(self, audio):
        """Transcribe ``audio`` and route it to activation or ``on_speech``."""
        # Playback may have started since the caller's check; drop the audio
        # in that case rather than transcribing our own output.
        if self.audio_utils and self.audio_utils.is_speaking:
            return

        text = self.recognizer.recognize_google(
            audio,
            language="es-ES",
        ).lower()

        # Make sure it is not an echo
        if self.is_echo(text):
            return

        if self.waiting_for_activation:
            if ResponseHandler.is_activation_phrase(text):
                self.waiting_for_activation = False
                if self.on_activation:
                    self.on_activation()
        elif self.on_speech:
            self.on_speech(text)

    def is_echo(self, text):
        """Return True if ``text`` looks like an echo of recently played audio.

        ``text`` is compared case-insensitively against every entry of
        ``audio_buffer``; a match is either string containing the other.
        Blank buffer entries are skipped, because an empty or whitespace-only
        string is a substring of almost any phrase and would flag everything
        as echo.
        """
        heard = text.lower()
        for recent_audio in self.audio_buffer:
            played = recent_audio.lower()
            if not played.strip():
                continue
            if heard in played or played in heard:
                print("Eco detectado y filtrado")
                return True
        return False

    def update_last_audio_output(self, text=None):
        """Record that audio was just played and remember its text.

        Always refreshes ``last_audio_output_time``. When ``text`` is
        non-empty it is appended to ``audio_buffer``, which is then trimmed
        from the front to at most ``BUFFER_SIZE`` entries.
        """
        self.last_audio_output_time = time.time()
        if text:
            self.audio_buffer.append(text)
            excess = len(self.audio_buffer) - self.BUFFER_SIZE
            if excess > 0:
                # Drop the oldest entries in one step, even if the buffer
                # was already over the limit.
                del self.audio_buffer[:excess]

    def check_for_interruption(self, audio_data):
        """Return True if ``audio_data`` is loud enough to be an interruption.

        The bytes are read as little-endian signed 16-bit samples and split
        into 500-byte windows. A window counts as "high energy" when its
        mean absolute amplitude exceeds ``current_energy_threshold *
        INTERRUPT_ENERGY_MULTIPLIER``; at least 70% of the windows must do
        so. Buffers shorter than 1000 bytes (or empty/None) are rejected.
        """
        if not audio_data or len(audio_data) < 1000:
            return False

        window_size = 500
        required_ratio = 0.7

        # Only whole 16-bit samples are meaningful: ignore a dangling byte
        # instead of decoding it as an 8-bit sample.
        usable = len(audio_data) - (len(audio_data) % 2)

        # Mean absolute amplitude per window
        energies = []
        for start in range(0, usable, window_size):
            window = audio_data[start:min(start + window_size, usable)]
            amplitude_sum = sum(
                abs(int.from_bytes(window[i:i + 2], 'little', signed=True))
                for i in range(0, len(window), 2)
            )
            energies.append(amplitude_sum / (len(window) // 2))

        if not energies:
            return False

        # Use the threshold of the current mode
        threshold = self.current_energy_threshold * self.INTERRUPT_ENERGY_MULTIPLIER
        high_energy_windows = sum(1 for e in energies if e > threshold)

        return high_energy_windows >= len(energies) * required_ratio

    def is_speaking_check(self):
        """Return True while the attached ``audio_utils`` reports playback.

        Always returns a bool, including False when no ``audio_utils`` has
        been attached.
        """
        return bool(self.audio_utils and self.audio_utils.is_speaking)