salomonsky committed on
Commit
d2c7775
verified
1 Parent(s): f7f1518

Upload tts_utils.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tts_utils.py +468 -0
tts_utils.py ADDED
@@ -0,0 +1,468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pygame
2
+ import tempfile
3
+ import uuid
4
+ import threading
5
+ import asyncio
6
+ from pathlib import Path
7
+ from TTS.api import TTS
8
+ from gtts import gTTS
9
+ import edge_tts
10
+ import logging
11
+ import time
12
+ import os
13
+
14
class TTSUtils:
    """Spanish text-to-speech helper with layered fallbacks and pygame playback.

    Synthesis is attempted with the selected engine (local VITS or Microsoft
    Edge voices); when that fails the class falls back to Edge and finally to
    gTTS. Generated files live in a ``chatbot_audio`` folder inside the system
    temp directory and are pruned periodically.

    An optional voice detector (see ``set_voice_detector``) is told when
    playback starts and is switched to a high-threshold mode while audio plays.
    """

    AVAILABLE_MODELS = {
        'EDGE': {
            'name': "es-MX-JorgeNeural",
            'description': "Voz masculina de Microsoft Edge (MX)",
            'type': 'edge',
            'rate': '+25%'
        },
        'EDGE_ES': {
            'name': "es-ES-AlvaroNeural",
            'description': "Voz masculina de Microsoft Edge (ES)",
            'type': 'edge',
            'rate': '+25%'
        },
        'VITS': {
            'name': "tts_models/es/css10/vits",
            'description': "Voz masculina de VITS (ES)",
            'type': 'local',
            'config': {
                'speed': 1.25,
                'model_path': "tts_models/es/css10/vits"
            }
        }
    }

    # Lookup tables for spelling out 0-99 in Spanish.
    _UNITS = ('', 'uno', 'dos', 'tres', 'cuatro', 'cinco', 'seis', 'siete', 'ocho', 'nueve')
    _TEENS = ('diez', 'once', 'doce', 'trece', 'catorce', 'quince',
              'dieciséis', 'diecisiete', 'dieciocho', 'diecinueve')
    # 21-29 are written as a single word in Spanish, not "veinte y ...".
    _TWENTIES = ('veinte', 'veintiuno', 'veintidós', 'veintitrés', 'veinticuatro',
                 'veinticinco', 'veintiséis', 'veintisiete', 'veintiocho', 'veintinueve')
    _TENS = ('', 'diez', 'veinte', 'treinta', 'cuarenta', 'cincuenta',
             'sesenta', 'setenta', 'ochenta', 'noventa')

    # Applied in insertion order, so longer runs ('...') must precede shorter ones ('..').
    _REPLACEMENTS = {
        '*': '',
        '#': '',
        '`': '',
        '~': '',
        '|': '',
        '>': '',
        '<': '',
        '\\': '',
        '&': 'y',
        '_': ' ',
        '...': ',',
        '..': ',',
        '---': ',',
        '--': ',',
        '%': ' por ciento',
        '$': ' pesos',
        '=': ' igual a ',
        '+': ' más ',
        '@': ' arroba ',
    }

    # Punctuation that may trail a number at the end of a clause.
    _TRAILING_PUNCTUATION = '.,;:!?'

    _EDGE_ATTEMPTS = 3
    _EDGE_TIMEOUT_SECONDS = 15.0
    _GTTS_TEMPO = 1.25

    def __init__(self, model_name='EDGE', elevenlabs_api_key=None):
        """Prepare the temp directory, the pygame mixer and the selected engine.

        Args:
            model_name: Key of ``AVAILABLE_MODELS`` to start with.
            elevenlabs_api_key: Accepted for backward compatibility; this class
                does not use it.
        """
        self.is_speaking = False
        self.should_stop = False
        self.temp_dir = Path(tempfile.gettempdir()) / "chatbot_audio"
        self.temp_dir.mkdir(exist_ok=True)
        self.tts = None
        self.audio_initialized = False
        self.current_model = model_name
        print(f"Inicializando TTS con modelo: {model_name}")

        try:
            # Restart the mixer so the settings below are the ones in effect.
            if pygame.mixer.get_init():
                pygame.mixer.quit()
            pygame.mixer.init(frequency=16000, size=-16, channels=1, buffer=2048)
            pygame.mixer.music.set_volume(0.8)
            self.audio_initialized = True
            print("Audio inicializado correctamente")
        except Exception as e:
            print(f"Error inicializando audio: {str(e)}")
            self.audio_initialized = False

        self.play_lock = threading.Lock()
        self.clock = pygame.time.Clock()
        self.init_audio()

        # Drop stale files left behind by earlier runs.
        self._cleanup_old_files()

    def _cleanup_old_files(self, max_age_hours=1):
        """Delete files in the temp directory older than ``max_age_hours``.

        Best-effort: a file that cannot be inspected or removed is skipped.
        """
        cutoff = time.time() - max_age_hours * 3600
        try:
            for entry in self.temp_dir.glob("*"):
                try:
                    if entry.is_file() and entry.stat().st_mtime < cutoff:
                        entry.unlink()
                except OSError:
                    # The file may be in use or already gone; try the next one.
                    continue
        except Exception as e:
            print(f"Error limpiando archivos temporales: {e}")

    def _verify_audio_system(self):
        """Return True when the mixer is usable, re-initializing it if needed.

        NOTE(review): the recovery path opens the mixer at 44.1 kHz stereo with
        volume 1.0 while ``__init__`` uses 16 kHz mono with volume 0.8 --
        confirm the difference is intended.
        """
        if self.audio_initialized and pygame.mixer.get_init():
            return True
        try:
            if pygame.mixer.get_init():
                pygame.mixer.quit()
            pygame.mixer.init(frequency=44100, size=-16, channels=2, buffer=4096)
            pygame.mixer.music.set_volume(1.0)
            self.audio_initialized = True
            return True
        except Exception as e:
            print(f"Error reinicializando audio: {e}")
            return False

    def init_audio(self):
        """Load the engine for ``current_model``.

        Only VITS needs loading; any other model returns True immediately.
        When VITS cannot be loaded, ``current_model`` is switched to ``'EDGE'``
        and False is returned.
        """
        if self.current_model != 'VITS':
            return True

        model_info = self.AVAILABLE_MODELS['VITS']
        print(f"Cargando modelo VITS: {model_info['name']}")
        try:
            self.tts = TTS(model_name=model_info['name'])
            synthesizer = getattr(self.tts, 'synthesizer', None)
            if synthesizer is not None and hasattr(synthesizer, 'tts_config'):
                synthesizer.tts_config.update(model_info['config'])
                print("Modelo VITS cargado correctamente")
                return True
            print("Error: El modelo VITS no tiene la estructura esperada")
        except Exception as vits_error:
            print(f"Error cargando modelo VITS: {vits_error}")

        self.current_model = 'EDGE'  # Fall back to Edge when VITS is unusable.
        return False

    def _number_to_words(self, number):
        """Spell an integer between -99 and 99 in Spanish.

        Larger magnitudes are returned as a digit string so the speech engine
        reads them itself. Input that cannot be converted with ``int`` is
        returned unchanged.
        """
        try:
            num = int(number)
        except (TypeError, ValueError, OverflowError):
            return number

        if num < 0:
            return f"menos {self._number_to_words(abs(num))}"
        if num == 0:
            return 'cero'
        if num < 10:
            return self._UNITS[num]
        if num < 20:
            return self._TEENS[num - 10]
        if num < 30:
            return self._TWENTIES[num - 20]
        if num < 100:
            tens, units = divmod(num, 10)
            if units == 0:
                return self._TENS[tens]
            return f"{self._TENS[tens]} y {self._UNITS[units]}"
        return str(num)

    def _split_trailing_punctuation(self, token):
        """Split ``token`` into (core, trailing punctuation)."""
        core = token.rstrip(self._TRAILING_PUNCTUATION)
        return core, token[len(core):]

    def _move_currency_suffix(self, token):
        """Rewrite ``$<number>`` as ``<number> pesos`` so it reads in spoken order."""
        core, tail = self._split_trailing_punctuation(token)
        if len(core) > 1 and core[0] == '$' and core[1].isdigit():
            return f"{core[1:]} pesos{tail}"
        return token

    def _fraction_to_words(self, digits):
        """Spell the digits after a decimal point, keeping leading zeros as 'cero'."""
        significant = digits.lstrip('0')
        words = ['cero'] * (len(digits) - len(significant))
        if significant:
            words.append(str(self._number_to_words(significant)))
        return ' '.join(words)

    def _spell_number_token(self, token):
        """Return ``token`` with a numeric core spelled out; other tokens pass through."""
        core, tail = self._split_trailing_punctuation(token)
        if not core.replace('.', '').replace('-', '').isdigit():
            return token
        if '.' not in core:
            return f"{self._number_to_words(core)}{tail}"

        parts = core.split('.')
        if len(parts) != 2:
            # Dotted sequences such as version numbers are left untouched.
            return token
        integer_digits, fraction_digits = parts
        integer_words = self._number_to_words(integer_digits) if integer_digits else 'cero'
        return f"{integer_words} punto {self._fraction_to_words(fraction_digits)}{tail}"

    def _clean_text(self, text):
        """Normalize text for synthesis.

        Strips markup characters, verbalizes a few symbols, spells out small
        numbers and collapses whitespace. Punctuation that trails a number is
        preserved instead of being read as a decimal point. Falsy input is
        returned unchanged.
        """
        if not text:
            return text

        # Currency is reordered first, while the '$' sign is still attached.
        cleaned_text = ' '.join(self._move_currency_suffix(token) for token in text.split())

        for needle, replacement in self._REPLACEMENTS.items():
            cleaned_text = cleaned_text.replace(needle, replacement)

        words = [self._spell_number_token(token) for token in cleaned_text.split()]

        # Spelled-out tokens may contain spaces; re-split to collapse any runs.
        return ' '.join(' '.join(words).split())

    def _current_model_info(self):
        """Return the ``AVAILABLE_MODELS`` entry for the current model, or {}."""
        return self.AVAILABLE_MODELS.get(self.current_model, {})

    def _edge_voice_and_rate(self):
        """Return (voice, rate) for Edge synthesis.

        Uses the current model when it is an Edge voice, otherwise the default
        ``'EDGE'`` entry, so the lookup is valid when Edge is only a fallback.
        """
        info = self._current_model_info()
        if info.get('type') != 'edge':
            info = self.AVAILABLE_MODELS['EDGE']
        return info['name'], info['rate']

    @staticmethod
    def _is_valid_audio(path):
        """Return True when ``path`` exists and is not empty."""
        return os.path.exists(path) and os.path.getsize(path) > 0

    def _synthesize_with_vits(self, text, temp_file):
        """Synthesize with VITS, handing over to Edge on any failure."""
        print("Usando modelo VITS")
        if self.tts is None:
            print("Inicializando modelo VITS...")
            if not self.init_audio():
                print("Fallback a Edge debido a error en inicialización de VITS")
                return self.fallback_to_edge(text, temp_file)
        try:
            self.tts.tts_to_file(
                text=text,
                file_path=temp_file,
                speed=self.AVAILABLE_MODELS['VITS']['config']['speed']
            )
            if self._is_valid_audio(temp_file):
                print(f"Audio generado correctamente con VITS: {os.path.getsize(temp_file)} bytes")
                return temp_file
            raise Exception("Archivo de audio VITS inválido")
        except Exception as vits_error:
            print(f"Error generando audio con VITS: {vits_error}")
            return self.fallback_to_edge(text, temp_file)

    def text_to_speech(self, text, save_path=None):
        """Synthesize ``text`` to an audio file.

        Args:
            text: Text to speak; it is cleaned with ``_clean_text`` first.
            save_path: Destination file. When omitted a uniquely named ``.mp3``
                is created in the temp directory.

        Returns:
            The path of the generated file, or None when the text is empty,
            the audio system is unavailable or every engine failed.
        """
        if not text:
            return None

        text = self._clean_text(text)
        if not text:
            return None

        if not self._verify_audio_system():
            print("Sistema de audio no disponible")
            return None

        try:
            temp_file = save_path or str(self.temp_dir / f"{uuid.uuid4()}.mp3")
            print(f"Generando audio para modelo: {self.current_model}")

            try:
                if self.current_model == 'VITS':
                    return self._synthesize_with_vits(text, temp_file)
                if self._current_model_info().get('type') == 'edge':
                    return self.fallback_to_edge(text, temp_file)
                # No dedicated engine for this model name: use gTTS rather than
                # returning a path to a file that was never written.
                return self.fallback_to_gtts(text, temp_file)
            except Exception as primary_error:
                print(f"Error con el modelo primario {self.current_model}: {primary_error}")
                return self.fallback_to_gtts(text, temp_file)
        except Exception as e:
            print(f"Error en text_to_speech: {e}")
            return None
        finally:
            self._cleanup_old_files()

    def fallback_to_edge(self, text, temp_file):
        """Synthesize with Edge TTS, retrying with backoff, then fall back to gTTS.

        The voice is the current model's when it is an Edge model, otherwise
        the default ``'EDGE'`` voice.

        Returns:
            ``temp_file`` on success, otherwise whatever ``fallback_to_gtts``
            returns.
        """
        try:
            voice, _ = self._edge_voice_and_rate()
            print(f"Usando voz Edge como respaldo: {voice}")

            async def tts_with_timeout():
                return await asyncio.wait_for(
                    self.edge_tts_speak(text, voice, temp_file),
                    timeout=self._EDGE_TIMEOUT_SECONDS
                )

            last_attempt = self._EDGE_ATTEMPTS - 1
            for attempt in range(self._EDGE_ATTEMPTS):
                try:
                    asyncio.run(tts_with_timeout())
                    if self._is_valid_audio(temp_file):
                        print(f"Audio generado correctamente con Edge: {os.path.getsize(temp_file)} bytes")
                        return temp_file
                    raise Exception("Archivo de audio Edge inválido")
                except Exception as e:
                    print(f"Intento {attempt + 1} fallido con Edge: {e}")
                    if attempt == last_attempt:
                        return self.fallback_to_gtts(text, temp_file)
                    time.sleep(2 ** attempt)  # Exponential backoff between attempts.
        except Exception as edge_error:
            print(f"Error con Edge TTS: {edge_error}")
            return self.fallback_to_gtts(text, temp_file)

    def fallback_to_gtts(self, text, temp_file):
        """Last-resort synthesis with gTTS, sped up with ffmpeg when possible.

        If the tempo change cannot be applied, the unmodified gTTS audio is
        used instead of giving up.

        Returns:
            ``temp_file`` on success, None on failure.
        """
        print("Usando gTTS como último respaldo")
        temp_normal = str(self.temp_dir / f"temp_normal_{uuid.uuid4()}.mp3")
        try:
            gTTS(text=text, lang='es', slow=False).save(temp_normal)

            try:
                import ffmpeg
                # atempo is an audio *filter*; as an output keyword it would be
                # emitted as a command-line option instead.
                stream = ffmpeg.input(temp_normal).audio.filter('atempo', self._GTTS_TEMPO)
                stream = ffmpeg.output(stream, str(temp_file), acodec='libmp3lame')
                ffmpeg.run(stream, overwrite_output=True, capture_stdout=True, capture_stderr=True)
            except Exception as speed_error:
                print(f"No se pudo acelerar el audio de gTTS, usando velocidad normal: {speed_error}")
                Path(temp_file).write_bytes(Path(temp_normal).read_bytes())

            if self._is_valid_audio(temp_file):
                print(f"Audio generado correctamente con gTTS: {os.path.getsize(temp_file)} bytes")
                return temp_file
            raise Exception("Archivo de audio gTTS inválido")
        except Exception as gtts_error:
            print(f"Error con gTTS: {gtts_error}")
            return None
        finally:
            # Remove the intermediate file on success and on failure.
            try:
                os.remove(temp_normal)
            except OSError:
                pass

    async def edge_tts_speak(self, text, voice, output_file):
        """Write ``text`` spoken by ``voice`` to ``output_file`` using edge-tts.

        Returns:
            True on success.

        Raises:
            Exception: Any edge-tts error is logged and re-raised.
        """
        try:
            print(f"Generando audio con voz: {voice}")
            _, rate = self._edge_voice_and_rate()
            print(f"Usando rate: {rate}")
            communicate = edge_tts.Communicate(text, voice, rate=rate)
            await communicate.save(str(output_file))
            print(f"Audio generado correctamente con {voice}")
            return True
        except Exception as e:
            print(f"Error generando audio con edge-tts: {e}")
            raise

    def stop_speaking(self):
        """Stop and unload the current playback; does nothing when idle."""
        if not self.is_speaking:
            return
        try:
            self.should_stop = True
            pygame.mixer.music.stop()
            pygame.mixer.music.unload()
            print("Reproducción detenida por interrupción")
        except Exception as e:
            print(f"Error al detener el audio: {e}")
        finally:
            self.is_speaking = False
            self.should_stop = False

    def change_model(self, model_name):
        """Switch to another entry of ``AVAILABLE_MODELS``.

        Returns:
            True when the requested model is active afterwards; False when the
            name is unknown or loading failed (``init_audio`` then leaves
            ``'EDGE'`` selected).
        """
        if model_name not in self.AVAILABLE_MODELS:
            print(f"Modelo {model_name} no disponible")
            return False

        try:
            print(f"Cambiando a modelo {model_name}...")
            self.current_model = model_name
            return bool(self.init_audio())
        except Exception as e:
            print(f"Error cambiando modelo: {e}")
            return False

    def is_currently_speaking(self):
        """Return True while audio is being played."""
        return self.is_speaking

    def create_audio_file(self, text, output_file):
        """Synthesize ``text`` into ``output_file`` with the current model.

        Unlike ``text_to_speech``, the text is used as given and no fallback
        engines are tried.

        Returns:
            ``str(output_file)`` on success, None on failure.
        """
        try:
            if 'EDGE' in self.current_model:
                voice = self.AVAILABLE_MODELS[self.current_model]['name']

                async def tts_with_timeout():
                    return await asyncio.wait_for(
                        self.edge_tts_speak(text, voice, output_file),
                        timeout=5.0
                    )
                asyncio.run(tts_with_timeout())
            elif self.current_model == 'gTTS':
                gTTS(text=text, lang='es', slow=False).save(str(output_file))
            else:  # VITS
                if self.tts is None:
                    self.init_audio()
                if self.tts is None:
                    raise RuntimeError("Modelo VITS no inicializado")
                self.tts.tts_to_file(
                    text=text,
                    file_path=str(output_file),
                    speaker_wav=None,
                    split_sentences=False
                )
            return str(output_file)
        except Exception as e:
            print(f"Error creando archivo de audio: {e}")
            return None

    def _restore_detector_threshold(self):
        """Put the voice detector, if any, back into normal-threshold mode."""
        detector = getattr(self, 'voice_detector', None)
        if detector is None:
            return
        try:
            detector.set_high_threshold_mode(False)
        except Exception as e:
            print(f"Error restaurando umbral del detector de voz: {e}")

    def play_audio(self, file_path):
        """Play ``file_path`` and block until it finishes or is interrupted.

        Completion is detected by polling the mixer rather than the pygame
        event queue, so playback neither requires the display/event subsystem
        nor consumes events belonging to the host application. The voice
        detector is restored to its normal threshold on every exit path.

        Raises:
            FileNotFoundError: ``file_path`` does not exist.
            ValueError: The file is empty.
            Exception: The audio system is unavailable, or playback failed.
        """
        if not self._verify_audio_system():
            raise Exception("Sistema de audio no disponible")

        try:
            audio_path = Path(file_path)
            if not audio_path.exists():
                raise FileNotFoundError(f"Archivo no encontrado: {file_path}")
            if audio_path.stat().st_size <= 0:
                raise ValueError("Archivo de audio vacío o corrupto")

            with self.play_lock:
                if self.is_speaking:
                    self.stop_speaking()

                self.is_speaking = True
                pygame.mixer.music.load(file_path)
                pygame.mixer.music.play()

                # Let the detector know output started and raise its threshold
                # so it keeps listening while audio plays.
                detector = getattr(self, 'voice_detector', None)
                if detector is not None:
                    detector.update_last_audio_output()
                    detector.set_high_threshold_mode(True)

                while pygame.mixer.music.get_busy() and not self.should_stop:
                    self.clock.tick(30)

                if self.should_stop:
                    self.stop_speaking()
        except Exception as e:
            print(f"Error reproduciendo audio: {e}")
            # Force a mixer re-initialization on the next call.
            self.audio_initialized = False
            raise
        finally:
            self.is_speaking = False
            self.should_stop = False
            self._restore_detector_threshold()

    def set_voice_detector(self, voice_detector):
        """Attach the voice detector used to coordinate interruptions."""
        self.voice_detector = voice_detector

    def __del__(self):
        """Shut the mixer down and remove the temp directory (best-effort)."""
        try:
            pygame.mixer.quit()
            if self.temp_dir.exists():
                for file in self.temp_dir.glob("*"):
                    try:
                        file.unlink()
                    except OSError:
                        pass
                self.temp_dir.rmdir()
        except Exception:
            # Finalizers must never raise; modules may already be torn down.
            pass