Spaces:

salomonsky
/

chatbot-web-app

Runtime error

App Files Files Community

salomonsky committed 29 days ago

Commit

d2c7775

verified ·

1 Parent(s): f7f1518

Upload tts_utils.py with huggingface_hub

Browse files

Files changed (1) hide show

tts_utils.py +468 -0

tts_utils.py ADDED Viewed

	@@ -0,0 +1,468 @@

+import pygame
+import tempfile
+import uuid
+import threading
+import asyncio
+from pathlib import Path
+from TTS.api import TTS
+from gtts import gTTS
+import edge_tts
+import logging
+import time
+import os
+class TTSUtils:
+    """Text-to-speech helper that synthesises Spanish audio files and plays them.
+
+    Audio is generated with edge-tts, a local Coqui TTS (VITS) model or gTTS,
+    and played back through the pygame mixer.
+    """
+    # Catalogue of selectable voices, keyed by the identifier stored in
+    # ``current_model``. Per-entry keys:
+    #   name        - voice / model identifier handed to the backend
+    #   description - human-readable label (Spanish, user facing)
+    #   type        - 'edge' for Microsoft Edge voices, 'local' for the VITS model
+    #   rate        - speaking-rate string read by edge_tts_speak (edge entries only)
+    #   config      - settings merged into the synthesizer config by init_audio;
+    #                 'speed' is also passed to tts_to_file (local entries only)
+    AVAILABLE_MODELS = {
+        'EDGE': {
+            'name': "es-MX-JorgeNeural",
+            'description': "Voz masculina de Microsoft Edge (MX)",
+            'type': 'edge',
+            'rate': '+25%'
+        },
+        'EDGE_ES': {
+            'name': "es-ES-AlvaroNeural",
+            'description': "Voz masculina de Microsoft Edge (ES)",
+            'type': 'edge',
+            'rate': '+25%'
+        },
+        'VITS': {
+            'name': "tts_models/es/css10/vits",
+            'description': "Voz masculina de VITS (ES)",
+            'type': 'local',
+            'config': {
+                'speed': 1.25,
+                'model_path': "tts_models/es/css10/vits"
+            }
+        }
+    }
+    def __init__(self, model_name='EDGE', elevenlabs_api_key=None):
+        """Inicializa el motor TTS"""
+        self.is_speaking = False
+        self.should_stop = False
+        self.temp_dir = Path(tempfile.gettempdir()) / "chatbot_audio"
+        self.temp_dir.mkdir(exist_ok=True)
+        self.tts = None
+        self.audio_initialized = False
+        self.current_model = model_name
+        print(f"Inicializando TTS con modelo: {model_name}")
+        try:
+            if pygame.mixer.get_init():
+                pygame.mixer.quit()
+            pygame.mixer.init(frequency=16000, size=-16, channels=1, buffer=2048)
+            pygame.mixer.music.set_volume(0.8)
+            self.audio_initialized = True
+            print("Audio inicializado correctamente")
+        except Exception as e:
+            print(f"Error inicializando audio: {str(e)}")
+            self.audio_initialized = False
+        self.play_lock = threading.Lock()
+        self.clock = pygame.time.Clock()
+        self.init_audio()
+        # Limpiar archivos temporales antiguos
+        self._cleanup_old_files()
+    def _cleanup_old_files(self, max_age_hours=1):
+        """Limpia archivos temporales antiguos"""
+        try:
+            current_time = time.time()
+            for file in self.temp_dir.glob("*"):
+                if file.is_file():
+                    file_age = current_time - file.stat().st_mtime
+                    if file_age > max_age_hours * 3600:  # Convertir horas a segundos
+                        try:
+                            file.unlink()
+                        except:
+                            pass
+        except Exception as e:
+            print(f"Error limpiando archivos temporales: {e}")
+    def _verify_audio_system(self):
+        """Verifica el estado del sistema de audio"""
+        if not self.audio_initialized or not pygame.mixer.get_init():
+            try:
+                if pygame.mixer.get_init():
+                    pygame.mixer.quit()
+                pygame.mixer.init(frequency=44100, size=-16, channels=2, buffer=4096)
+                pygame.mixer.music.set_volume(1.0)
+                self.audio_initialized = True
+                return True
+            except Exception as e:
+                print(f"Error reinicializando audio: {e}")
+                return False
+        return True
+    def init_audio(self):
+        """Inicializa el modelo TTS seleccionado"""
+        try:
+            if self.current_model == 'VITS':
+                model_info = self.AVAILABLE_MODELS[self.current_model]
+                print(f"Cargando modelo VITS: {model_info['name']}")
+                try:
+                    self.tts = TTS(model_name=model_info['name'])
+                    if hasattr(self.tts, 'synthesizer') and hasattr(self.tts.synthesizer, 'tts_config'):
+                        self.tts.synthesizer.tts_config.update(model_info['config'])
+                        print("Modelo VITS cargado correctamente")
+                        return True
+                    else:
+                        print("Error: El modelo VITS no tiene la estructura esperada")
+                        self.current_model = 'EDGE'  # Fallback a Edge si hay error
+                        return False
+                except Exception as vits_error:
+                    print(f"Error cargando modelo VITS: {vits_error}")
+                    self.current_model = 'EDGE'  # Fallback a Edge si hay error
+                    return False
+            return True
+        except Exception as e:
+            print(f"Error inicializando audio: {e}")
+            self.current_model = 'EDGE'  # Fallback a Edge si hay error
+            return False
+    def _number_to_words(self, number):
+        """Convierte un número a palabras en español"""
+        UNITS = ['', 'uno', 'dos', 'tres', 'cuatro', 'cinco', 'seis', 'siete', 'ocho', 'nueve']
+        TENS = ['', 'diez', 'veinte', 'treinta', 'cuarenta', 'cincuenta', 'sesenta', 'setenta', 'ochenta', 'noventa']
+        TEENS = ['diez', 'once', 'doce', 'trece', 'catorce', 'quince', 'dieciséis', 'diecisiete', 'dieciocho', 'diecinueve']
+        try:
+            num = int(number)
+            if num == 0:
+                return 'cero'
+            elif num < 0:
+                return f"menos {self._number_to_words(abs(num))}"
+            elif num < 10:
+                return UNITS[num]
+            elif num < 20:
+                return TEENS[num - 10]
+            elif num < 100:
+                tens = num // 10
+                units = num % 10
+                if units == 0:
+                    return TENS[tens]
+                else:
+                    return f"{TENS[tens]} y {UNITS[units]}"
+            else:
+                return str(num)  # Para números mayores a 99, mantener dígitos
+        except:
+            return number  # Si hay error, devolver el número original
+    def _clean_text(self, text):
+        """Limpia el texto de caracteres especiales antes de la síntesis"""
+        if not text:
+            return text
+        # Reemplazar asteriscos y otros caracteres especiales
+        replacements = {
+            '*': '',
+            '#': '',
+            '`': '',
+            '~': '',
+            '|': '',
+            '>': '',
+            '<': '',
+            '\\': '',
+            '&': 'y',
+            '_': ' ',
+            '...': ',',
+            '..': ',',
+            '---': ',',
+            '--': ',',
+            '%': ' por ciento',
+            '$': ' pesos',
+            '=': ' igual a ',
+            '+': ' más ',
+            '@': ' arroba ',
+        }
+        cleaned_text = text
+        for char, replacement in replacements.items():
+            cleaned_text = cleaned_text.replace(char, replacement)
+        # Convertir números a palabras
+        words = []
+        for word in cleaned_text.split():
+            # Verificar si es un número (entero o decimal)
+            if word.replace('.', '').replace('-', '').isdigit():
+                # Si es decimal
+                if '.' in word:
+                    parts = word.split('.')
+                    if len(parts) == 2:
+                        integer_part = self._number_to_words(parts[0])
+                        decimal_part = self._number_to_words(parts[1])
+                        words.append(f"{integer_part} punto {decimal_part}")
+                    else:
+                        words.append(word)
+                else:
+                    words.append(self._number_to_words(word))
+            else:
+                words.append(word)
+        cleaned_text = ' '.join(words)
+        # Eliminar espacios múltiples
+        cleaned_text = ' '.join(cleaned_text.split())
+        return cleaned_text
+    def text_to_speech(self, text, save_path=None):
+        """Genera audio a partir de texto con verificaciones mejoradas"""
+        if not text:
+            return None
+        # Limpiar el texto antes de procesarlo
+        text = self._clean_text(text)
+        if not text:
+            return None
+        if not self._verify_audio_system():
+            print("Sistema de audio no disponible")
+            return None
+        try:
+            temp_file = save_path or str(self.temp_dir / f"{uuid.uuid4()}.mp3")
+            print(f"Generando audio para modelo: {self.current_model}")
+            # Verificar modelo actual y generar audio
+            try:
+                if self.current_model == 'VITS':
+                    print("Usando modelo VITS")
+                    if not self.tts:
+                        print("Inicializando modelo VITS...")
+                        if not self.init_audio():
+                            print("Fallback a Edge debido a error en inicialización de VITS")
+                            return self.fallback_to_edge(text, temp_file)
+                    try:
+                        self.tts.tts_to_file(
+                            text=text,
+                            file_path=temp_file,
+                            speed=self.AVAILABLE_MODELS['VITS']['config']['speed']
+                        )
+                        if os.path.exists(temp_file) and os.path.getsize(temp_file) > 0:
+                            print(f"Audio generado correctamente con VITS: {os.path.getsize(temp_file)} bytes")
+                            return temp_file
+                        raise Exception("Archivo de audio VITS inválido")
+                    except Exception as vits_error:
+                        print(f"Error generando audio con VITS: {vits_error}")
+                        return self.fallback_to_edge(text, temp_file)
+                elif self.current_model in ['EDGE', 'EDGE_ES']:
+                    return self.fallback_to_edge(text, temp_file)
+            except Exception as primary_error:
+                print(f"Error con el modelo primario {self.current_model}: {primary_error}")
+                return self.fallback_to_gtts(text, temp_file)
+            return temp_file
+        except Exception as e:
+            print(f"Error en text_to_speech: {e}")
+            return None
+        finally:
+            self._cleanup_old_files()
+    def fallback_to_edge(self, text, temp_file):
+        """Método de respaldo usando Edge TTS"""
+        try:
+            voice = self.AVAILABLE_MODELS['EDGE']['name']
+            print(f"Usando voz Edge como respaldo: {voice}")
+            for attempt in range(3):
+                try:
+                    async def tts_with_timeout():
+                        return await asyncio.wait_for(
+                            self.edge_tts_speak(text, voice, temp_file),
+                            timeout=15.0
+                        )
+                    asyncio.run(tts_with_timeout())
+                    if os.path.exists(temp_file) and os.path.getsize(temp_file) > 0:
+                        print(f"Audio generado correctamente con Edge: {os.path.getsize(temp_file)} bytes")
+                        return temp_file
+                    raise Exception("Archivo de audio Edge inválido")
+                except Exception as e:
+                    print(f"Intento {attempt + 1} fallido con Edge: {e}")
+                    if attempt == 2:
+                        return self.fallback_to_gtts(text, temp_file)
+                    time.sleep(2 ** attempt)
+        except Exception as edge_error:
+            print(f"Error con Edge TTS: {edge_error}")
+            return self.fallback_to_gtts(text, temp_file)
+    def fallback_to_gtts(self, text, temp_file):
+        """Método final de respaldo usando gTTS"""
+        print("Usando gTTS como último respaldo")
+        try:
+            tts = gTTS(text=text, lang='es', slow=False)
+            temp_normal = str(self.temp_dir / f"temp_normal_{uuid.uuid4()}.mp3")
+            tts.save(temp_normal)
+            import ffmpeg
+            stream = ffmpeg.input(temp_normal)
+            stream = ffmpeg.output(stream, temp_file, acodec='libmp3lame', atempo=1.25)
+            ffmpeg.run(stream, overwrite_output=True, capture_stdout=True, capture_stderr=True)
+            try:
+                os.remove(temp_normal)
+            except:
+                pass
+            if os.path.exists(temp_file) and os.path.getsize(temp_file) > 0:
+                print(f"Audio generado correctamente con gTTS: {os.path.getsize(temp_file)} bytes")
+                return temp_file
+            raise Exception("Archivo de audio gTTS inválido")
+        except Exception as gtts_error:
+            print(f"Error con gTTS: {gtts_error}")
+            return None
+    async def edge_tts_speak(self, text, voice, output_file):
+        """Genera audio usando edge-tts"""
+        try:
+            print(f"Generando audio con voz: {voice}")
+            rate = self.AVAILABLE_MODELS[self.current_model]['rate']
+            print(f"Usando rate: {rate}")
+            communicate = edge_tts.Communicate(text, voice, rate=rate)
+            await communicate.save(str(output_file))
+            print(f"Audio generado correctamente con {voice}")
+            return True
+        except Exception as e:
+            print(f"Error generando audio con edge-tts: {e}")
+            raise
+    def stop_speaking(self):
+        """Detiene la reproducción actual"""
+        if self.is_speaking:
+            try:
+                self.should_stop = True
+                pygame.mixer.music.stop()
+                pygame.mixer.music.unload()
+                self.is_speaking = False
+                print("Reproducción detenida por interrupción")
+            except Exception as e:
+                print(f"Error al detener el audio: {e}")
+            finally:
+                self.is_speaking = False
+                self.should_stop = False
+    def change_model(self, model_name):
+        """Cambia el modelo TTS actual"""
+        if model_name not in self.AVAILABLE_MODELS:
+            print(f"Modelo {model_name} no disponible")
+            return False
+        try:
+            print(f"Cambiando a modelo {model_name}...")
+            self.current_model = model_name
+            self.init_audio()
+            return True
+        except Exception as e:
+            print(f"Error cambiando modelo: {e}")
+            return False
+    def is_currently_speaking(self):
+        """Return the ``is_speaking`` flag, which play_audio sets while audio is playing."""
+        return self.is_speaking
+    def create_audio_file(self, text, output_file):
+        """Crea un archivo de audio permanente"""
+        try:
+            if 'EDGE' in self.current_model:
+                voice = self.AVAILABLE_MODELS[self.current_model]['name']
+                async def tts_with_timeout():
+                    return await asyncio.wait_for(
+                        self.edge_tts_speak(text, voice, output_file),
+                        timeout=5.0
+                    )
+                asyncio.run(tts_with_timeout())
+            elif self.current_model == 'gTTS':
+                tts = gTTS(text=text, lang='es', slow=False)
+                tts.save(str(output_file))
+            else:  # VITS
+                self.tts.tts_to_file(
+                    text=text,
+                    file_path=str(output_file),
+                    speaker_wav=None,
+                    split_sentences=False
+                )
+            return str(output_file)
+        except Exception as e:
+            print(f"Error creando archivo de audio: {e}")
+            return None
+    def play_audio(self, file_path):
+        """Play an audio file through the pygame mixer and wait until it ends.
+
+        Playback is serialised with ``play_lock``. While the file plays, the
+        attached voice detector (if any) is put into high-threshold mode and
+        restored afterwards. Setting ``should_stop`` interrupts playback.
+
+        Args:
+            file_path: Path of the audio file to play.
+
+        Raises:
+            Exception: If the audio system is unavailable.
+            FileNotFoundError: If ``file_path`` does not exist.
+            ValueError: If the file is empty.
+            Any error raised during playback is logged and re-raised.
+        """
+        if not self._verify_audio_system():
+            raise Exception("Sistema de audio no disponible")
+        try:
+            if not Path(file_path).exists():
+                raise FileNotFoundError(f"Archivo no encontrado: {file_path}")
+            if not Path(file_path).stat().st_size > 0:
+                raise ValueError("Archivo de audio vacío o corrupto")
+            with self.play_lock:
+                # Cut off anything still flagged as playing before starting.
+                if self.is_speaking:
+                    self.stop_speaking()
+                self.is_speaking = True
+                pygame.mixer.music.load(file_path)
+                pygame.mixer.music.play()
+                # Tell the voice detector that playback has started
+                if hasattr(self, 'voice_detector'):
+                    self.voice_detector.update_last_audio_output()
+                    # Keep listening active, but with a higher threshold
+                    self.voice_detector.set_high_threshold_mode(True)
+                # Callback run when the end-of-music event is received
+                def on_music_end():
+                    self.is_speaking = False
+                    self.should_stop = False
+                    # Restore the normal listening threshold
+                    if hasattr(self, 'voice_detector'):
+                        self.voice_detector.set_high_threshold_mode(False)
+                pygame.mixer.music.set_endevent(pygame.USEREVENT)
+                pygame.event.set_allowed(pygame.USEREVENT)
+                # Poll until playback ends or a stop is requested. This loop
+                # blocks the caller and holds play_lock for the whole playback.
+                # NOTE(review): pygame.event.get() may require the pygame
+                # display/video subsystem to be initialised - confirm, since
+                # only the mixer is initialised in this class.
+                while pygame.mixer.music.get_busy() and not self.should_stop:
+                    for event in pygame.event.get():
+                        if event.type == pygame.USEREVENT:
+                            on_music_end()
+                    self.clock.tick(30)
+                if self.should_stop:
+                    self.stop_speaking()
+                    # Restore the normal threshold after an interruption
+                    if hasattr(self, 'voice_detector'):
+                        self.voice_detector.set_high_threshold_mode(False)
+        except Exception as e:
+            print(f"Error reproduciendo audio: {e}")
+            self.is_speaking = False
+            # Force _verify_audio_system to re-initialise the mixer next time.
+            # NOTE(review): this also triggers for a missing or empty file.
+            self.audio_initialized = False
+            if hasattr(self, 'voice_detector'):
+                self.voice_detector.set_high_threshold_mode(False)
+            raise
+        finally:
+            # Flags are always cleared, whatever the outcome.
+            self.is_speaking = False
+            self.should_stop = False
+    def set_voice_detector(self, voice_detector):
+        """Attach the voice detector used to coordinate interruptions.
+
+        play_audio calls ``update_last_audio_output()`` and
+        ``set_high_threshold_mode()`` on this object.
+        """
+        self.voice_detector = voice_detector
+    def __del__(self):
+        try:
+            pygame.mixer.quit()
+            if self.temp_dir.exists():
+                for file in self.temp_dir.glob("*"):
+                    try:
+                        file.unlink()
+                    except:
+                        pass
+                self.temp_dir.rmdir()
+        except:
+            pass