Spaces:

Remsky
/

Kokoro-TTS-Zero

Running on Zero

App Files Files Community

Remsky committed 15 days ago

Commit

80c0dbf

1 Parent(s): bb43905

Refactor TTSModelV1 to load voice mappings from JSON and simplify voice selection

Browse files

Files changed (2) hide show

tts_model_v1.py +10 -60
voices/v1_voices.json +32 -0

tts_model_v1.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import torch
 import numpy as np
 import time
@@ -6,54 +7,32 @@ from typing import Tuple, List
 import soundfile as sf
 from kokoro import KPipeline
 import spaces
-from lib.file_utils import download_voice_files, ensure_dir
 class TTSModelV1:
     """KPipeline-based TTS model for v1.0.0"""
     def __init__(self):
         self.pipeline = None
-        self.model_repo = "hexgrad/Kokoro-82M"
-        # Use v1 voices from Kokoro-82M repo
-        self.voices_dir = os.path.join(os.path.dirname(__file__), "voices")
     def initialize(self) -> bool:
-        """Initialize KPipeline and verify voices"""
         try:
             print("Initializing v1.0.0 model...")
             self.pipeline = None # cannot be initialized outside of GPU decorator
-            # Download v1 voices if needed
-            ensure_dir(self.voices_dir)
-            if not os.path.exists(os.path.join(self.voices_dir, "voices")):
-                print("Downloading v1 voices...")
-                download_voice_files(self.model_repo, "voices", self.voices_dir)
-            # Verify voices were downloaded successfully
-            available_voices = self.list_voices()
-            if not available_voices:
-                print("Warning: No voices found after initialization")
-            else:
-                print(f"Found {len(available_voices)} voices")
             print("Model initialization complete")
             return True
         except Exception as e:
             print(f"Error initializing model: {str(e)}")
             return False
     def list_voices(self) -> List[str]:
         """List available voices"""
-        voices = []
-        voices_dir = os.path.join(self.voices_dir, "voices")
-        if os.path.exists(voices_dir):
-            for file in os.listdir(voices_dir):
-                if file.endswith(".pt"):
-                    voice_name = file[:-3]
-                    voices.append(voice_name)
-        return voices
     @spaces.GPU(duration=None)  # Duration will be set by the UI
     def generate_speech(self, text: str, voice_names: list[str], speed: float = 1.0, gpu_timeout: int = 60, progress_callback=None, progress_state=None, progress=None) -> Tuple[np.ndarray, float]:
@@ -76,35 +55,12 @@ class TTSModelV1:
             if not text or not voice_names:
                 raise ValueError("Text and voice name are required")
-            # Handle voice mixing
             if isinstance(voice_names, list) and len(voice_names) > 1:
-                t_voices = []
-                for voice in voice_names:
-                    try:
-                        voice_path = os.path.join(self.voices_dir, "voices", f"{voice}.pt")
-                        try:
-                            voicepack = torch.load(voice_path, weights_only=True)
-                        except Exception as e:
-                            print(f"Warning: weights_only load failed, attempting full load: {str(e)}")
-                            voicepack = torch.load(voice_path, weights_only=False)
-                        t_voices.append(voicepack)
-                    except Exception as e:
-                        print(f"Warning: Failed to load voice {voice}: {str(e)}")
-                # Combine voices by taking mean
-                voicepack = torch.mean(torch.stack(t_voices), dim=0)
                 voice_name = "_".join(voice_names)
-                # Save mixed voice temporarily
-                mixed_voice_path = os.path.join(self.voices_dir, "voices", f"{voice_name}.pt")
-                torch.save(voicepack, mixed_voice_path)
             else:
                 voice_name = voice_names[0]
-                voice_path = os.path.join(self.voices_dir, "voices", f"{voice_name}.pt")
-                try:
-                    voicepack = torch.load(voice_path, weights_only=True)
-                except Exception as e:
-                    print(f"Warning: weights_only load failed, attempting full load: {str(e)}")
-                    voicepack = torch.load(voice_path, weights_only=False)
             # Initialize tracking
             audio_chunks = []
@@ -172,12 +128,6 @@ class TTSModelV1:
             # Concatenate audio chunks
             audio = np.concatenate(audio_chunks)
-            # Cleanup temporary mixed voice if created
-            if len(voice_names) > 1:
-                try:
-                    os.remove(mixed_voice_path)
-                except:
-                    pass
             # Return audio and metrics
             return (

 import os
+import json
 import torch
 import numpy as np
 import time
 import soundfile as sf
 from kokoro import KPipeline
 import spaces
 class TTSModelV1:
     """KPipeline-based TTS model for v1.0.0"""
     def __init__(self):
         """Create the model object and load the v1 voice mappings.

         Reads ``voices/v1_voices.json`` from the directory containing this
         module and stores the parsed JSON on ``self.voice_map``.
         ``self.pipeline`` is set to ``None`` here.

         Raises:
             OSError: if the voice map file cannot be opened.
             ValueError: if the file content is not valid UTF-8 JSON
                 (``json.JSONDecodeError`` / ``UnicodeDecodeError``).
         """
         self.pipeline = None
         # Load v1 voice mappings
         voice_map_path = os.path.join(os.path.dirname(__file__), "voices", "v1_voices.json")
         # Pin the encoding: without it open() falls back to the platform
         # locale, which is not guaranteed to be UTF-8.
         with open(voice_map_path, encoding="utf-8") as f:
             self.voice_map = json.load(f)
     def initialize(self) -> bool:
+        """Initialize KPipeline"""
         try:
             print("Initializing v1.0.0 model...")
             self.pipeline = None # cannot be initialized outside of GPU decorator
             print("Model initialization complete")
             return True
         except Exception as e:
             print(f"Error initializing model: {str(e)}")
             return False
     def list_voices(self) -> List[str]:
         """List available voices"""
+        # Return all voices from voice map
+        return self.voice_map["american"] + self.voice_map["british"]
     @spaces.GPU(duration=None)  # Duration will be set by the UI
     def generate_speech(self, text: str, voice_names: list[str], speed: float = 1.0, gpu_timeout: int = 60, progress_callback=None, progress_state=None, progress=None) -> Tuple[np.ndarray, float]:
             if not text or not voice_names:
                 raise ValueError("Text and voice name are required")
+            # Handle voice selection
             if isinstance(voice_names, list) and len(voice_names) > 1:
+                # For multiple voices, join them with underscore
                 voice_name = "_".join(voice_names)
             else:
                 voice_name = voice_names[0]
             # Initialize tracking
             audio_chunks = []
             # Concatenate audio chunks
             audio = np.concatenate(audio_chunks)
             # Return audio and metrics
             return (

voices/v1_voices.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "american": [
+        "af_alloy",
+        "af_aoede",
+        "af_bella",
+        "af_jessica",
+        "af_kore",
+        "af_nicole",
+        "af_nova",
+        "af_river",
+        "af_sarah",
+        "af_sky",
+        "am_adam",
+        "am_echo",
+        "am_eric",
+        "am_fenrir",
+        "am_liam",
+        "am_michael",
+        "am_onyx",
+        "am_puck"
+    ],
+    "british": [
+        "bf_alice",
+        "bf_emma",
+        "bf_isabella",
+        "bf_lily",
+        "bm_daniel",
+        "bm_fable",
+        "bm_george",
+        "bm_lewis"
+    ]
+}