Spaces:

Remsky
/

Kokoro-TTS-Zero

Running on Zero

App Files Files Community

Remsky committed 15 days ago

Commit

27f8803

1 Parent(s): 3a912ba

Add v1.0.0 model support with KPipeline implementation

Browse files

Files changed (5) hide show

README.md +1 -0
app.py +36 -16
requirements.txt +5 -1
tts_factory.py +22 -0
tts_model_v1.py +168 -0

README.md CHANGED Viewed

@@ -10,6 +10,7 @@ pinned: true
 short_description: Accelerated Text-To-Speech on Kokoro-82M
 models:
 - hexgrad/kLegacy
 ---
 # Kokoro TTS Demo Space

 short_description: Accelerated Text-To-Speech on Kokoro-82M
 models:
 - hexgrad/kLegacy
+- hexgrad/Kokoro-82M
 ---
 # Kokoro TTS Demo Space

app.py CHANGED Viewed

@@ -9,13 +9,13 @@ from lib import format_audio_output
 from lib.ui_content import header_html, demo_text_info, styling
 from lib.book_utils import get_available_books, get_book_info, get_chapter_text
 from lib.text_utils import count_tokens
-from tts_model import TTSModel
 # Set HF_HOME for faster restarts with cached models/voices
 os.environ["HF_HOME"] = "/data/.huggingface"
-# Create TTS model instance
-model = TTSModel()
 # Configure logging
 logging.basicConfig(level=logging.DEBUG)
@@ -24,21 +24,24 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING)
 logger = logging.getLogger(__name__)
 logger.debug("Starting app initialization...")
-model = TTSModel()
-def initialize_model():
     """Initialize model and get voices"""
-    if model.model is None:
         if not model.initialize():
             raise gr.Error("Failed to initialize model")
-    voices = model.list_voices()
-    if not voices:
-        raise gr.Error("No voices found. Please check the voices directory.")
-    default_voice = 'af_sky' if 'af_sky' in voices else voices[0] if voices else None
-    return gr.update(choices=voices, value=default_voice)
 def update_progress(chunk_num, total_chunks, tokens_per_sec, rtf, progress_state, start_time, gpu_timeout, progress):
     # Calculate time metrics
@@ -382,6 +385,14 @@ with gr.Blocks(title="Kokoro TTS Demo", css=styling) as demo:
             )
             with gr.Group():
                 voice_dropdown = gr.Dropdown(
                     label="Voice(s)",
                     choices=[],  # Start empty, will be populated after initialization
@@ -390,6 +401,15 @@ with gr.Blocks(title="Kokoro TTS Demo", css=styling) as demo:
                     multiselect=True
                 )
                 speed_slider = gr.Slider(
                     label="Speed",
                     minimum=0.5,
@@ -436,9 +456,9 @@ with gr.Blocks(title="Kokoro TTS Demo", css=styling) as demo:
         with gr.Column():
             gr.Markdown(demo_text_info)
-    # Initialize voices on load
     demo.load(
-        fn=initialize_model,
         outputs=[voice_dropdown]
     )

 from lib.ui_content import header_html, demo_text_info, styling
 from lib.book_utils import get_available_books, get_book_info, get_chapter_text
 from lib.text_utils import count_tokens
+from tts_factory import TTSFactory
 # Set HF_HOME for faster restarts with cached models/voices
 os.environ["HF_HOME"] = "/data/.huggingface"
+# Initialize model variable
+model = None
 # Configure logging
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 logger.debug("Starting app initialization...")
+def initialize_model(version="v0.19"):
     """Initialize model and get voices"""
+    global model
+    try:
+        # Create model instance using factory
+        model = TTSFactory.create_model(version)
         if not model.initialize():
             raise gr.Error("Failed to initialize model")
+        voices = model.list_voices()
+        if not voices:
+            raise gr.Error("No voices found. Please check the voices directory.")
+        default_voice = 'af_sky' if 'af_sky' in voices else voices[0] if voices else None
+        return gr.update(choices=voices, value=default_voice)
+    except Exception as e:
+        raise gr.Error(f"Failed to initialize model: {str(e)}")
 def update_progress(chunk_num, total_chunks, tokens_per_sec, rtf, progress_state, start_time, gpu_timeout, progress):
     # Calculate time metrics
             )
             with gr.Group():
+                version_dropdown = gr.Dropdown(
+                    label="Model Version",
+                    choices=["v0.19", "v1.0.0"],
+                    value="v0.19",
+                    allow_custom_value=False,
+                    multiselect=False
+                )
                 voice_dropdown = gr.Dropdown(
                     label="Voice(s)",
                     choices=[],  # Start empty, will be populated after initialization
                     multiselect=True
                 )
+                def on_version_change(version):
+                    return initialize_model(version)
+                version_dropdown.change(
+                    fn=on_version_change,
+                    inputs=[version_dropdown],
+                    outputs=[voice_dropdown]
+                )
                 speed_slider = gr.Slider(
                     label="Speed",
                     minimum=0.5,
         with gr.Column():
             gr.Markdown(demo_text_info)
+    # Initialize voices on load with default version
     demo.load(
+        fn=lambda: initialize_model("v0.19"),
         outputs=[voice_dropdown]
     )

requirements.txt CHANGED Viewed

@@ -9,4 +9,8 @@ regex==2024.11.6
 tiktoken==0.8.0
 transformers==4.47.1
 munch==4.0.0
-matplotlib==3.4.3

 tiktoken==0.8.0
 transformers==4.47.1
 munch==4.0.0
+matplotlib==3.4.3
+# v1.0.0 dependencies
+kokoro>=1.0.0
+misaki[en]>=0.1.0

tts_factory.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from tts_model import TTSModel
+from tts_model_v1 import TTSModelV1
+class TTSFactory:
+    """Factory class to create appropriate TTS model version"""
+    @staticmethod
+    def create_model(version="v0.19"):
+        """Create TTS model instance for specified version
+        Args:
+            version: Model version to use ("v0.19" or "v1.0.0")
+        Returns:
+            TTSModel or TTSModelV1 instance
+        """
+        if version == "v0.19":
+            return TTSModel()
+        elif version == "v1.0.0":
+            return TTSModelV1()
+        else:
+            raise ValueError(f"Unsupported version: {version}")

tts_model_v1.py ADDED Viewed

	@@ -0,0 +1,168 @@

+import os
+import torch
+import numpy as np
+import time
+from typing import Tuple, List
+import soundfile as sf
+from kokoro import KPipeline
+import spaces
+class TTSModelV1:
+    """KPipeline-based TTS model for v1.0.0"""
+    def __init__(self):
+        self.pipeline = None
+        self.voices_dir = "voices"
+        self.model_repo = "hexgrad/Kokoro-82M"
+    def initialize(self) -> bool:
+        """Initialize KPipeline and verify voices"""
+        try:
+            print("Initializing v1.0.0 model...")
+            # Initialize KPipeline with American English
+            self.pipeline = KPipeline(lang_code='a')
+            # Verify local voice files are available
+            voices_dir = os.path.join(self.voices_dir, "voices")
+            if not os.path.exists(voices_dir):
+                raise ValueError("Voice files not found")
+            # Verify voices were downloaded successfully
+            available_voices = self.list_voices()
+            if not available_voices:
+                print("Warning: No voices found after initialization")
+            else:
+                print(f"Found {len(available_voices)} voices")
+            print("Model initialization complete")
+            return True
+        except Exception as e:
+            print(f"Error initializing model: {str(e)}")
+            return False
+    def list_voices(self) -> List[str]:
+        """List available voices"""
+        voices = []
+        voices_subdir = os.path.join(self.voices_dir, "voices")
+        if os.path.exists(voices_subdir):
+            for file in os.listdir(voices_subdir):
+                if file.endswith(".pt"):
+                    voice_name = file[:-3]
+                    voices.append(voice_name)
+        return voices
+    @spaces.GPU(duration=None)  # Duration will be set by the UI
+    def generate_speech(self, text: str, voice_names: list[str], speed: float = 1.0, gpu_timeout: int = 60, progress_callback=None, progress_state=None, progress=None) -> Tuple[np.ndarray, float]:
+        """Generate speech from text using KPipeline
+        Args:
+            text: Input text to convert to speech
+            voice_names: List of voice names to use (will be mixed if multiple)
+            speed: Speech speed multiplier
+            progress_callback: Optional callback function
+            progress_state: Dictionary tracking generation progress metrics
+            progress: Progress callback from Gradio
+        """
+        try:
+            start_time = time.time()
+            if not text or not voice_names:
+                raise ValueError("Text and voice name are required")
+            # Handle voice mixing
+            if isinstance(voice_names, list) and len(voice_names) > 1:
+                t_voices = []
+                for voice in voice_names:
+                    try:
+                        voice_path = os.path.join(self.voices_dir, "voices", f"{voice}.pt")
+                        try:
+                            voicepack = torch.load(voice_path, weights_only=True)
+                        except Exception as e:
+                            print(f"Warning: weights_only load failed, attempting full load: {str(e)}")
+                            voicepack = torch.load(voice_path, weights_only=False)
+                        t_voices.append(voicepack)
+                    except Exception as e:
+                        print(f"Warning: Failed to load voice {voice}: {str(e)}")
+                # Combine voices by taking mean
+                voicepack = torch.mean(torch.stack(t_voices), dim=0)
+                voice_name = "_".join(voice_names)
+                # Save mixed voice temporarily
+                mixed_voice_path = os.path.join(self.voices_dir, "voices", f"{voice_name}.pt")
+                torch.save(voicepack, mixed_voice_path)
+            else:
+                voice_name = voice_names[0]
+            # Generate speech using KPipeline
+            generator = self.pipeline(
+                text,
+                voice=voice_name,
+                speed=speed,
+                split_pattern=r'\n+'  # Default chunking pattern
+            )
+            # Process chunks and collect metrics
+            audio_chunks = []
+            chunk_times = []
+            chunk_sizes = []
+            total_tokens = 0
+            for i, (gs, ps, audio) in enumerate(generator):
+                chunk_start = time.time()
+                # Store chunk audio
+                audio_chunks.append(audio)
+                # Calculate metrics
+                chunk_time = time.time() - chunk_start
+                chunk_times.append(chunk_time)
+                chunk_sizes.append(len(gs))  # Use grapheme length as chunk size
+                # Update progress if callback provided
+                if progress_callback:
+                    chunk_duration = len(audio) / 24000
+                    rtf = chunk_time / chunk_duration
+                    progress_callback(
+                        i + 1,
+                        -1,  # Total chunks unknown with generator
+                        len(gs) / chunk_time,  # tokens/sec
+                        rtf,
+                        progress_state,
+                        start_time,
+                        gpu_timeout,
+                        progress
+                    )
+                print(f"Chunk {i+1} processed in {chunk_time:.2f}s")
+                print(f"Graphemes: {gs}")
+                print(f"Phonemes: {ps}")
+            # Concatenate audio chunks
+            audio = np.concatenate(audio_chunks)
+            # Cleanup temporary mixed voice if created
+            if len(voice_names) > 1:
+                try:
+                    os.remove(mixed_voice_path)
+                except:
+                    pass
+            # Return audio and metrics
+            return (
+                audio,
+                len(audio) / 24000,
+                {
+                    "chunk_times": chunk_times,
+                    "chunk_sizes": chunk_sizes,
+                    "tokens_per_sec": [float(x) for x in progress_state["tokens_per_sec"]] if progress_state else [],
+                    "rtf": [float(x) for x in progress_state["rtf"]] if progress_state else [],
+                    "total_tokens": total_tokens,
+                    "total_time": time.time() - start_time
+                }
+            )
+        except Exception as e:
+            print(f"Error generating speech: {str(e)}")
+            raise