Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Initial commit
Browse files- .gitignore +10 -0
- .python-version +1 -0
- README.md +3 -1
- app.py +269 -0
- packages.txt +1 -0
- pyproject.toml +14 -0
- uv.lock +0 -0
.gitignore
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python-generated files
|
2 |
+
__pycache__/
|
3 |
+
*.py[oc]
|
4 |
+
build/
|
5 |
+
dist/
|
6 |
+
wheels/
|
7 |
+
*.egg-info
|
8 |
+
|
9 |
+
# Virtual environments
|
10 |
+
.venv
|
.python-version
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
3.13
|
README.md
CHANGED
@@ -4,13 +4,15 @@ emoji: 😻
|
|
4 |
colorFrom: red
|
5 |
colorTo: pink
|
6 |
sdk: gradio
|
7 |
-
python_version: 3.
|
8 |
sdk_version: 5.43.1
|
9 |
suggested_hardware: cpu-basic
|
10 |
app_file: app.py
|
11 |
pinned: true
|
12 |
license: apache-2.0
|
13 |
short_description: Generate natural speech from text on any CPU
|
|
|
|
|
14 |
---
|
15 |
|
16 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
4 |
colorFrom: red
|
5 |
colorTo: pink
|
6 |
sdk: gradio
|
7 |
+
python_version: 3.13
|
8 |
sdk_version: 5.43.1
|
9 |
suggested_hardware: cpu-basic
|
10 |
app_file: app.py
|
11 |
pinned: true
|
12 |
license: apache-2.0
|
13 |
short_description: Generate natural speech from text on any CPU
|
14 |
+
models:
|
15 |
+
- KittenML/kitten-tts-nano-0.2
|
16 |
---
|
17 |
|
18 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
from typing import Dict, List, Tuple, Optional
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import onnxruntime as ort
|
10 |
+
import phonemizer
|
11 |
+
import soundfile as sf
|
12 |
+
from huggingface_hub import hf_hub_download
|
13 |
+
import gradio as gr
|
14 |
+
|
15 |
+
|
16 |
+
# ---------------------------
|
17 |
+
# Utility: tokenization + cleaning
|
18 |
+
# ---------------------------
|
19 |
+
|
20 |
+
# Regex matching either a maximal run of word characters or a single
# non-space, non-word character (punctuation).
_TOKENIZER_RE = re.compile(r"\w+|[^\w\s]")


def basic_english_tokenize(text: str) -> List[str]:
    """Split *text* into word tokens and individual punctuation marks.

    Whitespace is discarded; every non-word, non-space character becomes
    its own token.
    """
    return [match.group(0) for match in _TOKENIZER_RE.finditer(text)]
|
26 |
+
|
27 |
+
|
28 |
+
class TextCleaner:
    """Maps characters to integer token IDs over the fixed symbol inventory.

    The index order (pad, punctuation, ASCII letters, IPA letters) must match
    the symbol table used when the model was trained.
    """

    def __init__(self) -> None:
        pad = "$"
        punctuation = ';:,.!?¡¿—…"«»"" '
        letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        letters_ipa = (
            "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
        )
        inventory = [pad] + list(punctuation) + list(letters) + list(letters_ipa)
        self._dict: Dict[str, int] = {
            symbol: index for index, symbol in enumerate(inventory)
        }

    def __call__(self, text: str) -> List[int]:
        """Convert *text* to token IDs; unknown characters are silently dropped."""
        ids: List[int] = []
        for ch in text:
            idx = self._dict.get(ch)
            if idx is not None:
                ids.append(idx)
        return ids
|
44 |
+
|
45 |
+
|
46 |
+
# ---------------------------
|
47 |
+
# Core model
|
48 |
+
# ---------------------------
|
49 |
+
|
50 |
+
class KittenTTS_1_Onnx:
    """ONNX-based KittenTTS inference engine.

    Public interface (kept compatible with the original package):
      - generate(text, voice, speed) -> np.ndarray
      - generate_to_file(text, output_path, voice, speed, sample_rate)
      - available_voices: list of voice names accepted by generate().
    """

    # Original voice set kept for compatibility.
    _DEFAULT_VOICES = [
        "expr-voice-2-m", "expr-voice-2-f",
        "expr-voice-3-m", "expr-voice-3-f",
        "expr-voice-4-m", "expr-voice-4-f",
        "expr-voice-5-m", "expr-voice-5-f",
    ]

    def __init__(
        self,
        model_path: str = "kitten_tts_nano_preview.onnx",
        voices_path: str = "voices.npz",
        providers: Optional[List[str]] = None,
    ) -> None:
        """Load voice embeddings, set up the phonemizer, and open an ORT session.

        Parameters:
            model_path: path to the ONNX model file.
            voices_path: path to an .npz archive of per-voice style vectors.
            providers: optional ONNX Runtime execution providers; defaults to CPU.
        """
        self.model_path = model_path
        self.voices = np.load(voices_path)
        self._phonemizer = phonemizer.backend.EspeakBackend(
            language="en-us", preserve_punctuation=True, with_stress=True
        )
        self._cleaner = TextCleaner()

        # Derive available voices from the voices file when possible,
        # otherwise fall back to the hard-coded defaults.
        try:
            voice_files = list(getattr(self.voices, "files", []))
        except Exception:
            voice_files = []
        known = [v for v in self._DEFAULT_VOICES if v in voice_files]
        self.available_voices: List[str] = known or (voice_files or self._DEFAULT_VOICES)

        # ONNX Runtime session with aggressive graph optimizations.
        session_options = ort.SessionOptions()
        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        # Default to CPU. Keep only providers this runtime actually supports
        # to avoid hard failures on constrained hosts (e.g. Spaces); ORT
        # thread env vars are respected implicitly by not overriding them.
        requested = providers if providers else ["CPUExecutionProvider"]
        supported = set(ort.get_available_providers())
        usable = [p for p in requested if p in supported] or list(supported)

        self.session = ort.InferenceSession(
            self.model_path,
            sess_options=session_options,
            providers=usable,
        )

    def _prepare_inputs(self, text: str, voice: str, speed: float) -> Dict[str, np.ndarray]:
        """Phonemize *text* and assemble the ONNX input feed for *voice*.

        Raises:
            ValueError: if *voice* is not in ``available_voices``.
        """
        if voice not in self.available_voices:
            raise ValueError(
                f"Voice '{voice}' not available. Choose from: {self.available_voices}"
            )

        # Phonemize, re-tokenize, then map characters to token IDs.
        phoneme_strings = self._phonemizer.phonemize([text])
        phoneme_text = " ".join(basic_english_tokenize(phoneme_strings[0]))
        token_ids = self._cleaner(phoneme_text)

        # Token 0 (pad) brackets the sequence, as in the original.
        token_ids = [0] + token_ids + [0]

        return {
            "input_ids": np.asarray([token_ids], dtype=np.int64),
            "style": self.voices[voice],
            "speed": np.asarray([speed], dtype=np.float32),
        }

    def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
        """Synthesize *text* and return a float32 waveform array."""
        feed = self._prepare_inputs(text, voice, speed)
        raw_outputs = self.session.run(None, feed)
        waveform = np.asarray(raw_outputs[0]).astype(np.float32)

        # Preserve original trimming while guarding short sequences.
        if waveform.size > 15000:
            waveform = waveform[5000:-10000]
        return waveform

    def generate_to_file(
        self,
        text: str,
        output_path: str,
        voice: str = "expr-voice-5-m",
        speed: float = 1.0,
        sample_rate: int = 24000,
    ) -> None:
        """Synthesize *text* and write the waveform to *output_path*."""
        sf.write(output_path, self.generate(text, voice, speed), sample_rate)
|
152 |
+
|
153 |
+
|
154 |
+
# ---------------------------
|
155 |
+
# HF download wrapper (consolidated)
|
156 |
+
# ---------------------------
|
157 |
+
|
158 |
+
class KittenTTS:
    """High-level wrapper that fetches model assets from Hugging Face."""

    def __init__(
        self,
        model_name: str = "KittenML/kitten-tts-nano-0.1",
        cache_dir: Optional[str] = None,
        providers: Optional[List[str]] = None,
    ) -> None:
        """Download model assets for *model_name* and build the ONNX backend.

        *model_name* may be a bare name or a full "org/name" repo id.
        """
        repo_id = model_name if "/" in model_name else f"KittenML/{model_name}"
        self._model = download_from_huggingface(
            repo_id=repo_id, cache_dir=cache_dir, providers=providers
        )

    def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
        """Synthesize *text* with *voice* at *speed*; returns a waveform array."""
        return self._model.generate(text, voice=voice, speed=speed)

    def generate_to_file(
        self,
        text: str,
        output_path: str,
        voice: str = "expr-voice-5-m",
        speed: float = 1.0,
        sample_rate: int = 24000,
    ) -> None:
        """Synthesize *text* and write the result to *output_path*."""
        return self._model.generate_to_file(
            text, output_path, voice=voice, speed=speed, sample_rate=sample_rate
        )

    @property
    def available_voices(self) -> List[str]:
        """Voice names accepted by generate()."""
        return self._model.available_voices
|
188 |
+
|
189 |
+
|
190 |
+
def download_from_huggingface(
    repo_id: str = "KittenML/kitten-tts-nano-0.1",
    cache_dir: Optional[str] = None,
    providers: Optional[List[str]] = None,
) -> KittenTTS_1_Onnx:
    """Fetch config, ONNX model, and voices from *repo_id*; build the model.

    Raises:
        ValueError: if config.json does not declare the "ONNX1" model type.
    """
    config_path = hf_hub_download(repo_id=repo_id, filename="config.json", cache_dir=cache_dir)
    with open(config_path, "r", encoding="utf-8") as fh:
        cfg = json.load(fh)

    if cfg.get("type") != "ONNX1":
        raise ValueError("Unsupported model type in config.json.")

    model_file = hf_hub_download(repo_id=repo_id, filename=cfg["model_file"], cache_dir=cache_dir)
    voices_file = hf_hub_download(repo_id=repo_id, filename=cfg["voices"], cache_dir=cache_dir)
    return KittenTTS_1_Onnx(model_path=model_file, voices_path=voices_file, providers=providers)
|
208 |
+
|
209 |
+
|
210 |
+
def get_model(repo_id: str = "KittenML/kitten-tts-nano-0.1", cache_dir: Optional[str] = None) -> KittenTTS:
    """Backward-compatible alias for constructing a KittenTTS wrapper."""
    return KittenTTS(model_name=repo_id, cache_dir=cache_dir)
|
213 |
+
|
214 |
+
|
215 |
+
# ---------------------------
# Gradio app
# ---------------------------

# Allow overriding model repo and providers via env on Spaces.
_MODEL_REPO = os.getenv("MODEL_REPO", "KittenML/kitten-tts-nano-0.1")

# Use CPU by default on Spaces; adjust if GPU EPs are available.
# Strip whitespace and drop empty entries so values such as
# "CUDAExecutionProvider, CPUExecutionProvider" parse correctly — a bare
# split(",") would leave " CPUExecutionProvider", which the downstream
# supported-provider filter would reject.
_DEFAULT_PROVIDERS = [
    p.strip()
    for p in os.getenv("ORT_PROVIDERS", "CPUExecutionProvider").split(",")
    if p.strip()
] or ["CPUExecutionProvider"]

# Single global instance for efficiency: the model is loaded once at import
# time and shared by all requests.
_TTS = KittenTTS(_MODEL_REPO, providers=_DEFAULT_PROVIDERS)
|
226 |
+
|
227 |
+
|
228 |
+
def _synthesize(text: str, voice: str, speed: float) -> Tuple[int, np.ndarray]:
    """Run TTS on *text* and return (sample_rate, waveform) for gr.Audio.

    Raises:
        gr.Error: if *text* is empty or whitespace-only.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text.")
    waveform = _TTS.generate(text, voice=voice, speed=speed)
    # Gradio expects (sample_rate, np.ndarray[float32]).
    return 24000, waveform.astype(np.float32, copy=False)
|
234 |
+
|
235 |
+
|
236 |
+
# UI layout: one text box, a voice/speed row, and an audio output wired to
# the synthesis callback.
with gr.Blocks(title="KittenTTS Nano") as demo:
    gr.Markdown("# KittenTTS Nano\nText-to-Speech using ONNX on CPU")

    with gr.Row():
        inp_text = gr.Textbox(
            label="Text",
            lines=6,
            placeholder='Type something like: "The quick brown fox jumps over the lazy dog."',
        )

    with gr.Row():
        # Prefer the package's default voice when the loaded voice file has it.
        _preferred = "expr-voice-5-m"
        voice = gr.Dropdown(
            label="Voice",
            choices=_TTS.available_voices,
            value=(
                _preferred
                if _preferred in _TTS.available_voices
                else _TTS.available_voices[0]
            ),
        )
        speed = gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1.0, label="Speed")

    out_audio = gr.Audio(label="Output Audio", type="numpy")
    btn = gr.Button("Generate")

    btn.click(_synthesize, inputs=[inp_text, voice, speed], outputs=out_audio)

    gr.Examples(
        examples=[
            ["Hello from KittenTTS Nano.", "expr-voice-5-m", 1.0],
            ["It begins with an Ugh. Another mysterious stain appears on a favorite shirt.", "expr-voice-2-f", 1.0],
        ],
        inputs=[inp_text, voice, speed],
    )

# Spaces will auto-run app.py. Local dev can still call launch().
if __name__ == "__main__":
    demo.launch()
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
espeak-ng
|
pyproject.toml
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[project]
|
2 |
+
name = "kittentts"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = "Generate natural speech from text on any CPU"
|
5 |
+
readme = "README.md"
|
6 |
+
requires-python = ">=3.13"
|
7 |
+
dependencies = [
|
8 |
+
"gradio>=5.43.1",
|
9 |
+
"huggingface-hub[hf-xet]>=0.34.4",
|
10 |
+
"numpy>=2.3.2",
|
11 |
+
"onnxruntime>=1.22.1",
|
12 |
+
"phonemizer>=3.3.0",
|
13 |
+
"soundfile>=0.13.1",
|
14 |
+
]
|
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|