Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Initial commit
Browse files- .gitignore +10 -0
- .python-version +1 -0
- README.md +3 -1
- app.py +269 -0
- packages.txt +1 -0
- pyproject.toml +14 -0
- uv.lock +0 -0
.gitignore
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python-generated files
|
2 |
+
__pycache__/
|
3 |
+
*.py[oc]
|
4 |
+
build/
|
5 |
+
dist/
|
6 |
+
wheels/
|
7 |
+
*.egg-info
|
8 |
+
|
9 |
+
# Virtual environments
|
10 |
+
.venv
|
.python-version
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
3.13
|
README.md
CHANGED
@@ -4,13 +4,15 @@ emoji: 😻
|
|
4 |
colorFrom: red
|
5 |
colorTo: pink
|
6 |
sdk: gradio
|
7 |
-
python_version: 3.
|
8 |
sdk_version: 5.43.1
|
9 |
suggested_hardware: cpu-basic
|
10 |
app_file: app.py
|
11 |
pinned: true
|
12 |
license: apache-2.0
|
13 |
short_description: Generate natural speech from text on any CPU
|
|
|
|
|
14 |
---
|
15 |
|
16 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
4 |
colorFrom: red
|
5 |
colorTo: pink
|
6 |
sdk: gradio
|
7 |
+
python_version: 3.13
|
8 |
sdk_version: 5.43.1
|
9 |
suggested_hardware: cpu-basic
|
10 |
app_file: app.py
|
11 |
pinned: true
|
12 |
license: apache-2.0
|
13 |
short_description: Generate natural speech from text on any CPU
|
14 |
+
models:
|
15 |
+
- KittenML/kitten-tts-nano-0.2
|
16 |
---
|
17 |
|
18 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
from typing import Dict, List, Tuple, Optional
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import onnxruntime as ort
|
10 |
+
import phonemizer
|
11 |
+
import soundfile as sf
|
12 |
+
from huggingface_hub import hf_hub_download
|
13 |
+
import gradio as gr
|
14 |
+
|
15 |
+
|
16 |
+
# ---------------------------
|
17 |
+
# Utility: tokenization + cleaning
|
18 |
+
# ---------------------------
|
19 |
+
|
20 |
+
# Regex matching either a maximal run of word characters or a single
# non-space, non-word character (punctuation).
_TOKENIZER_RE = re.compile(r"\w+|[^\w\s]")


def basic_english_tokenize(text: str) -> List[str]:
    """Split *text* into word tokens and individual punctuation marks.

    Whitespace is discarded; every non-word, non-space character becomes
    its own token.
    """
    return [match.group(0) for match in _TOKENIZER_RE.finditer(text)]
|
26 |
+
|
27 |
+
|
28 |
+
class TextCleaner:
    """Maps characters to integer token IDs over the fixed symbol inventory.

    The index order (pad, punctuation, ASCII letters, IPA letters) must match
    the symbol table used when the model was trained.
    """

    def __init__(self) -> None:
        pad = "$"
        punctuation = ';:,.!?¡¿—…"«»"" '
        letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        letters_ipa = (
            "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
        )
        inventory = [pad] + list(punctuation) + list(letters) + list(letters_ipa)
        self._dict: Dict[str, int] = {
            symbol: index for index, symbol in enumerate(inventory)
        }

    def __call__(self, text: str) -> List[int]:
        """Convert *text* to token IDs; unknown characters are silently dropped."""
        ids: List[int] = []
        for ch in text:
            idx = self._dict.get(ch)
            if idx is not None:
                ids.append(idx)
        return ids
|
44 |
+
|
45 |
+
|
46 |
+
# ---------------------------
|
47 |
+
# Core model
|
48 |
+
# ---------------------------
|
49 |
+
|
50 |
+
class KittenTTS_1_Onnx:
    """ONNX-based KittenTTS inference engine.

    Public interface (kept compatible with the original package):
      - generate(text, voice, speed) -> np.ndarray
      - generate_to_file(text, output_path, voice, speed, sample_rate)
      - available_voices: list of voice names accepted by generate().
    """

    # Original voice set kept for compatibility.
    _DEFAULT_VOICES = [
        "expr-voice-2-m", "expr-voice-2-f",
        "expr-voice-3-m", "expr-voice-3-f",
        "expr-voice-4-m", "expr-voice-4-f",
        "expr-voice-5-m", "expr-voice-5-f",
    ]

    def __init__(
        self,
        model_path: str = "kitten_tts_nano_preview.onnx",
        voices_path: str = "voices.npz",
        providers: Optional[List[str]] = None,
    ) -> None:
        """Load voice embeddings, set up the phonemizer, and open an ORT session.

        Parameters:
            model_path: path to the ONNX model file.
            voices_path: path to an .npz archive of per-voice style vectors.
            providers: optional ONNX Runtime execution providers; defaults to CPU.
        """
        self.model_path = model_path
        self.voices = np.load(voices_path)
        self._phonemizer = phonemizer.backend.EspeakBackend(
            language="en-us", preserve_punctuation=True, with_stress=True
        )
        self._cleaner = TextCleaner()

        # Derive available voices from the voices file when possible,
        # otherwise fall back to the hard-coded defaults.
        try:
            voice_files = list(getattr(self.voices, "files", []))
        except Exception:
            voice_files = []
        known = [v for v in self._DEFAULT_VOICES if v in voice_files]
        self.available_voices: List[str] = known or (voice_files or self._DEFAULT_VOICES)

        # ONNX Runtime session with aggressive graph optimizations.
        session_options = ort.SessionOptions()
        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        # Default to CPU. Keep only providers this runtime actually supports
        # to avoid hard failures on constrained hosts (e.g. Spaces); ORT
        # thread env vars are respected implicitly by not overriding them.
        requested = providers if providers else ["CPUExecutionProvider"]
        supported = set(ort.get_available_providers())
        usable = [p for p in requested if p in supported] or list(supported)

        self.session = ort.InferenceSession(
            self.model_path,
            sess_options=session_options,
            providers=usable,
        )

    def _prepare_inputs(self, text: str, voice: str, speed: float) -> Dict[str, np.ndarray]:
        """Phonemize *text* and assemble the ONNX input feed for *voice*.

        Raises:
            ValueError: if *voice* is not in ``available_voices``.
        """
        if voice not in self.available_voices:
            raise ValueError(
                f"Voice '{voice}' not available. Choose from: {self.available_voices}"
            )

        # Phonemize, re-tokenize, then map characters to token IDs.
        phoneme_strings = self._phonemizer.phonemize([text])
        phoneme_text = " ".join(basic_english_tokenize(phoneme_strings[0]))
        token_ids = self._cleaner(phoneme_text)

        # Token 0 (pad) brackets the sequence, as in the original.
        token_ids = [0] + token_ids + [0]

        return {
            "input_ids": np.asarray([token_ids], dtype=np.int64),
            "style": self.voices[voice],
            "speed": np.asarray([speed], dtype=np.float32),
        }

    def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
        """Synthesize *text* and return a float32 waveform array."""
        feed = self._prepare_inputs(text, voice, speed)
        raw_outputs = self.session.run(None, feed)
        waveform = np.asarray(raw_outputs[0]).astype(np.float32)

        # Preserve original trimming while guarding short sequences.
        if waveform.size > 15000:
            waveform = waveform[5000:-10000]
        return waveform

    def generate_to_file(
        self,
        text: str,
        output_path: str,
        voice: str = "expr-voice-5-m",
        speed: float = 1.0,
        sample_rate: int = 24000,
    ) -> None:
        """Synthesize *text* and write the waveform to *output_path*."""
        sf.write(output_path, self.generate(text, voice, speed), sample_rate)
|
152 |
+
|
153 |
+
|
154 |
+
# ---------------------------
|
155 |
+
# HF download wrapper (consolidated)
|
156 |
+
# ---------------------------
|
157 |
+
|
158 |
+
class KittenTTS:
    """High-level wrapper that fetches model assets from Hugging Face."""

    def __init__(
        self,
        model_name: str = "KittenML/kitten-tts-nano-0.1",
        cache_dir: Optional[str] = None,
        providers: Optional[List[str]] = None,
    ) -> None:
        """Download model assets for *model_name* and build the ONNX backend.

        *model_name* may be a bare name or a full "org/name" repo id.
        """
        repo_id = model_name if "/" in model_name else f"KittenML/{model_name}"
        self._model = download_from_huggingface(
            repo_id=repo_id, cache_dir=cache_dir, providers=providers
        )

    def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
        """Synthesize *text* with *voice* at *speed*; returns a waveform array."""
        return self._model.generate(text, voice=voice, speed=speed)

    def generate_to_file(
        self,
        text: str,
        output_path: str,
        voice: str = "expr-voice-5-m",
        speed: float = 1.0,
        sample_rate: int = 24000,
    ) -> None:
        """Synthesize *text* and write the result to *output_path*."""
        return self._model.generate_to_file(
            text, output_path, voice=voice, speed=speed, sample_rate=sample_rate
        )

    @property
    def available_voices(self) -> List[str]:
        """Voice names accepted by generate()."""
        return self._model.available_voices
|
188 |
+
|
189 |
+
|
190 |
+
def download_from_huggingface(
    repo_id: str = "KittenML/kitten-tts-nano-0.1",
    cache_dir: Optional[str] = None,
    providers: Optional[List[str]] = None,
) -> KittenTTS_1_Onnx:
    """Fetch config, ONNX model, and voices from *repo_id*; build the model.

    Raises:
        ValueError: if config.json does not declare the "ONNX1" model type.
    """
    config_path = hf_hub_download(repo_id=repo_id, filename="config.json", cache_dir=cache_dir)
    with open(config_path, "r", encoding="utf-8") as fh:
        cfg = json.load(fh)

    if cfg.get("type") != "ONNX1":
        raise ValueError("Unsupported model type in config.json.")

    model_file = hf_hub_download(repo_id=repo_id, filename=cfg["model_file"], cache_dir=cache_dir)
    voices_file = hf_hub_download(repo_id=repo_id, filename=cfg["voices"], cache_dir=cache_dir)
    return KittenTTS_1_Onnx(model_path=model_file, voices_path=voices_file, providers=providers)
|
208 |
+
|
209 |
+
|
210 |
+
def get_model(repo_id: str = "KittenML/kitten-tts-nano-0.1", cache_dir: Optional[str] = None) -> KittenTTS:
    """Backward-compatible alias for constructing a KittenTTS wrapper."""
    return KittenTTS(model_name=repo_id, cache_dir=cache_dir)
|
213 |
+
|
214 |
+
|
215 |
+
# ---------------------------
# Gradio app
# ---------------------------

# Allow overriding model repo and providers via env on Spaces.
_MODEL_REPO = os.getenv("MODEL_REPO", "KittenML/kitten-tts-nano-0.1")

# Use CPU by default on Spaces; adjust if GPU EPs are available.
# Strip whitespace and drop empty entries so values such as
# "CUDAExecutionProvider, CPUExecutionProvider" parse correctly — a bare
# split(",") would leave " CPUExecutionProvider", which the downstream
# supported-provider filter would reject.
_DEFAULT_PROVIDERS = [
    p.strip()
    for p in os.getenv("ORT_PROVIDERS", "CPUExecutionProvider").split(",")
    if p.strip()
] or ["CPUExecutionProvider"]

# Single global instance for efficiency: the model is loaded once at import
# time and shared by all requests.
_TTS = KittenTTS(_MODEL_REPO, providers=_DEFAULT_PROVIDERS)
|
226 |
+
|
227 |
+
|
228 |
+
def _synthesize(text: str, voice: str, speed: float) -> Tuple[int, np.ndarray]:
    """Run TTS on *text* and return (sample_rate, waveform) for gr.Audio.

    Raises:
        gr.Error: if *text* is empty or whitespace-only.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text.")
    waveform = _TTS.generate(text, voice=voice, speed=speed)
    # Gradio expects (sample_rate, np.ndarray[float32]).
    return 24000, waveform.astype(np.float32, copy=False)
|
234 |
+
|
235 |
+
|
236 |
+
# UI layout: one text box, a voice/speed row, and an audio output wired to
# the synthesis callback.
with gr.Blocks(title="KittenTTS Nano") as demo:
    gr.Markdown("# KittenTTS Nano\nText-to-Speech using ONNX on CPU")

    with gr.Row():
        inp_text = gr.Textbox(
            label="Text",
            lines=6,
            placeholder='Type something like: "The quick brown fox jumps over the lazy dog."',
        )

    with gr.Row():
        # Prefer the package's default voice when the loaded voice file has it.
        _preferred = "expr-voice-5-m"
        voice = gr.Dropdown(
            label="Voice",
            choices=_TTS.available_voices,
            value=(
                _preferred
                if _preferred in _TTS.available_voices
                else _TTS.available_voices[0]
            ),
        )
        speed = gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1.0, label="Speed")

    out_audio = gr.Audio(label="Output Audio", type="numpy")
    btn = gr.Button("Generate")

    btn.click(_synthesize, inputs=[inp_text, voice, speed], outputs=out_audio)

    gr.Examples(
        examples=[
            ["Hello from KittenTTS Nano.", "expr-voice-5-m", 1.0],
            ["It begins with an Ugh. Another mysterious stain appears on a favorite shirt.", "expr-voice-2-f", 1.0],
        ],
        inputs=[inp_text, voice, speed],
    )

# Spaces will auto-run app.py. Local dev can still call launch().
if __name__ == "__main__":
    demo.launch()
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
espeak-ng
|
pyproject.toml
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[project]
|
2 |
+
name = "kittentts"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = "Generate natural speech from text on any CPU"
|
5 |
+
readme = "README.md"
|
6 |
+
requires-python = ">=3.13"
|
7 |
+
dependencies = [
|
8 |
+
"gradio>=5.43.1",
|
9 |
+
"huggingface-hub[hf-xet]>=0.34.4",
|
10 |
+
"numpy>=2.3.2",
|
11 |
+
"onnxruntime>=1.22.1",
|
12 |
+
"phonemizer>=3.3.0",
|
13 |
+
"soundfile>=0.13.1",
|
14 |
+
]
|
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|