ronedgecomb committed · verified
Commit eebc876 · 1 Parent(s): 3b35894

Initial commit

Files changed (7)
  1. .gitignore +10 -0
  2. .python-version +1 -0
  3. README.md +3 -1
  4. app.py +269 -0
  5. packages.txt +1 -0
  6. pyproject.toml +14 -0
  7. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,10 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
+ 3.13
README.md CHANGED
@@ -4,13 +4,15 @@ emoji: 😻
  colorFrom: red
  colorTo: pink
  sdk: gradio
- python_version: 3.12
+ python_version: 3.13
  sdk_version: 5.43.1
  suggested_hardware: cpu-basic
  app_file: app.py
  pinned: true
  license: apache-2.0
  short_description: Generate natural speech from text on any CPU
+ models:
+ - KittenML/kitten-tts-nano-0.2
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,269 @@
+ from __future__ import annotations
+
+ import json
+ import os
+ import re
+ from typing import Dict, List, Tuple, Optional
+
+ import numpy as np
+ import onnxruntime as ort
+ import phonemizer
+ import soundfile as sf
+ from huggingface_hub import hf_hub_download
+ import gradio as gr
+
+
+ # ---------------------------
+ # Utility: tokenization + cleaning
+ # ---------------------------
+
+ _TOKENIZER_RE = re.compile(r"\w+|[^\w\s]")
+
+
+ def basic_english_tokenize(text: str) -> List[str]:
+     """Simple whitespace + punctuation tokenizer."""
+     return _TOKENIZER_RE.findall(text)
+
+
+ class TextCleaner:
+     """Character-to-index mapper matching the original symbol inventory."""
+
+     def __init__(self) -> None:
+         _pad = "$"
+         _punctuation = ';:,.!?¡¿—…"«»"" '
+         _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+         _letters_ipa = (
+             "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+         )
+         symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+         self._dict: Dict[str, int] = {ch: i for i, ch in enumerate(symbols)}
+
+     def __call__(self, text: str) -> List[int]:
+         # Unknown chars are dropped to mirror original behavior.
+         return [self._dict[c] for c in text if c in self._dict]
+
+
+ # ---------------------------
+ # Core model
+ # ---------------------------
+
+ class KittenTTS_1_Onnx:
+     """
+     ONNX-based KittenTTS inference.
+
+     Matches the original interface:
+       - generate(text, voice, speed) -> np.ndarray
+       - generate_to_file(...)
+     """
+
+     # Original voice set kept for compatibility.
+     _DEFAULT_VOICES = [
+         "expr-voice-2-m", "expr-voice-2-f",
+         "expr-voice-3-m", "expr-voice-3-f",
+         "expr-voice-4-m", "expr-voice-4-f",
+         "expr-voice-5-m", "expr-voice-5-f",
+     ]
+
+     def __init__(
+         self,
+         model_path: str = "kitten_tts_nano_preview.onnx",
+         voices_path: str = "voices.npz",
+         providers: Optional[List[str]] = None,
+     ) -> None:
+         self.model_path = model_path
+         self.voices = np.load(voices_path)
+         self._phonemizer = phonemizer.backend.EspeakBackend(
+             language="en-us", preserve_punctuation=True, with_stress=True
+         )
+         self._cleaner = TextCleaner()
+
+         # Derive available voices from file when possible, else fall back to defaults.
+         try:
+             files = list(getattr(self.voices, "files", []))
+         except Exception:
+             files = []
+         self.available_voices: List[str] = (
+             [v for v in self._DEFAULT_VOICES if v in files] or (files or self._DEFAULT_VOICES)
+         )
+
+         # ONNX Runtime session with aggressive graph optimizations.
+         sess_opt = ort.SessionOptions()
+         sess_opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+         # Respect ORT thread env vars when present. Otherwise leave defaults.
+         # This avoids over-constraining environments like Spaces.
+         # providers selection
+         chosen_providers = (
+             providers
+             if providers
+             else ["CPUExecutionProvider"]
+         )
+         # Keep only supported providers to avoid runtime errors.
+         supported = set(ort.get_available_providers())
+         chosen_providers = [p for p in chosen_providers if p in supported] or list(supported)
+
+         self.session = ort.InferenceSession(
+             self.model_path,
+             sess_options=sess_opt,
+             providers=chosen_providers,
+         )
+
+     def _prepare_inputs(self, text: str, voice: str, speed: float) -> Dict[str, np.ndarray]:
+         if voice not in self.available_voices:
+             raise ValueError(
+                 f"Voice '{voice}' not available. Choose from: {self.available_voices}"
+             )
+
+         # Phonemize then map to token IDs.
+         phonemes_list = self._phonemizer.phonemize([text])
+         phonemes = " ".join(basic_english_tokenize(phonemes_list[0]))
+         tokens = self._cleaner(phonemes)
+
+         # Start/end tokens as in the original.
+         tokens.insert(0, 0)
+         tokens.append(0)
+
+         input_ids = np.asarray([tokens], dtype=np.int64)
+         style_vec = self.voices[voice]
+         speed_arr = np.asarray([speed], dtype=np.float32)
+
+         return {"input_ids": input_ids, "style": style_vec, "speed": speed_arr}
+
+     def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
+         onnx_inputs = self._prepare_inputs(text, voice, speed)
+         outputs = self.session.run(None, onnx_inputs)
+         audio = np.asarray(outputs[0]).astype(np.float32)
+
+         # Preserve original trimming while guarding short sequences.
+         if audio.size > 15000:
+             audio = audio[5000:-10000]
+         return audio
+
+     def generate_to_file(
+         self,
+         text: str,
+         output_path: str,
+         voice: str = "expr-voice-5-m",
+         speed: float = 1.0,
+         sample_rate: int = 24000,
+     ) -> None:
+         audio = self.generate(text, voice, speed)
+         sf.write(output_path, audio, sample_rate)
+
+
+ # ---------------------------
+ # HF download wrapper (consolidated)
+ # ---------------------------
+
+ class KittenTTS:
+     """High-level wrapper that fetches model assets from Hugging Face."""
+
+     def __init__(
+         self,
+         model_name: str = "KittenML/kitten-tts-nano-0.1",
+         cache_dir: Optional[str] = None,
+         providers: Optional[List[str]] = None,
+     ) -> None:
+         repo_id = model_name if "/" in model_name else f"KittenML/{model_name}"
+         self._model = download_from_huggingface(repo_id=repo_id, cache_dir=cache_dir, providers=providers)
+
+     def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
+         return self._model.generate(text, voice=voice, speed=speed)
+
+     def generate_to_file(
+         self,
+         text: str,
+         output_path: str,
+         voice: str = "expr-voice-5-m",
+         speed: float = 1.0,
+         sample_rate: int = 24000,
+     ) -> None:
+         return self._model.generate_to_file(
+             text, output_path, voice=voice, speed=speed, sample_rate=sample_rate
+         )
+
+     @property
+     def available_voices(self) -> List[str]:
+         return self._model.available_voices
+
+
+ def download_from_huggingface(
+     repo_id: str = "KittenML/kitten-tts-nano-0.1",
+     cache_dir: Optional[str] = None,
+     providers: Optional[List[str]] = None,
+ ) -> KittenTTS_1_Onnx:
+     """
+     Download config, model, and voices. Instantiate ONNX model.
+     """
+     config_path = hf_hub_download(repo_id=repo_id, filename="config.json", cache_dir=cache_dir)
+     with open(config_path, "r", encoding="utf-8") as f:
+         config = json.load(f)
+
+     if config.get("type") != "ONNX1":
+         raise ValueError("Unsupported model type in config.json.")
+
+     model_path = hf_hub_download(repo_id=repo_id, filename=config["model_file"], cache_dir=cache_dir)
+     voices_path = hf_hub_download(repo_id=repo_id, filename=config["voices"], cache_dir=cache_dir)
+     return KittenTTS_1_Onnx(model_path=model_path, voices_path=voices_path, providers=providers)
+
+
+ def get_model(repo_id: str = "KittenML/kitten-tts-nano-0.1", cache_dir: Optional[str] = None) -> KittenTTS:
+     """Backward-compatible alias."""
+     return KittenTTS(repo_id, cache_dir)
+
+
+ # ---------------------------
+ # Gradio app
+ # ---------------------------
+
+ # Allow overriding model repo and providers via env on Spaces.
+ _MODEL_REPO = os.getenv("MODEL_REPO", "KittenML/kitten-tts-nano-0.1")
+ # Use CPU by default on Spaces; adjust if GPU EPs are available.
+ _DEFAULT_PROVIDERS = os.getenv("ORT_PROVIDERS", "CPUExecutionProvider").split(",")
+
+ # Single global instance for efficiency.
+ _TTS = KittenTTS(_MODEL_REPO, providers=_DEFAULT_PROVIDERS)
+
+
+ def _synthesize(text: str, voice: str, speed: float) -> Tuple[int, np.ndarray]:
+     if not text or not text.strip():
+         raise gr.Error("Please enter text.")
+     audio = _TTS.generate(text, voice=voice, speed=speed)
+     # Gradio expects (sample_rate, np.ndarray[float32])
+     return 24000, audio.astype(np.float32, copy=False)
+
+
+ with gr.Blocks(title="KittenTTS Nano") as demo:
+     gr.Markdown("# KittenTTS Nano\nText-to-Speech using ONNX on CPU")
+
+     with gr.Row():
+         inp_text = gr.Textbox(
+             label="Text",
+             lines=6,
+             placeholder='Type something like: "The quick brown fox jumps over the lazy dog."',
+         )
+
+     with gr.Row():
+         voice = gr.Dropdown(
+             label="Voice",
+             choices=_TTS.available_voices,
+             value="expr-voice-5-m" if "expr-voice-5-m" in _TTS.available_voices else _TTS.available_voices[0],
+         )
+         speed = gr.Slider(minimum=0.5, maximum=1.5, step=0.05, value=1.0, label="Speed")
+
+     out_audio = gr.Audio(label="Output Audio", type="numpy")
+     btn = gr.Button("Generate")
+
+     btn.click(_synthesize, inputs=[inp_text, voice, speed], outputs=out_audio)
+
+     gr.Examples(
+         examples=[
+             ["Hello from KittenTTS Nano.", "expr-voice-5-m", 1.0],
+             ["It begins with an Ugh. Another mysterious stain appears on a favorite shirt.", "expr-voice-2-f", 1.0],
+         ],
+         inputs=[inp_text, voice, speed],
+     )
+
+ # Spaces will auto-run app.py. Local dev can still call launch().
+ if __name__ == "__main__":
+     demo.launch()
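
For quick reference, app.py can also be driven as a small library outside the Gradio UI. The sketch below is not part of the commit; it is a minimal usage example assuming espeak-ng is installed (see packages.txt) and the default repo KittenML/kitten-tts-nano-0.1 is reachable. Note that importing app also builds the module-level _TTS instance and the Gradio demo (without launching it).

    # Hypothetical local usage sketch (not part of this commit).
    from app import KittenTTS

    tts = KittenTTS()  # downloads config.json, the ONNX model, and voices.npz from the Hub
    print(tts.available_voices)  # e.g. ['expr-voice-2-m', 'expr-voice-2-f', ...]
    tts.generate_to_file(
        "Hello from KittenTTS Nano.",
        "hello.wav",
        voice="expr-voice-5-m",
        speed=1.0,
    )  # writes a 24 kHz waveform via soundfile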
packages.txt ADDED
@@ -0,0 +1 @@
+ espeak-ng
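
packages.txt installs espeak-ng as a system package because phonemizer's EspeakBackend, used in app.py, depends on the native espeak-ng library rather than a pure-Python grapheme-to-phoneme model. A small sanity check (not part of the commit) to confirm the backend is usable on a fresh machine might look like this:

    # Hypothetical check that espeak-ng is visible to phonemizer (not part of this commit).
    from phonemizer.backend import EspeakBackend

    assert EspeakBackend.is_available(), "espeak-ng library not found"
    backend = EspeakBackend(language="en-us", preserve_punctuation=True, with_stress=True)
    print(backend.phonemize(["Hello world."]))  # IPA output such as 'həlˈoʊ wˈɜːld. ' (may vary by version)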
pyproject.toml ADDED
@@ -0,0 +1,14 @@
+ [project]
+ name = "kittentts"
+ version = "0.1.0"
+ description = "Generate natural speech from text on any CPU"
+ readme = "README.md"
+ requires-python = ">=3.13"
+ dependencies = [
+     "gradio>=5.43.1",
+     "huggingface-hub[hf-xet]>=0.34.4",
+     "numpy>=2.3.2",
+     "onnxruntime>=1.22.1",
+     "phonemizer>=3.3.0",
+     "soundfile>=0.13.1",
+ ]
uv.lock ADDED
The diff for this file is too large to render.