Spaces:
Running
Running
more fix
Browse files
app.py
CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
3 |
import json
|
4 |
import os
|
5 |
import re
|
6 |
-
from typing import Dict,
|
7 |
|
8 |
import gradio as gr
|
9 |
import numpy as np
|
@@ -108,6 +108,7 @@ class KittenTTS_1_Onnx:
|
|
108 |
providers=chosen_providers,
|
109 |
)
|
110 |
|
|
|
111 |
self.max_seq_len = self._infer_max_seq_len() or int(os.getenv("KITTEN_MAX_SEQ_LEN", "512"))
|
112 |
# reserve 2 slots for BOS/EOS tokens inserted below
|
113 |
self._chunk_budget = max(1, self.max_seq_len - 2)
|
@@ -141,7 +142,7 @@ class KittenTTS_1_Onnx:
|
|
141 |
Falls back to env var or 512 if unavailable. Optional dependency on 'onnx'.
|
142 |
"""
|
143 |
try:
|
144 |
-
import onnx
|
145 |
except Exception:
|
146 |
return None
|
147 |
try:
|
@@ -191,17 +192,29 @@ class KittenTTS_1_Onnx:
|
|
191 |
yield ids
|
192 |
i = j
|
193 |
|
194 |
-
def generate(
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
outputs = self.session.run(None, onnx_inputs)
|
199 |
-
audio = np.asarray(outputs[0]).astype(np.float32)
|
200 |
|
201 |
-
#
|
202 |
-
|
203 |
-
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
|
206 |
def generate_to_file(
|
207 |
self,
|
@@ -234,30 +247,10 @@ class KittenTTS:
|
|
234 |
repo_id=repo_id, cache_dir=cache_dir, providers=providers
|
235 |
)
|
236 |
|
237 |
-
def generate(
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
# Phonemize once, then either run single-shot or chunked
|
243 |
-
clean = self._phonemize_to_clean(text)
|
244 |
-
|
245 |
-
# Fast path: fits in one pass
|
246 |
-
if len(clean) + 2 <= self.max_seq_len:
|
247 |
-
ids = self._cleaner(clean)
|
248 |
-
ids.insert(0, 0) # BOS
|
249 |
-
ids.append(0) # EOS
|
250 |
-
return self._run_onnx(ids, voice, speed)
|
251 |
-
|
252 |
-
# Chunked path: concatenate per-chunk audio
|
253 |
-
pieces: List[np.ndarray] = []
|
254 |
-
for ids in self._chunk_token_ids(clean):
|
255 |
-
pieces.append(self._run_onnx(ids, voice, speed))
|
256 |
-
|
257 |
-
if not pieces:
|
258 |
-
return np.array([], dtype=np.float32)
|
259 |
-
return pieces[0] if len(pieces) == 1 else np.concatenate(pieces)
|
260 |
-
|
261 |
|
262 |
def generate_to_file(
|
263 |
self,
|
|
|
3 |
import json
|
4 |
import os
|
5 |
import re
|
6 |
+
from typing import Dict, Iterator, List, Optional, Tuple
|
7 |
|
8 |
import gradio as gr
|
9 |
import numpy as np
|
|
|
108 |
providers=chosen_providers,
|
109 |
)
|
110 |
|
111 |
+
# --- add: max-length detection and per-chunk budget ---
|
112 |
self.max_seq_len = self._infer_max_seq_len() or int(os.getenv("KITTEN_MAX_SEQ_LEN", "512"))
|
113 |
# reserve 2 slots for BOS/EOS tokens inserted below
|
114 |
self._chunk_budget = max(1, self.max_seq_len - 2)
|
|
|
142 |
Falls back to env var or 512 if unavailable. Optional dependency on 'onnx'.
|
143 |
"""
|
144 |
try:
|
145 |
+
import onnx # optional
|
146 |
except Exception:
|
147 |
return None
|
148 |
try:
|
|
|
192 |
yield ids
|
193 |
i = j
|
194 |
|
195 |
+
def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
|
196 |
+
"""Synthesize speech with automatic chunking at the model's max length."""
|
197 |
+
if voice not in self.available_voices:
|
198 |
+
raise ValueError(f"Voice '{voice}' not available. Choose from: {self.available_voices}")
|
|
|
|
|
199 |
|
200 |
+
# Phonemize once, then either run single-shot or chunked
|
201 |
+
clean = self._phonemize_to_clean(text)
|
202 |
+
|
203 |
+
# Fast path: fits in one pass
|
204 |
+
if len(clean) + 2 <= self.max_seq_len:
|
205 |
+
ids = self._cleaner(clean)
|
206 |
+
ids.insert(0, 0) # BOS
|
207 |
+
ids.append(0) # EOS
|
208 |
+
return self._run_onnx(ids, voice, speed)
|
209 |
+
|
210 |
+
# Chunked path: concatenate per-chunk audio
|
211 |
+
pieces: List[np.ndarray] = []
|
212 |
+
for ids in self._chunk_token_ids(clean):
|
213 |
+
pieces.append(self._run_onnx(ids, voice, speed))
|
214 |
+
|
215 |
+
if not pieces:
|
216 |
+
return np.array([], dtype=np.float32)
|
217 |
+
return pieces[0] if len(pieces) == 1 else np.concatenate(pieces)
|
218 |
|
219 |
def generate_to_file(
|
220 |
self,
|
|
|
247 |
repo_id=repo_id, cache_dir=cache_dir, providers=providers
|
248 |
)
|
249 |
|
250 |
+
def generate(
|
251 |
+
self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0
|
252 |
+
) -> np.ndarray:
|
253 |
+
return self._model.generate(text, voice=voice, speed=speed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
254 |
|
255 |
def generate_to_file(
|
256 |
self,
|