ronedgecomb committed on
Commit 4934471 · verified · 1 Parent(s): 1fb7c23
Files changed (1)
  1. app.py +29 -36
app.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 import json
 import os
 import re
-from typing import Dict, List, Tuple, Optional, Iterator
+from typing import Dict, Iterator, List, Optional, Tuple

 import gradio as gr
 import numpy as np
@@ -108,6 +108,7 @@ class KittenTTS_1_Onnx:
             providers=chosen_providers,
         )

+        # --- add: max-length detection and per-chunk budget ---
         self.max_seq_len = self._infer_max_seq_len() or int(os.getenv("KITTEN_MAX_SEQ_LEN", "512"))
         # reserve 2 slots for BOS/EOS tokens inserted below
         self._chunk_budget = max(1, self.max_seq_len - 2)
@@ -141,7 +142,7 @@ class KittenTTS_1_Onnx:
         Falls back to env var or 512 if unavailable. Optional dependency on 'onnx'.
         """
         try:
-            import onnx  # optional
+            import onnx  # optional
         except Exception:
             return None
         try:
@@ -191,17 +192,29 @@ class KittenTTS_1_Onnx:
             yield ids
             i = j

-    def generate(
-        self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0
-    ) -> np.ndarray:
-        onnx_inputs = self._prepare_inputs(text, voice, speed)
-        outputs = self.session.run(None, onnx_inputs)
-        audio = np.asarray(outputs[0]).astype(np.float32)
-
-        # Preserve original trimming while guarding short sequences.
-        if audio.size > 15000:
-            audio = audio[5000:-10000]
-        return audio
+    def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
+        """Synthesize speech with automatic chunking at the model's max length."""
+        if voice not in self.available_voices:
+            raise ValueError(f"Voice '{voice}' not available. Choose from: {self.available_voices}")
+
+        # Phonemize once, then either run single-shot or chunked
+        clean = self._phonemize_to_clean(text)
+
+        # Fast path: fits in one pass
+        if len(clean) + 2 <= self.max_seq_len:
+            ids = self._cleaner(clean)
+            ids.insert(0, 0)  # BOS
+            ids.append(0)  # EOS
+            return self._run_onnx(ids, voice, speed)
+
+        # Chunked path: concatenate per-chunk audio
+        pieces: List[np.ndarray] = []
+        for ids in self._chunk_token_ids(clean):
+            pieces.append(self._run_onnx(ids, voice, speed))
+
+        if not pieces:
+            return np.array([], dtype=np.float32)
+        return pieces[0] if len(pieces) == 1 else np.concatenate(pieces)

     def generate_to_file(
         self,
@@ -234,30 +247,10 @@ class KittenTTS:
             repo_id=repo_id, cache_dir=cache_dir, providers=providers
         )

-    def generate(self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
-        """Synthesize speech with automatic chunking at the model's max length."""
-        if voice not in self.available_voices:
-            raise ValueError(f"Voice '{voice}' not available. Choose from: {self.available_voices}")
-
-        # Phonemize once, then either run single-shot or chunked
-        clean = self._phonemize_to_clean(text)
-
-        # Fast path: fits in one pass
-        if len(clean) + 2 <= self.max_seq_len:
-            ids = self._cleaner(clean)
-            ids.insert(0, 0)  # BOS
-            ids.append(0)  # EOS
-            return self._run_onnx(ids, voice, speed)
-
-        # Chunked path: concatenate per-chunk audio
-        pieces: List[np.ndarray] = []
-        for ids in self._chunk_token_ids(clean):
-            pieces.append(self._run_onnx(ids, voice, speed))
-
-        if not pieces:
-            return np.array([], dtype=np.float32)
-        return pieces[0] if len(pieces) == 1 else np.concatenate(pieces)
-
+    def generate(
+        self, text: str, voice: str = "expr-voice-5-m", speed: float = 1.0
+    ) -> np.ndarray:
+        return self._model.generate(text, voice=voice, speed=speed)

     def generate_to_file(
         self,
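
For orientation, the minimal sketch below restates the chunking scheme that the new KittenTTS_1_Onnx.generate implements in this commit: token ids are split into windows of at most max_seq_len - 2, each window is book-ended with the BOS/EOS id 0, every window is run through the model, and the per-chunk audio is concatenated. The run_model stub, the hard-coded MAX_SEQ_LEN, and the synthetic token ids are illustrative assumptions that stand in for the private helpers shown in the diff (_cleaner, _chunk_token_ids, _run_onnx); where exactly the BOS/EOS ids are inserted in the chunked path is also assumed here. This is not code from the repo.

import numpy as np
from typing import Iterator, List

MAX_SEQ_LEN = 512                       # the diff falls back to KITTEN_MAX_SEQ_LEN or 512
CHUNK_BUDGET = max(1, MAX_SEQ_LEN - 2)  # reserve 2 slots for the BOS/EOS ids

def chunk_token_ids(ids: List[int], budget: int = CHUNK_BUDGET) -> Iterator[List[int]]:
    """Yield consecutive windows of at most `budget` token ids."""
    for start in range(0, len(ids), budget):
        yield ids[start:start + budget]

def run_model(ids: List[int], voice: str, speed: float) -> np.ndarray:
    """Stand-in for the ONNX session call; returns silence sized to the input."""
    return np.zeros(len(ids) * 256, dtype=np.float32)

def generate(ids: List[int], voice: str = "expr-voice-5-m", speed: float = 1.0) -> np.ndarray:
    # Fast path: the whole sequence plus BOS/EOS fits in a single pass.
    if len(ids) + 2 <= MAX_SEQ_LEN:
        return run_model([0, *ids, 0], voice, speed)
    # Chunked path: run each window separately, then stitch the audio together.
    pieces = [run_model([0, *chunk, 0], voice, speed) for chunk in chunk_token_ids(ids)]
    return np.concatenate(pieces) if pieces else np.array([], dtype=np.float32)

audio = generate(list(range(1200)))  # 1200 ids exceed one 510-id window, so three chunks are stitched
print(audio.shape, audio.dtype)

Reserving two slots per window keeps every model call at or below the detected max_seq_len even after the BOS/EOS ids are added, which is what the _chunk_budget attribute introduced in this commit accounts for.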