Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -182,10 +182,12 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
|
|
| 182 |
zh_pause_punc = r"。,、;:?!"
|
| 183 |
ref_text_len = len(ref_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, ref_text))
|
| 184 |
gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
|
|
|
|
|
|
| 189 |
|
| 190 |
# inference
|
| 191 |
with torch.inference_mode():
|
|
@@ -738,9 +740,9 @@ This is a local web UI for F5 TTS with advanced batch processing support. This a
|
|
| 738 |
|
| 739 |
The checkpoint supports Polish, English, and German.
|
| 740 |
|
| 741 |
-
Generations using CPU takes usually 2-3 minutes
|
| 742 |
|
| 743 |
-
If you're having issues, try converting your reference audio to WAV or MP3, clipping it to
|
| 744 |
|
| 745 |
**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
|
| 746 |
"""
|
|
|
|
| 182 |
zh_pause_punc = r"。,、;:?!"
|
| 183 |
ref_text_len = len(ref_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, ref_text))
|
| 184 |
gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
|
| 185 |
+
if len(ref_text) >= 1:
|
| 186 |
+
duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
|
| 187 |
+
print(f"Duration: {duration} seconds")
|
| 188 |
+
else:
|
| 189 |
+
duration = min(5000, max(300, int(133 * gen_text_len / (speed * 10))))
|
| 190 |
+
print(f"Duration: {duration} seconds")
|
| 191 |
|
| 192 |
# inference
|
| 193 |
with torch.inference_mode():
|
|
|
|
| 740 |
|
| 741 |
The checkpoint supports Polish, English, and German.
|
| 742 |
|
| 743 |
+
Generation on CPU usually takes 2-3 minutes using 8-step inference.
|
| 744 |
|
| 745 |
+
If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 5s, and shortening your prompt.
|
| 746 |
|
| 747 |
**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
|
| 748 |
"""
|