Spaces: Running on Zero

Update app.py
Browse files

app.py CHANGED
@@ -28,6 +28,8 @@ from infer.utils_infer import (
     save_spectrogram,
 )
 from tokenizers import Tokenizer
+from phonemizer import phonemize
+
 from transformers import pipeline
 import click
 import soundfile as sf
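Note: phonemizer's espeak backend relies on the espeak-ng library being present on the system, so the new import alone is not enough. The Space presumably also declares the dependencies along these lines (an assumed setup, not shown in this commit):

    # packages.txt (system packages for the Space) -- assumed, not in this diff
    espeak-ng

    # requirements.txt -- assumed, not in this diff
    phonemizer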
@@ -131,7 +133,7 @@ F5TTS_ema_model = load_custom(
     "hf://Gregniuki/F5-tts_English_German_Polish/English/model_222600.pt", "", F5TTS_model_cfg
 )
 E2TTS_ema_model = load_custom(
-    "hf://Gregniuki/F5-tts_English_German_Polish/
+    "hf://Gregniuki/F5-tts_English_German_Polish/multi/model_300000.pt", "", F5TTS_model_cfg
 )
 E2TTS_ema_model2 = load_custom(
     "hf://Gregniuki/F5-tts_English_German_Polish/Polish/model_500000.pt", "", F5TTS_model_cfg
@@ -202,11 +204,24 @@ def chunk_text(text, max_chars):
     return chunks
 
 
-
+def text_to_ipa(text, language='en-gb'):
+    try:
+        ipa_text = phonemize(
+            text,
+            language=language,
+            backend='espeak',
+            strip=False,
+            preserve_punctuation=True,
+            with_stress=True
+        )
+        return ipa_text  # preserve_case(text, ipa_text)
+    except Exception as e:
+        print(f"Error processing text: {text}. Error: {e}")
+        return None
 
 
 @gpu_decorator
-def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, progress=gr.Progress()):
+def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, language='en-gb', progress=gr.Progress()):
     if exp_name == "English":
         ema_model = F5TTS_ema_model
     elif exp_name == "Polish":
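For reference, a minimal sketch of what the new text_to_ipa helper produces, runnable on its own once phonemizer and espeak-ng are installed (the sample string and the output shown in the comment are illustrative):

    from phonemizer import phonemize

    # Phonemize with the same options text_to_ipa uses above.
    ipa = phonemize(
        "Hello, world!",
        language='en-gb',
        backend='espeak',
        strip=False,
        preserve_punctuation=True,
        with_stress=True,
    )
    print(ipa)  # roughly: həlˈəʊ, wˈɜːld!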
@@ -247,7 +262,12 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
     for i, gen_text in enumerate(progress.tqdm(gen_text_batches)):
         # Prepare the text
-        text_list = [ref_text + gen_text]
+        ipa_text_ref = text_to_ipa(ref_text, language=language)
+        ipa_text_gen = text_to_ipa(gen_text, language=language)
+
+        # Tokenize the phonemized reference and generation text together
+        text_list = [ipa_text_ref + ipa_text_gen]
+
         encoding = tokenizer.encode(text_list)
         tokens = encoding.tokens
         text_list = ' '.join(map(str, tokens))
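One caveat: text_to_ipa returns None when phonemize raises, so the string concatenation above would then fail with a TypeError. A defensive variant, sketched here rather than taken from the commit, falls back to the raw text:

    # Sketch only: fall back to the unphonemized text if phonemization fails.
    ipa_text_ref = text_to_ipa(ref_text, language=language) or ref_text
    ipa_text_gen = text_to_ipa(gen_text, language=language) or gen_text
    text_list = [ipa_text_ref + ipa_text_gen]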
@@ -384,7 +404,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
     return (target_sample_rate, final_wave), spectrogram_path
 
 @gpu_decorator
-def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15  # Set the desired language code dynamically
+def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15, language='en-gb'  # Set the desired language code dynamically
 ):
 
     print(gen_text)
@@ -442,7 +462,7 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
         print(f'gen_text {i}', batch_text)
 
     gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
-    return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration)
+    return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration, language)
 
 
 @gpu_decorator
@@ -468,7 +488,7 @@ def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name
             continue  # Skip if the speaker is neither speaker1 nor speaker2
 
         # Generate audio for this block
-        audio, _ = infer(ref_audio, ref_text, text, exp_name, remove_silence)
+        audio, _ = infer(ref_audio, ref_text, text, exp_name, remove_silence, language=language)
 
         # Convert the generated audio to a numpy array
         sr, audio_data = audio
@@ -580,6 +600,7 @@ with gr.Blocks() as app_tts:
             model_choice,
             remove_silence,
             cross_fade_duration_slider,
+            gr.State('en-gb'),  # language code; a bare language='en-gb' keyword is not valid inside an inputs list
         ],
         outputs=[audio_output, spectrogram_output],
     )
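If the language is meant to be user-selectable rather than pinned to 'en-gb', the usual Gradio pattern would be a dropdown wired into the same inputs list (a sketch; language_dropdown is hypothetical and not part of this commit):

    # Hypothetical component, defined alongside the other app_tts controls.
    language_dropdown = gr.Dropdown(
        choices=['en-gb', 'en-us', 'de', 'pl'],
        value='en-gb',
        label="Phonemizer language",
    )
    # ...then list language_dropdown in inputs= instead of gr.State('en-gb').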
@@ -786,7 +807,7 @@ with gr.Blocks() as app_emotional:
             ref_text = speech_types[current_emotion].get('ref_text', '')
 
             # Generate speech for this segment
-            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence)
+            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, language=language)
             sr, audio_data = audio
 
             # generated_audio_segments.append(audio_data)
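End to end, the language code now flows from the UI inputs through infer into infer_batch and text_to_ipa. A hedged usage sketch of the updated infer entry point (the file path and texts are made up):

    # Illustrative call; assumes the signature as committed above.
    (sr, wave), spectrogram_path = infer(
        "ref.wav",                 # reference audio
        "Reference transcript.",   # ref_text
        "Tekst do syntezy.",       # gen_text
        "Polish",                  # exp_name picks the Polish checkpoint
        remove_silence=False,
        cross_fade_duration=0.15,
        language='pl',             # espeak voice used for phonemization
    )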