Update app.py
app.py CHANGED
@@ -223,7 +223,7 @@ def text_to_ipa(text, language=language):
 
 
 @gpu_decorator
-def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, progress=gr.Progress()
+def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, progress=gr.Progress()):
     if exp_name == "Multi":
         ema_model = F5TTS_ema_model
     # elif exp_name == "Polish":
@@ -263,6 +263,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
     punctuation_weights = {",": 0, ".": 0, " ": 0} # Add more punctuation as needed
     progress = tqdm(gen_text_batches)
     ipa_text_ref = text_to_ipa(ref_text, language=language)
+    print(language)
 
     for i, gen_text in enumerate(progress):
 
@@ -411,7 +412,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
     return (target_sample_rate, final_wave), spectrogram_path
 
 @gpu_decorator
-def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15
+def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15  # Set the desired language code dynamically
 ):
 
     print(gen_text)
@@ -469,7 +470,7 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
         print(f'gen_text {i}', batch_text)
 
     gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
-    return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration
+    return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration)
 
 
 @gpu_decorator
@@ -495,7 +496,7 @@ def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name
             continue # Skip if the speaker is neither speaker1 nor speaker2
 
         # Generate audio for this block
-        audio, _ = infer(ref_audio, ref_text, text, exp_name, remove_silence
+        audio, _ = infer(ref_audio, ref_text, text, exp_name, remove_silence)
 
         # Convert the generated audio to a numpy array
         sr, audio_data = audio
@@ -618,7 +619,7 @@ with gr.Blocks() as app_tts:
             model_choice,
             remove_silence,
             cross_fade_duration_slider,
-
+            # language_choice,
         ],
         outputs=[audio_output, spectrogram_output],
     )
@@ -824,7 +825,7 @@ with gr.Blocks() as app_emotional:
         ref_text = speech_types[current_emotion].get('ref_text', '')
 
         # Generate speech for this segment
-        audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence
+        audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence)
         sr, audio_data = audio
 
         # generated_audio_segments.append(audio_data)
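
Taken together, the hunks above mainly restore closing parentheses that had been dropped from the `infer_batch` definition and from the `infer`/`infer_batch` call sites, and add a `print(language)` debug line. As a minimal usage sketch, not part of the commit, the repaired `infer` entry point could be called as below; the file path and argument values are hypothetical, while the argument names and return shape are taken from the diff.

# Hypothetical usage sketch (not part of the commit). infer() is assumed to
# return ((sample_rate, waveform), spectrogram_path), matching the diff's
# "return (target_sample_rate, final_wave), spectrogram_path".
ref_audio = "reference.wav"                       # hypothetical reference clip
ref_text = "Transcript of the reference clip."
gen_text = "Text to synthesize in the cloned voice."

audio, spectrogram_path = infer(
    ref_audio,
    ref_text,
    gen_text,
    exp_name="Multi",          # model branch named in the diff
    remove_silence=False,
    cross_fade_duration=0.15,  # default from the signature
)
sr, audio_data = audio         # unpack as generate_podcast does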