Gregniuki committed on
Commit
bf3a197
·
verified ·
1 Parent(s): ecc5628

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -8
app.py CHANGED
@@ -28,6 +28,8 @@ from infer.utils_infer import (
28
  save_spectrogram,
29
  )
30
  from tokenizers import Tokenizer
 
 
31
  from transformers import pipeline
32
  import click
33
  import soundfile as sf
@@ -131,7 +133,7 @@ F5TTS_ema_model = load_custom(
131
  "hf://Gregniuki/F5-tts_English_German_Polish/English/model_222600.pt", "", F5TTS_model_cfg
132
  )
133
  E2TTS_ema_model = load_custom(
134
- "hf://Gregniuki/F5-tts_English_German_Polish/Polish2/model_1200000.pt", "", F5TTS_model_cfg
135
  )
136
  E2TTS_ema_model2 = load_custom(
137
  "hf://Gregniuki/F5-tts_English_German_Polish/Polish/model_500000.pt", "", F5TTS_model_cfg
@@ -202,11 +204,24 @@ def chunk_text(text, max_chars):
202
  return chunks
203
 
204
 
205
-
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
 
208
  @gpu_decorator
209
- def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, progress=gr.Progress()):
210
  if exp_name == "English":
211
  ema_model = F5TTS_ema_model
212
  elif exp_name == "Polish":
@@ -247,7 +262,12 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
247
 
248
  for i, gen_text in enumerate(progress.tqdm(gen_text_batches)):
249
  # Prepare the text
250
- text_list = [ref_text + gen_text]
 
 
 
 
 
251
  encoding = tokenizer.encode(text_list)
252
  tokens = encoding.tokens
253
  text_list = ' '.join(map(str, tokens))
@@ -384,7 +404,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
384
  return (target_sample_rate, final_wave), spectrogram_path
385
 
386
  @gpu_decorator
387
- def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15 # Set the desired language code dynamically
388
  ):
389
 
390
  print(gen_text)
@@ -442,7 +462,7 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
442
  print(f'gen_text {i}', batch_text)
443
 
444
  gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
445
- return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration)
446
 
447
 
448
  @gpu_decorator
@@ -468,7 +488,7 @@ def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name
468
  continue # Skip if the speaker is neither speaker1 nor speaker2
469
 
470
  # Generate audio for this block
471
- audio, _ = infer(ref_audio, ref_text, text, exp_name, remove_silence)
472
 
473
  # Convert the generated audio to a numpy array
474
  sr, audio_data = audio
@@ -580,6 +600,7 @@ with gr.Blocks() as app_tts:
580
  model_choice,
581
  remove_silence,
582
  cross_fade_duration_slider,
 
583
  ],
584
  outputs=[audio_output, spectrogram_output],
585
  )
@@ -786,7 +807,7 @@ with gr.Blocks() as app_emotional:
786
  ref_text = speech_types[current_emotion].get('ref_text', '')
787
 
788
  # Generate speech for this segment
789
- audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence)
790
  sr, audio_data = audio
791
 
792
  # generated_audio_segments.append(audio_data)
 
28
  save_spectrogram,
29
  )
30
  from tokenizers import Tokenizer
31
+ from phonemizer import phonemize
32
+
33
  from transformers import pipeline
34
  import click
35
  import soundfile as sf
 
133
  "hf://Gregniuki/F5-tts_English_German_Polish/English/model_222600.pt", "", F5TTS_model_cfg
134
  )
135
  E2TTS_ema_model = load_custom(
136
+ "hf://Gregniuki/F5-tts_English_German_Polish/multi/model_300000.pt", "", F5TTS_model_cfg
137
  )
138
  E2TTS_ema_model2 = load_custom(
139
  "hf://Gregniuki/F5-tts_English_German_Polish/Polish/model_500000.pt", "", F5TTS_model_cfg
 
204
  return chunks
205
 
206
 
def text_to_ipa(text, language='en-gb'):
    """Phonemize *text* into IPA using the espeak backend.

    Punctuation and stress marks are kept so the tokenizer sees the full
    prosodic information.  On any phonemization failure the error is
    printed and None is returned (callers must handle the None).
    """
    try:
        result = phonemize(
            text,
            language=language,
            backend='espeak',
            strip=False,
            preserve_punctuation=True,
            with_stress=True,
        )
    except Exception as e:
        # Best-effort: report and signal failure instead of raising.
        print(f"Error processing text: {text}. Error: {e}")
        return None
    return result  # preserve_case(text, result) intentionally not applied
 
222
 
223
  @gpu_decorator
224
+ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, language='en-gb', progress=gr.Progress()):
225
  if exp_name == "English":
226
  ema_model = F5TTS_ema_model
227
  elif exp_name == "Polish":
 
262
 
263
  for i, gen_text in enumerate(progress.tqdm(gen_text_batches)):
264
  # Prepare the text
265
+ ipa_text_ref = text_to_ipa(ref_text, language=language)
266
+ ipa_text_gen = text_to_ipa(gen_text, language=language)
267
+
268
+
269
+ text_list = [ipa_text_ref + ipa_text_gen]
270
+
271
  encoding = tokenizer.encode(text_list)
272
  tokens = encoding.tokens
273
  text_list = ' '.join(map(str, tokens))
 
404
  return (target_sample_rate, final_wave), spectrogram_path
405
 
406
  @gpu_decorator
407
+ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15, language='en-gb' # Set the desired language code dynamically
408
  ):
409
 
410
  print(gen_text)
 
462
  print(f'gen_text {i}', batch_text)
463
 
464
  gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
465
+ return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration, language)
466
 
467
 
468
  @gpu_decorator
 
488
  continue # Skip if the speaker is neither speaker1 nor speaker2
489
 
490
  # Generate audio for this block
491
+ audio, _ = infer(ref_audio, ref_text, text, exp_name, remove_silence, language=language)
492
 
493
  # Convert the generated audio to a numpy array
494
  sr, audio_data = audio
 
600
  model_choice,
601
  remove_silence,
602
  cross_fade_duration_slider,
603
+ gr.State('en-gb'),  # constant language value passed to infer; a keyword argument is invalid inside the inputs list
604
  ],
605
  outputs=[audio_output, spectrogram_output],
606
  )
 
807
  ref_text = speech_types[current_emotion].get('ref_text', '')
808
 
809
  # Generate speech for this segment
810
+ audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, language=language)
811
  sr, audio_data = audio
812
 
813
  # generated_audio_segments.append(audio_data)