Spaces:

Gregniuki
/

f5-tts_Polish_English_German

Running on Zero

App Files Community

Gregniuki committed on Nov 25, 2024

Commit

b8a6f24

verified ·

1 Parent(s): 0e7339b

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -16

app.py CHANGED Viewed

@@ -37,37 +37,34 @@ device = (
     else "mps" if torch.backends.mps.is_available() else "cpu"
 )
-print(f"Using {device} device")
 print(f"Using {device} device")
-# Use smaller, CPU-friendly models
 pipe = pipeline(
     "automatic-speech-recognition",
-    model="openai/whisper-base",  # Smaller model for CPU
     torch_dtype=torch.float16,
     device=device,
 )
 vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
-# Reduce computational intensity
 target_sample_rate = 24000
-n_mel_channels = 100  # Reduced from 100
-hop_length = 256  # Increased from 256
 target_rms = 0.1
-nfe_step = 8  # Reduced from 8
-cfg_strength = 2  # Reduced from 2.0
 ode_method = "euler"
-sway_sampling_coef = -1
 speed = 1
 fix_duration = None
 def load_model(page_name, repo_name, exp_name, model_cls, model_cfg, ckpt_step):
     ckpt_path = str(cached_path(f"hf://{page_name}/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
     vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
-    # Reduce model complexity for CPU
     model = CFM(
         transformer=model_cls(
             **model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels
@@ -83,15 +80,16 @@ def load_model(page_name, repo_name, exp_name, model_cls, model_cfg, ckpt_step):
         vocab_char_map=vocab_char_map,
     ).to(device)
-    model = load_checkpoint(model, ckpt_path, device, use_ema=True)
     return model
-# Reduced model configurations
 F5TTS_model_cfg = dict(
     dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
 )
-E2TTS_model_cfg = dict(dim=1024, depth=12, heads=8, ff_mult=2)
 F5TTS_ema_model = load_model(
     "Gregniuki", "F5-tts_English_German_Polish", "English", DiT, F5TTS_model_cfg, 222600

     else "mps" if torch.backends.mps.is_available() else "cpu"
 )
 print(f"Using {device} device")
 pipe = pipeline(
     "automatic-speech-recognition",
+    model="openai/whisper-large-v3-turbo",
     torch_dtype=torch.float16,
     device=device,
 )
 vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
+# --------------------- Settings -------------------- #
 target_sample_rate = 24000
+n_mel_channels = 100
+hop_length = 256
 target_rms = 0.1
+nfe_step = 8  # 16, 32
+cfg_strength = 2.0
 ode_method = "euler"
+sway_sampling_coef = -1.0
 speed = 1
 fix_duration = None
 def load_model(page_name, repo_name, exp_name, model_cls, model_cfg, ckpt_step):
     ckpt_path = str(cached_path(f"hf://{page_name}/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
+    # ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt"  # .pt | .safetensors
     vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
     model = CFM(
         transformer=model_cls(
             **model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels
         vocab_char_map=vocab_char_map,
     ).to(device)
+    model = load_checkpoint(model, ckpt_path, device, use_ema = True)
     return model
+# load models
 F5TTS_model_cfg = dict(
     dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
 )
+E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
 F5TTS_ema_model = load_model(
     "Gregniuki", "F5-tts_English_German_Polish", "English", DiT, F5TTS_model_cfg, 222600