Gregniuki committed on
Commit
b8a6f24
·
verified ·
1 Parent(s): 0e7339b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -16
app.py CHANGED
@@ -37,37 +37,34 @@ device = (
37
  else "mps" if torch.backends.mps.is_available() else "cpu"
38
  )
39
 
40
- print(f"Using {device} device")
41
  print(f"Using {device} device")
42
 
43
- # Use smaller, CPU-friendly models
44
  pipe = pipeline(
45
  "automatic-speech-recognition",
46
- model="openai/whisper-base", # Smaller model for CPU
47
  torch_dtype=torch.float16,
48
  device=device,
49
  )
50
  vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
51
 
52
- # Reduce computational intensity
 
53
  target_sample_rate = 24000
54
- n_mel_channels = 100 # Reduced from 100
55
- hop_length = 256 # Increased from 256
56
  target_rms = 0.1
57
- nfe_step = 8 # Reduced from 8
58
- cfg_strength = 2 # Reduced from 2.0
59
  ode_method = "euler"
60
- sway_sampling_coef = -1
61
  speed = 1
62
  fix_duration = None
63
 
 
64
  def load_model(page_name, repo_name, exp_name, model_cls, model_cfg, ckpt_step):
65
  ckpt_path = str(cached_path(f"hf://{page_name}/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
 
66
  vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
67
-
68
- # Reduce model complexity for CPU
69
-
70
-
71
  model = CFM(
72
  transformer=model_cls(
73
  **model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels
@@ -83,15 +80,16 @@ def load_model(page_name, repo_name, exp_name, model_cls, model_cfg, ckpt_step):
83
  vocab_char_map=vocab_char_map,
84
  ).to(device)
85
 
86
- model = load_checkpoint(model, ckpt_path, device, use_ema=True)
87
 
88
  return model
89
 
90
- # Reduced model configurations
 
91
  F5TTS_model_cfg = dict(
92
  dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
93
  )
94
- E2TTS_model_cfg = dict(dim=1024, depth=12, heads=8, ff_mult=2)
95
 
96
  F5TTS_ema_model = load_model(
97
  "Gregniuki", "F5-tts_English_German_Polish", "English", DiT, F5TTS_model_cfg, 222600
 
37
  else "mps" if torch.backends.mps.is_available() else "cpu"
38
  )
39
 
 
40
  print(f"Using {device} device")
41
 
 
42
  pipe = pipeline(
43
  "automatic-speech-recognition",
44
+ model="openai/whisper-large-v3-turbo",
45
  torch_dtype=torch.float16,
46
  device=device,
47
  )
48
  vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
49
 
50
+ # --------------------- Settings -------------------- #
51
+
52
  target_sample_rate = 24000
53
+ n_mel_channels = 100
54
+ hop_length = 256
55
  target_rms = 0.1
56
+ nfe_step = 8 # 16, 32
57
+ cfg_strength = 2.0
58
  ode_method = "euler"
59
+ sway_sampling_coef = -1.0
60
  speed = 1
61
  fix_duration = None
62
 
63
+
64
  def load_model(page_name, repo_name, exp_name, model_cls, model_cfg, ckpt_step):
65
  ckpt_path = str(cached_path(f"hf://{page_name}/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
66
+ # ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors
67
  vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
 
 
 
 
68
  model = CFM(
69
  transformer=model_cls(
70
  **model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels
 
80
  vocab_char_map=vocab_char_map,
81
  ).to(device)
82
 
83
+ model = load_checkpoint(model, ckpt_path, device, use_ema = True)
84
 
85
  return model
86
 
87
+
88
+ # load models
89
  F5TTS_model_cfg = dict(
90
  dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
91
  )
92
+ E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
93
 
94
  F5TTS_ema_model = load_model(
95
  "Gregniuki", "F5-tts_English_German_Polish", "English", DiT, F5TTS_model_cfg, 222600