Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -37,37 +37,34 @@ device = (
|
|
| 37 |
else "mps" if torch.backends.mps.is_available() else "cpu"
|
| 38 |
)
|
| 39 |
|
| 40 |
-
print(f"Using {device} device")
|
| 41 |
print(f"Using {device} device")
|
| 42 |
|
| 43 |
-
# Use smaller, CPU-friendly models
|
| 44 |
pipe = pipeline(
|
| 45 |
"automatic-speech-recognition",
|
| 46 |
-
model="openai/whisper-
|
| 47 |
torch_dtype=torch.float16,
|
| 48 |
device=device,
|
| 49 |
)
|
| 50 |
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
|
| 51 |
|
| 52 |
-
#
|
|
|
|
| 53 |
target_sample_rate = 24000
|
| 54 |
-
n_mel_channels = 100
|
| 55 |
-
hop_length = 256
|
| 56 |
target_rms = 0.1
|
| 57 |
-
nfe_step = 8 #
|
| 58 |
-
cfg_strength = 2
|
| 59 |
ode_method = "euler"
|
| 60 |
-
sway_sampling_coef = -1
|
| 61 |
speed = 1
|
| 62 |
fix_duration = None
|
| 63 |
|
|
|
|
| 64 |
def load_model(page_name, repo_name, exp_name, model_cls, model_cfg, ckpt_step):
|
| 65 |
ckpt_path = str(cached_path(f"hf://{page_name}/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
|
|
|
|
| 66 |
vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
|
| 67 |
-
|
| 68 |
-
# Reduce model complexity for CPU
|
| 69 |
-
|
| 70 |
-
|
| 71 |
model = CFM(
|
| 72 |
transformer=model_cls(
|
| 73 |
**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels
|
|
@@ -83,15 +80,16 @@ def load_model(page_name, repo_name, exp_name, model_cls, model_cfg, ckpt_step):
|
|
| 83 |
vocab_char_map=vocab_char_map,
|
| 84 |
).to(device)
|
| 85 |
|
| 86 |
-
model = load_checkpoint(model, ckpt_path, device, use_ema=True)
|
| 87 |
|
| 88 |
return model
|
| 89 |
|
| 90 |
-
|
|
|
|
| 91 |
F5TTS_model_cfg = dict(
|
| 92 |
dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
|
| 93 |
)
|
| 94 |
-
E2TTS_model_cfg = dict(dim=1024, depth=
|
| 95 |
|
| 96 |
F5TTS_ema_model = load_model(
|
| 97 |
"Gregniuki", "F5-tts_English_German_Polish", "English", DiT, F5TTS_model_cfg, 222600
|
|
|
|
| 37 |
else "mps" if torch.backends.mps.is_available() else "cpu"
|
| 38 |
)
|
| 39 |
|
|
|
|
| 40 |
print(f"Using {device} device")
|
| 41 |
|
|
|
|
| 42 |
pipe = pipeline(
|
| 43 |
"automatic-speech-recognition",
|
| 44 |
+
model="openai/whisper-large-v3-turbo",
|
| 45 |
torch_dtype=torch.float16,
|
| 46 |
device=device,
|
| 47 |
)
|
| 48 |
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
|
| 49 |
|
| 50 |
+
# --------------------- Settings -------------------- #
|
| 51 |
+
|
| 52 |
target_sample_rate = 24000
|
| 53 |
+
n_mel_channels = 100
|
| 54 |
+
hop_length = 256
|
| 55 |
target_rms = 0.1
|
| 56 |
+
nfe_step = 8 # 16, 32
|
| 57 |
+
cfg_strength = 2.0
|
| 58 |
ode_method = "euler"
|
| 59 |
+
sway_sampling_coef = -1.0
|
| 60 |
speed = 1
|
| 61 |
fix_duration = None
|
| 62 |
|
| 63 |
+
|
| 64 |
def load_model(page_name, repo_name, exp_name, model_cls, model_cfg, ckpt_step):
|
| 65 |
ckpt_path = str(cached_path(f"hf://{page_name}/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
|
| 66 |
+
# ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors
|
| 67 |
vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
model = CFM(
|
| 69 |
transformer=model_cls(
|
| 70 |
**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels
|
|
|
|
| 80 |
vocab_char_map=vocab_char_map,
|
| 81 |
).to(device)
|
| 82 |
|
| 83 |
+
model = load_checkpoint(model, ckpt_path, device, use_ema = True)
|
| 84 |
|
| 85 |
return model
|
| 86 |
|
| 87 |
+
|
| 88 |
+
# load models
|
| 89 |
F5TTS_model_cfg = dict(
|
| 90 |
dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
|
| 91 |
)
|
| 92 |
+
E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
|
| 93 |
|
| 94 |
F5TTS_ema_model = load_model(
|
| 95 |
"Gregniuki", "F5-tts_English_German_Polish", "English", DiT, F5TTS_model_cfg, 222600
|