Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -37,37 +37,34 @@ device = (
|
|
37 |
else "mps" if torch.backends.mps.is_available() else "cpu"
|
38 |
)
|
39 |
|
40 |
-
print(f"Using {device} device")
|
41 |
print(f"Using {device} device")
|
42 |
|
43 |
-
# Use smaller, CPU-friendly models
|
44 |
pipe = pipeline(
|
45 |
"automatic-speech-recognition",
|
46 |
-
model="openai/whisper-
|
47 |
torch_dtype=torch.float16,
|
48 |
device=device,
|
49 |
)
|
50 |
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
|
51 |
|
52 |
-
#
|
|
|
53 |
target_sample_rate = 24000
|
54 |
-
n_mel_channels = 100
|
55 |
-
hop_length = 256
|
56 |
target_rms = 0.1
|
57 |
-
nfe_step = 8 #
|
58 |
-
cfg_strength = 2
|
59 |
ode_method = "euler"
|
60 |
-
sway_sampling_coef = -1
|
61 |
speed = 1
|
62 |
fix_duration = None
|
63 |
|
|
|
64 |
def load_model(page_name, repo_name, exp_name, model_cls, model_cfg, ckpt_step):
|
65 |
ckpt_path = str(cached_path(f"hf://{page_name}/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
|
|
|
66 |
vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
|
67 |
-
|
68 |
-
# Reduce model complexity for CPU
|
69 |
-
|
70 |
-
|
71 |
model = CFM(
|
72 |
transformer=model_cls(
|
73 |
**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels
|
@@ -83,15 +80,16 @@ def load_model(page_name, repo_name, exp_name, model_cls, model_cfg, ckpt_step):
|
|
83 |
vocab_char_map=vocab_char_map,
|
84 |
).to(device)
|
85 |
|
86 |
-
model = load_checkpoint(model, ckpt_path, device, use_ema=True)
|
87 |
|
88 |
return model
|
89 |
|
90 |
-
|
|
|
91 |
F5TTS_model_cfg = dict(
|
92 |
dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
|
93 |
)
|
94 |
-
E2TTS_model_cfg = dict(dim=1024, depth=
|
95 |
|
96 |
F5TTS_ema_model = load_model(
|
97 |
"Gregniuki", "F5-tts_English_German_Polish", "English", DiT, F5TTS_model_cfg, 222600
|
|
|
37 |
else "mps" if torch.backends.mps.is_available() else "cpu"
|
38 |
)
|
39 |
|
|
|
40 |
print(f"Using {device} device")
|
41 |
|
|
|
42 |
pipe = pipeline(
|
43 |
"automatic-speech-recognition",
|
44 |
+
model="openai/whisper-large-v3-turbo",
|
45 |
torch_dtype=torch.float16,
|
46 |
device=device,
|
47 |
)
|
48 |
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
|
49 |
|
50 |
+
# --------------------- Settings -------------------- #
|
51 |
+
|
52 |
target_sample_rate = 24000
|
53 |
+
n_mel_channels = 100
|
54 |
+
hop_length = 256
|
55 |
target_rms = 0.1
|
56 |
+
nfe_step = 8 # 16, 32
|
57 |
+
cfg_strength = 2.0
|
58 |
ode_method = "euler"
|
59 |
+
sway_sampling_coef = -1.0
|
60 |
speed = 1
|
61 |
fix_duration = None
|
62 |
|
63 |
+
|
64 |
def load_model(page_name, repo_name, exp_name, model_cls, model_cfg, ckpt_step):
|
65 |
ckpt_path = str(cached_path(f"hf://{page_name}/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
|
66 |
+
# ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors
|
67 |
vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
|
|
|
|
|
|
|
|
|
68 |
model = CFM(
|
69 |
transformer=model_cls(
|
70 |
**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels
|
|
|
80 |
vocab_char_map=vocab_char_map,
|
81 |
).to(device)
|
82 |
|
83 |
+
model = load_checkpoint(model, ckpt_path, device, use_ema = True)
|
84 |
|
85 |
return model
|
86 |
|
87 |
+
|
88 |
+
# load models
|
89 |
F5TTS_model_cfg = dict(
|
90 |
dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
|
91 |
)
|
92 |
+
E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
|
93 |
|
94 |
F5TTS_ema_model = load_model(
|
95 |
"Gregniuki", "F5-tts_English_German_Polish", "English", DiT, F5TTS_model_cfg, 222600
|