ajayarora1235
committed on
Commit
·
cd4126a
1
Parent(s):
1833c9a
preload models
Browse files- app.py +29 -18
- pretrained_models/.gitkeep +0 -0
app.py
CHANGED
@@ -255,6 +255,30 @@ def load_hubert():
|
|
255 |
# )
|
256 |
# hubert_model = models[0]
|
257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
258 |
|
259 |
weight_root = "weights"
|
260 |
index_root = "logs"
|
@@ -1484,34 +1508,21 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
|
|
1484 |
# # original file loaded it each time. here we load it only once
|
1485 |
# global model_loaded
|
1486 |
# if model_loaded==False:
|
1487 |
-
from lib.voicecraft.models import voicecraft
|
1488 |
-
voicecraft_name = "giga830M.pth"
|
1489 |
-
ckpt_fn = f"./pretrained_models/{voicecraft_name}"
|
1490 |
-
encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
|
1491 |
-
if not os.path.exists(ckpt_fn):
|
1492 |
-
os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\?download\=true")
|
1493 |
-
os.system(f"mv {voicecraft_name}\?download\=true ./pretrained_models/{voicecraft_name}")
|
1494 |
-
if not os.path.exists(encodec_fn):
|
1495 |
-
os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
|
1496 |
-
os.system(f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th")
|
1497 |
|
1498 |
-
|
1499 |
-
|
1500 |
-
model.load_state_dict(ckpt["model"])
|
1501 |
-
model.to(config.device)
|
1502 |
-
model.eval()
|
1503 |
-
|
1504 |
-
phn2num = ckpt['phn2num']
|
1505 |
|
|
|
1506 |
text_tokenizer = TextTokenizer(backend="espeak")
|
1507 |
audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
|
1508 |
|
|
|
1509 |
# # run the model to get the output
|
1510 |
decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
|
1511 |
'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
|
1512 |
"silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
|
1513 |
from lib.voicecraft.inference_tts_scale import inference_one_sample
|
1514 |
-
concated_audio, gen_audio = inference_one_sample(model,
|
1515 |
audio_fn, target_transcript, config.device, decode_config,
|
1516 |
prompt_end_frame)
|
1517 |
|
|
|
255 |
# )
|
256 |
# hubert_model = models[0]
|
257 |
|
258 |
+
def load_voicecraft():
    """Load the VoiceCraft TTS checkpoint once and publish it via module globals.

    Downloads the model checkpoint and the EnCodec codec weights into
    ./pretrained_models/ if they are not already present, then sets:
      - voicecraft_model:  the VoiceCraft model, moved to config.device, in eval mode
      - voicecraft_config: the model config object stored inside the checkpoint
      - phn2num:           the phoneme -> token-id mapping stored inside the checkpoint

    Side effects: network download via wget on first call; model weights are
    loaded onto config.device.
    """
    global voicecraft_model
    global phn2num
    global voicecraft_config

    # Local import keeps the (heavy) model package off the module import path
    # until the model is actually needed.
    from lib.voicecraft.models import voicecraft

    voicecraft_name = "giga330M.pth"
    ckpt_fn = f"./pretrained_models/{voicecraft_name}"
    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"

    # Fetch missing weights. `wget -O <target>` writes straight to the
    # destination, replacing the original download-to-cwd-then-`mv` dance,
    # and the single-quoted URL keeps '?' away from the shell — the original
    # f-strings also contained invalid "\?" / "\=" escape sequences, which
    # are a SyntaxError in future Python versions.
    if not os.path.exists(ckpt_fn):
        os.system(
            f"wget -O {ckpt_fn} "
            f"'https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}?download=true'"
        )
    if not os.path.exists(encodec_fn):
        os.system(
            f"wget -O {encodec_fn} "
            "'https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th'"
        )

    # Load on CPU first, then move to the configured device.
    ckpt = torch.load(ckpt_fn, map_location="cpu")
    voicecraft_config = ckpt["config"]
    voicecraft_model = voicecraft.VoiceCraft(ckpt["config"])
    voicecraft_model.load_state_dict(ckpt["model"])
    voicecraft_model.to(config.device)
    voicecraft_model.eval()

    phn2num = ckpt["phn2num"]
|
282 |
|
283 |
weight_root = "weights"
|
284 |
index_root = "logs"
|
|
|
1508 |
# # original file loaded it each time. here we load it only once
|
1509 |
# global model_loaded
|
1510 |
# if model_loaded==False:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1511 |
|
1512 |
+
if voicecraft_model is None:
|
1513 |
+
load_voicecraft()
|
|
|
|
|
|
|
|
|
|
|
1514 |
|
1515 |
+
encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
|
1516 |
text_tokenizer = TextTokenizer(backend="espeak")
|
1517 |
audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
|
1518 |
|
1519 |
+
|
1520 |
# # run the model to get the output
|
1521 |
decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
|
1522 |
'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
|
1523 |
"silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
|
1524 |
from lib.voicecraft.inference_tts_scale import inference_one_sample
|
1525 |
+
concated_audio, gen_audio = inference_one_sample(model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
|
1526 |
audio_fn, target_transcript, config.device, decode_config,
|
1527 |
prompt_end_frame)
|
1528 |
|
pretrained_models/.gitkeep
ADDED
File without changes
|