ajayarora1235 committed
Commit cd4126a · 1 Parent(s): 1833c9a

preload models

Files changed (2):
  1. app.py +29 -18
  2. pretrained_models/.gitkeep +0 -0
app.py CHANGED
@@ -255,6 +255,30 @@ def load_hubert():
     # )
     # hubert_model = models[0]
 
+def load_voicecraft():
+    global voicecraft_model
+    global phn2num
+    global voicecraft_config
+
+    from lib.voicecraft.models import voicecraft
+    voicecraft_name = "giga330M.pth"
+    ckpt_fn = f"./pretrained_models/{voicecraft_name}"
+    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
+    if not os.path.exists(ckpt_fn):
+        os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\?download\=true")
+        os.system(f"mv {voicecraft_name}\?download\=true ./pretrained_models/{voicecraft_name}")
+    if not os.path.exists(encodec_fn):
+        os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
+        os.system(f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th")
+
+    ckpt = torch.load(ckpt_fn, map_location="cpu")
+    voicecraft_config = ckpt["config"]
+    voicecraft_model = voicecraft.VoiceCraft(ckpt["config"])
+    voicecraft_model.load_state_dict(ckpt["model"])
+    voicecraft_model.to(config.device)
+    voicecraft_model.eval()
+
+    phn2num = ckpt['phn2num']
 
 weight_root = "weights"
 index_root = "logs"
@@ -1484,34 +1508,21 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
     # # original file loaded it each time. here we load it only once
     # global model_loaded
     # f model_loaded==False:
-    from lib.voicecraft.models import voicecraft
-    voicecraft_name = "giga830M.pth"
-    ckpt_fn = f"./pretrained_models/{voicecraft_name}"
-    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
-    if not os.path.exists(ckpt_fn):
-        os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\?download\=true")
-        os.system(f"mv {voicecraft_name}\?download\=true ./pretrained_models/{voicecraft_name}")
-    if not os.path.exists(encodec_fn):
-        os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
-        os.system(f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th")
 
-    ckpt = torch.load(ckpt_fn, map_location="cpu")
-    model = voicecraft.VoiceCraft(ckpt["config"])
-    model.load_state_dict(ckpt["model"])
-    model.to(config.device)
-    model.eval()
-
-    phn2num = ckpt['phn2num']
+    if voicecraft_model is None:
+        load_voicecraft()
 
+    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
     text_tokenizer = TextTokenizer(backend="espeak")
     audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
 
+
     # # run the model to get the output
     decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
                      'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
                      "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
     from lib.voicecraft.inference_tts_scale import inference_one_sample
-    concated_audio, gen_audio = inference_one_sample(model, ckpt["config"], phn2num, text_tokenizer, audio_tokenizer,
+    concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
                                                      audio_fn, target_transcript, config.device, decode_config,
                                                      prompt_end_frame)
 
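Taken together, the two hunks move the model setup out of run() into a module-level load_voicecraft() that caches the checkpoint, its config, and the phoneme map in globals, and run() only calls it when voicecraft_model is still unset. The hunks do not show where those globals are first declared or where a startup preload happens, so the following is a self-contained sketch of the assumed pattern, not a copy of app.py; the heavy checkpoint load is stubbed out so the snippet runs on its own.

# Sketch of the lazy-load / preload pattern introduced by this commit.
# The None sentinels, the simplified run() signature, and the startup call
# are assumptions; the real loader does torch.load(...) and builds
# voicecraft.VoiceCraft(ckpt["config"]) as shown in the diff above.

voicecraft_model = None
voicecraft_config = None
phn2num = None


def load_voicecraft():
    """Load the VoiceCraft checkpoint once and cache it in module globals."""
    global voicecraft_model, voicecraft_config, phn2num
    voicecraft_config = {"checkpoint": "giga330M.pth"}  # placeholder for ckpt["config"]
    voicecraft_model = object()                         # placeholder for the model
    phn2num = {}                                        # placeholder for ckpt['phn2num']


def run():
    # Guard from the second hunk: only the first call pays the load cost.
    if voicecraft_model is None:
        load_voicecraft()
    return voicecraft_model


if __name__ == "__main__":
    load_voicecraft()   # "preload models": warm the cache at startup, not on first request
    assert run() is voicecraft_model

Caching in module globals keeps later calls to run() cheap, and preloading at startup moves the one-time cost out of the first user request, which matches the commit title.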
pretrained_models/.gitkeep ADDED
(empty file)
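The pretrained_models/.gitkeep file only keeps the otherwise-empty download directory tracked in the repo; the checkpoints themselves are fetched at runtime by the wget/mv calls in load_voicecraft(). As an aside, the same two files could be pulled from the pyp1/VoiceCraft repo with huggingface_hub instead of shelling out; this is a hedged alternative sketch, not what the commit does, and it assumes the huggingface_hub package is available.

# Alternative download sketch (assumption: huggingface_hub is installed).
# Fetches the same checkpoints referenced in the diff into pretrained_models/,
# the directory kept under version control by the new .gitkeep.
from huggingface_hub import hf_hub_download

for filename in ("giga330M.pth", "encodec_4cb2048_giga.th"):
    local_path = hf_hub_download(
        repo_id="pyp1/VoiceCraft",
        filename=filename,
        local_dir="./pretrained_models",
    )
    print(f"{filename} -> {local_path}")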