ajayarora1235 committed
Commit afeb57e · 1 Parent(s): a18eac3

make voicecraft changes

Files changed (1): app.py (+30 -12)
app.py CHANGED
@@ -248,14 +248,12 @@ def load_hubert():
     global hubert_model
     # Load the model

-    configH = HubertConfig()
+    configH= HubertConfig()
     configH.output_hidden_states = True
     hubert_model = HubertModel(configH)
     hubert_model.load_state_dict(torch.load('hubert_base_hf_statedict.pt'))
     # Prepare the model
     hubert_model = hubert_model.to(config.device)
-    config.device = "cuda"
-    config.is_half=True
     if config.is_half:
         hubert_model = hubert_model.half()
     else:
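The load_hubert path above now honors the surrounding Config object instead of forcing config.device = "cuda" and config.is_half = True inside the loader. The loading pattern in isolation, as a minimal sketch assuming the transformers HubertConfig/HubertModel classes and the local hubert_base_hf_statedict.pt checkpoint referenced in the diff:

import torch
from transformers import HubertConfig, HubertModel

def load_hubert_sketch(device="cuda", is_half=True):
    configH = HubertConfig()
    configH.output_hidden_states = True  # downstream VC code reads intermediate hidden states
    model = HubertModel(configH)
    model.load_state_dict(torch.load("hubert_base_hf_statedict.pt"))
    model = model.to(device)
    # half precision roughly halves VRAM use at a small numerical cost
    return model.half().eval() if is_half else model.float().eval()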
@@ -1400,6 +1398,7 @@ def download_from_url(url, model, associated_user=None):
     os.makedirs("unzips", exist_ok=True)
     zipfile = model + '.zip'
     zipfile_path = './zips/' + zipfile
+    return
     try:
         if "drive.google.com" in url or "drive.usercontent.google.com":
             subprocess.run(["gdown", url, "--fuzzy", "-O", zipfile_path])
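The new unconditional return sits before the try: block, so everything after it in download_from_url is now unreachable and the function is effectively a no-op, presumably to disable remote model downloads in this Space. Separately, the unchanged condition if "drive.google.com" in url or "drive.usercontent.google.com": is always true, because the second operand is a non-empty string literal rather than a membership test; it was presumably meant to read or "drive.usercontent.google.com" in url.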
@@ -1474,6 +1473,18 @@ def stoptraining(mim):


 def transcribe_btn_click(audio_choice):
+    global transcript_fn
+    global audio_fn
+
+    temp_folder = "./demo/temp"
+    orig_audio = audio_choice
+    filename = os.path.splitext(orig_audio.split("/")[-1])[0]
+    audio_fn = f"{temp_folder}/{filename}.wav"
+    transcript_fn = f"{temp_folder}/{filename}.txt"
+    if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
+        print("Audio and transcript already exist, skipping transcription")
+        return
+
     batch_size = 1 # Adjust based on your GPU memory availability
     compute_type = "float16"
     device = "cuda" if torch.cuda.is_available() else "cpu"
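The block added to transcribe_btn_click makes transcription idempotent: the derived temp-folder paths are computed up front, and the expensive transcription run is skipped when both artifacts already exist. The cache check in isolation (same paths as the diff):

import os

def cached_transcription_paths(audio_choice, temp_folder="./demo/temp"):
    filename = os.path.splitext(os.path.basename(audio_choice))[0]
    audio_fn = f"{temp_folder}/{filename}.wav"
    transcript_fn = f"{temp_folder}/{filename}.txt"
    # cache hit: both the prompt wav and its transcript are already on disk
    if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
        return audio_fn, transcript_fn
    return None  # cache miss: fall through to the full transcription path

One caveat carried by the diff as written: on a cache hit the function returns None rather than the saved transcript, so callers relying on the return value only get text on a cache miss.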
@@ -1497,7 +1508,6 @@ def transcribe_btn_click(audio_choice):
     orig_audio = audio_choice
     orig_transcript = result
     # move the audio and transcript to temp folder
-    temp_folder = "./demo/temp"
     os.makedirs(temp_folder, exist_ok=True)
     os.system(f"cp \"{orig_audio}\" \"{temp_folder}\"")
     filename = os.path.splitext(orig_audio.split("/")[-1])[0]
@@ -1507,12 +1517,9 @@ def transcribe_btn_click(audio_choice):
     align_temp = f"{temp_folder}/mfa_alignments"
     os.makedirs(align_temp, exist_ok=True)

-    global audio_fn
     audio_fn = f"{temp_folder}/{filename}.wav"
-    global transcript_fn
     transcript_fn = f"{temp_folder}/{filename}.txt"

-
     return result

@@ -1530,6 +1537,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate

+    print("audio dur s is", audio_dur, "cutoff_sec is", cut_off_sec)
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)

@@ -1570,7 +1578,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod

     return [seg_save_fn_concat, seg_save_fn_gen]

-def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
+def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
               temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text,
               sid,
               f0_up_key,
@@ -1585,19 +1593,25 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
               rms_mix_rate,
               protect,
               crepe_hop_length):
-
     global voicecraft_model, voicecraft_config, phn2num
+
+    print("Transcribing the input audio")
+    transcribe_btn_click(input_audio_fn)
+    print("Transcription complete")

     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignments, decide which part of the audio to use as the prompt
-    cut_off_sec = cutoff_value # NOTE: according to the forced-alignment file, the word "common" stops at 3.01 sec; this will differ for different audio
+    # cut_off_sec = cutoff_value # NOTE: according to the forced-alignment file, the word "common" stops at 3.01 sec; this will differ for different audio
+
     target_transcript = transcribed_text + ' ' + target_transcript
     print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate

+    cut_off_sec = audio_dur - 0.1
+
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)

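With cut_off_sec = audio_dur - 0.1, run_joint now ignores the user-supplied cutoff_value and always uses all but the last 100 ms of the input recording as the VoiceCraft prompt, which also makes the assert trivially true. The derivation as a standalone sketch (torchaudio.info as in the code above):

import torchaudio

def derive_prompt_end_frame(audio_fn, margin_sec=0.1):
    info = torchaudio.info(audio_fn)
    audio_dur = info.num_frames / info.sample_rate   # duration in seconds
    cut_off_sec = audio_dur - margin_sec             # prompt = everything but the tail margin
    assert cut_off_sec < audio_dur
    return int(cut_off_sec * info.sample_rate)       # prompt length in samples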
@@ -1617,6 +1631,7 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
     concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
                                                      audio_fn, target_transcript, config.device, decode_config,
                                                      prompt_end_frame)
+    print("prompt_end_frame: ", prompt_end_frame, "voicecraft_config: ", voicecraft_config, "audio_fn: ", audio_fn, "target_transcript: ", target_transcript, "config.device: ", config.device, "decode_config: ", decode_config)

     # save segments for comparison
     concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
@@ -1636,7 +1651,8 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi

     f0_up_key = int(f0_up_key)
     try:
-        audio = gen_audio
+        # audio = gen_audio.squeeze()
+        audio = load_audio(seg_save_fn_gen, 16000, DoFormant, Quefrency, Timbre).squeeze()
         audio_max = np.abs(audio).max() / 0.95
         if audio_max > 1:
             audio /= audio_max
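Re-loading the saved VoiceCraft segment (seg_save_fn_gen) instead of using the in-memory gen_audio tensor guarantees the RVC stage receives 16 kHz mono float32 samples, which is what its HuBERT feature extractor expects. The repo's load_audio helper also applies the formant options (DoFormant, Quefrency, Timbre); ignoring those, a rough torchaudio equivalent of the reload step would be:

import torchaudio

def reload_for_rvc(wav_path, target_sr=16000):
    wav, sr = torchaudio.load(wav_path)                       # (channels, frames) float32
    wav = torchaudio.functional.resample(wav, sr, target_sr)  # resample to 16 kHz
    return wav.mean(dim=0).numpy()                            # downmix to mono ndarray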
@@ -1657,6 +1673,7 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
     # file_big_npy = (
     #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
     # )
+    print(f"Making VC Pipeline, device: {config.device}, audio shape: {audio.shape}")
     audio_opt = vc.pipeline(
         hubert_model,
         net_g,
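Taken together, run_joint now chains the two models end to end; in outline:

# Stage 1: VoiceCraft — prompt audio + combined transcript -> gen_audio at codec_audio_sr
# Stage 2: write gen_audio to seg_save_fn_gen, reload it at 16 kHz -> audio
# Stage 3: RVC — vc.pipeline(hubert_model, net_g, ...) converts audio into the target voice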
@@ -2029,6 +2046,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
     run_btn_joint.click(
         fn=run_joint,
         inputs=[
+            input_audio0,
             seed,
             stop_repitition,
             sample_batch_size,
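Gradio forwards inputs positionally, so prepending input_audio0 here mirrors the new input_audio_fn first parameter of run_joint; the two lists must stay index-aligned. A self-contained sketch of that wiring (component values hypothetical):

import gradio as gr

def run_joint_sketch(input_audio_fn, seed):
    return f"would synthesize from {input_audio_fn} with seed {seed}"

with gr.Blocks() as demo:
    input_audio0 = gr.Audio(type="filepath")  # passes a filepath string, like input_audio_fn
    seed = gr.Number(value=-1)
    status = gr.Textbox()
    run_btn_joint = gr.Button("Run")
    # inputs are handed to fn in list order, so the order must match the signature
    run_btn_joint.click(fn=run_joint_sketch, inputs=[input_audio0, seed], outputs=[status])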
@@ -2429,4 +2447,4 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
     )

 app.queue(concurrency_count=511, max_size=1022).launch(share=False, quiet=False, auth=[('jvke', 'thisfeelslikeai'), ('cmss60', 'yourseedislate')])
-#endregion
+#endregion