Commit · afeb57e
Parent(s): a18eac3
make voicecraft changes
app.py
CHANGED
@@ -248,14 +248,12 @@ def load_hubert():
     global hubert_model
     # Load the model
 
-    configH
+    configH= HubertConfig()
     configH.output_hidden_states = True
     hubert_model = HubertModel(configH)
     hubert_model.load_state_dict(torch.load('hubert_base_hf_statedict.pt'))
     # Prepare the model
     hubert_model = hubert_model.to(config.device)
-    config.device = "cuda"
-    config.is_half=True
     if config.is_half:
         hubert_model = hubert_model.half()
     else:
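This hunk replaces a bare `configH` statement (a NameError at runtime) with a real `HubertConfig()` and drops the hard-coded `config.device`/`config.is_half` overrides so the shared `config` object is respected. A minimal self-contained sketch of the resulting loading pattern, assuming the `transformers` library and the repo's local `hubert_base_hf_statedict.pt` checkpoint:

    import torch
    from transformers import HubertConfig, HubertModel

    def load_hubert_sketch(device="cuda", is_half=True):
        configH = HubertConfig()              # default base config
        configH.output_hidden_states = True   # expose intermediate layers as features
        model = HubertModel(configH)
        model.load_state_dict(torch.load("hubert_base_hf_statedict.pt"))
        model = model.to(device)
        # half precision is only worthwhile on CUDA devices
        return (model.half() if is_half else model.float()).eval()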
@@ -1400,6 +1398,7 @@ def download_from_url(url, model, associated_user=None):
     os.makedirs("unzips", exist_ok=True)
     zipfile = model + '.zip'
     zipfile_path = './zips/' + zipfile
+    return
     try:
         if "drive.google.com" in url or "drive.usercontent.google.com":
             subprocess.run(["gdown", url, "--fuzzy", "-O", zipfile_path])
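Note that the added bare `return` exits `download_from_url` before the `try` block, so all of the download logic below it is now dead code. If the intent is to disable downloads in this deployment, an explicit guard reads better; a sketch with a hypothetical flag name, not part of app.py:

    DOWNLOADS_ENABLED = False  # hypothetical deployment flag

    def download_from_url_sketch(url, model, associated_user=None):
        if not DOWNLOADS_ENABLED:
            return "Downloads are disabled on this deployment"
        # ... original download logic would follow here ...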
@@ -1474,6 +1473,18 @@ def stoptraining(mim):
 
 
 def transcribe_btn_click(audio_choice):
+    global transcript_fn
+    global audio_fn
+
+    temp_folder = "./demo/temp"
+    orig_audio = audio_choice
+    filename = os.path.splitext(orig_audio.split("/")[-1])[0]
+    audio_fn = f"{temp_folder}/{filename}.wav"
+    transcript_fn = f"{temp_folder}/{filename}.txt"
+    if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
+        print("Audio and transcript already exist, skipping transcript")
+        return
+
     batch_size = 1 # Adjust based on your GPU memory availability
     compute_type = "float16"
     device = "cuda" if torch.cuda.is_available() else "cpu"
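The new preamble derives the temp-folder paths up front and short-circuits when a previous run already produced both files, so the transcription below only runs once per input. The same cache check as a standalone helper (hypothetical name, not part of app.py):

    import os

    def cached_paths(audio_path, temp_folder="./demo/temp"):
        stem = os.path.splitext(os.path.basename(audio_path))[0]
        audio_fn = f"{temp_folder}/{stem}.wav"
        transcript_fn = f"{temp_folder}/{stem}.txt"
        hit = os.path.exists(audio_fn) and os.path.exists(transcript_fn)
        return audio_fn, transcript_fn, hit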
@@ -1497,7 +1508,6 @@ def transcribe_btn_click(audio_choice):
     orig_audio = audio_choice
     orig_transcript = result
     # move the audio and transcript to temp folder
-    temp_folder = "./demo/temp"
     os.makedirs(temp_folder, exist_ok=True)
     os.system(f"cp \"{orig_audio}\" \"{temp_folder}\"")
     filename = os.path.splitext(orig_audio.split("/")[-1])[0]
@@ -1507,12 +1517,9 @@ def transcribe_btn_click(audio_choice):
     align_temp = f"{temp_folder}/mfa_alignments"
     os.makedirs(align_temp, exist_ok=True)
 
-    global audio_fn
     audio_fn = f"{temp_folder}/{filename}.wav"
-    global transcript_fn
     transcript_fn = f"{temp_folder}/{filename}.txt"
 
-
     return result
 
 
@@ -1530,6 +1537,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
 
+    print("audio dur s is", audio_dur, "cutoff_sec is", cut_off_sec)
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)
 
@@ -1570,7 +1578,7 @@ def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, cod
 
     return [seg_save_fn_concat, seg_save_fn_gen]
 
-def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
+def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
               temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text,
               sid,
               f0_up_key,
@@ -1585,19 +1593,25 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
               rms_mix_rate,
               protect,
               crepe_hop_length):
-
     global voicecraft_model, voicecraft_config, phn2num
+
+    print("Transcribing the input audio")
+    transcribe_btn_click(input_audio_fn)
+    print("Transcription complete")
 
     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     os.environ["USER"] = "USER"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
-    cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
+    # cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
+
     target_transcript = transcribed_text + ' ' + target_transcript
     print(target_transcript)
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
 
+    cut_off_sec = audio_dur - 0.1
+
     assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
     prompt_end_frame = int(cut_off_sec * info.sample_rate)
 
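Two behavioral changes land in this hunk: `run_joint` now transcribes its own input by calling `transcribe_btn_click(input_audio_fn)`, and `cut_off_sec` is no longer taken from the `cutoff_value` argument but derived from the audio itself, so the whole recording minus a 0.1 s margin becomes the voice prompt. This leaves `cutoff_value` unused and makes the assert below effectively always pass. The frame arithmetic as a standalone sketch, assuming torchaudio and any local wav file:

    import torchaudio

    def prompt_end_frame_for(audio_fn, margin_sec=0.1):
        info = torchaudio.info(audio_fn)
        audio_dur = info.num_frames / info.sample_rate  # duration in seconds
        cut_off_sec = audio_dur - margin_sec            # prompt ends just before the end
        return int(cut_off_sec * info.sample_rate)      # seconds -> sample index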
@@ -1617,6 +1631,7 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
     concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
                                                      audio_fn, target_transcript, config.device, decode_config,
                                                      prompt_end_frame)
+    print("prompt_end_frame: ", prompt_end_frame, "voicecraft_config: ", voicecraft_config, "audio_fn: ", audio_fn, "target_transcript: ", target_transcript, "config.device: ", config.device, "decode_config: ", decode_config)
 
     # save segments for comparison
     concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
@@ -1636,7 +1651,8 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
 
     f0_up_key = int(f0_up_key)
     try:
-        audio = gen_audio
+        # audio = gen_audio.squeeze()
+        audio = load_audio(seg_save_fn_gen, 16000, DoFormant, Quefrency, Timbre).squeeze()
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max
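Instead of handing the generated tensor straight to the RVC stage, the audio is now round-tripped through the saved segment `seg_save_fn_gen` and reloaded at 16 kHz with `load_audio`, which yields the flat float numpy array the pipeline expects. The peak-normalization step that follows, as a standalone sketch:

    import numpy as np

    def normalize_peak(audio: np.ndarray, headroom: float = 0.95) -> np.ndarray:
        # scale down only when the peak (with 5% headroom) would clip
        audio_max = np.abs(audio).max() / headroom
        return audio / audio_max if audio_max > 1 else audio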
@@ -1657,6 +1673,7 @@ def run_joint(seed, stop_repetition, sample_batch_size, left_margin, right_margi
     # file_big_npy = (
     #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
     # )
+    print(f"Making VC Pipeline, device: {config.device}, audio shape: {audio.shape}")
     audio_opt = vc.pipeline(
         hubert_model,
         net_g,
@@ -2029,6 +2046,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
     run_btn_joint.click(
         fn=run_joint,
         inputs=[
+            input_audio0,
             seed,
             stop_repitition,
             sample_batch_size,
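Since `run_joint` gained `input_audio_fn` as its first parameter, the Gradio wiring adds `input_audio0` at the head of `inputs`; Gradio passes component values to `fn` positionally, so the list order must mirror the function signature. A stripped-down sketch of the pattern, with hypothetical components:

    import gradio as gr

    def run_joint_sketch(input_audio_fn, seed):
        return f"got {input_audio_fn} with seed {seed}"

    with gr.Blocks() as demo:
        input_audio0 = gr.Audio(type="filepath")
        seed = gr.Number(value=-1)
        out = gr.Textbox()
        # inputs[0] -> input_audio_fn, inputs[1] -> seed
        gr.Button("Run").click(fn=run_joint_sketch, inputs=[input_audio0, seed], outputs=[out])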
@@ -2429,4 +2447,4 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
     )
 
 app.queue(concurrency_count=511, max_size=1022).launch(share=False, quiet=False, auth=[('jvke', 'thisfeelslikeai'), ('cmss60', 'yourseedislate')])
-#endregion
+#endregion