# Source page header (scraped from the hosting site, kept for provenance):
#   3417543_models / app.py
#   ajayarora1235's picture
#   commit 6736ecf: "get rid of cutoff time"
#   raw / history / blame
#   100 kB
import subprocess, torch, os, traceback, sys, warnings, shutil, numpy as np
import pandas as pd
import torchaudio
from lib.voicecraft.data.tokenizer import (
AudioTokenizer,
TextTokenizer,
)
import whisperx
import os
import time
import gc
import gradio as gr
from mega import Mega
os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
import threading
from time import sleep
from subprocess import Popen
import faiss
from random import shuffle
import json, datetime, requests
# Working directory of the app; appended to sys.path so modules next to this
# file can be imported.
now_dir = os.getcwd()
sys.path.append(now_dir)
# Recreate an empty scratch directory on every start.
tmp = os.path.join(now_dir, "TEMP")
shutil.rmtree(tmp, ignore_errors=True)
# Remove any infer_pack copy under runtime/Lib/site-packages (errors ignored).
shutil.rmtree("%s/runtime/Lib/site-packages/infer_pack" % (now_dir), ignore_errors=True)
os.makedirs(tmp, exist_ok=True)
# "logs" and "weights" are the directories scanned later for indexes and models.
os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True)
os.makedirs(os.path.join(now_dir, "weights"), exist_ok=True)
# Point the TEMP environment variable at the scratch directory created above.
os.environ["TEMP"] = tmp
warnings.filterwarnings("ignore")
# Fixed seed for torch's random number generator.
torch.manual_seed(114514)
import signal
import math
from utils import load_audio, CSVutil
global DoFormant, Quefrency, Timbre
from transformers import HubertModel, HubertConfig
# Create the CSV "database" directory with two empty files on first run.
if not os.path.isdir('csvdb/'):
    os.makedirs('csvdb')
    for _csv_name in ("csvdb/formanting.csv", "csvdb/stop.csv"):
        open(_csv_name, 'w').close()

# Load the persisted formant settings; fall back to (and persist) defaults
# when the CSV is empty or malformed.
try:
    DoFormant, Quefrency, Timbre = CSVutil('csvdb/formanting.csv', 'r', 'formanting')
    # The CSV stores the flag as text: map 'true'/'false' (any case) to bool
    # and leave any other value untouched.
    _flag_text = DoFormant.lower()
    if _flag_text == 'true':
        DoFormant = True
    elif _flag_text == 'false':
        DoFormant = False
except (ValueError, TypeError, IndexError):
    DoFormant, Quefrency, Timbre = False, 1.0, 1.0
    CSVutil('csvdb/formanting.csv', 'w+', 'formanting', DoFormant, Quefrency, Timbre)
def update_message(request: gr.Request):
    """Refresh the model choices for the requesting user and return a greeting.

    Args:
        request: The Gradio request; its ``username`` attribute is used.

    Returns:
        The string ``"Welcome, <username>"``.
    """
    user = request.username
    # The return value of change_choices is discarded here, as in the
    # original; the call is made for whatever side effects it has.
    change_choices(user)
    return f"Welcome, {user}"
def download_models():
    """Download the hubert base and rmvpe checkpoints if they are missing.

    Each file is streamed to ``<path>.part`` in chunks and renamed into place
    only after the download completed, so the (large) checkpoint is never held
    in memory as a whole and an interrupted download cannot leave a truncated
    file that would be mistaken for a valid model on the next start.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    base_url = 'https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/'
    targets = (
        ('hubert base', './hubert_base.pt', base_url + 'hubert_base.pt'),
        ('rmvpe', './rmvpe.pt', base_url + 'rmvpe.pt?download=true'),
    )
    for label, path, url in targets:
        if os.path.isfile(path):
            continue
        partial_path = path + '.part'
        with requests.get(url, stream=True) as response:
            if response.status_code != 200:
                raise Exception("Failed to download " + label + " model file. Status code: " + str(response.status_code) + ".")
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        # Publish the file only once it is complete.
        os.replace(partial_path, path)
        print("Downloaded " + label + " model file successfully. File saved to " + path + ".")
# Fetch any missing model checkpoints at import time, then print the banner.
download_models()
print("\n-------------------------------\nRVC v2 Easy GUI (Local Edition)\n-------------------------------\n")
def formant_apply(qfrency, tmbre):
    """Enable formant shifting with the given quefrency and timbre.

    Updates the module-level ``DoFormant``/``Quefrency``/``Timbre`` settings
    (which ``vc_single`` passes to ``load_audio``) and persists them to
    ``csvdb/formanting.csv``.

    Args:
        qfrency: New quefrency value.
        tmbre: New timbre value.

    Returns:
        A pair of Gradio update dicts carrying the new quefrency and timbre.
    """
    # Without this declaration the assignments below only created locals and
    # the in-process settings never changed.
    global DoFormant, Quefrency, Timbre
    Quefrency = qfrency
    Timbre = tmbre
    DoFormant = True
    CSVutil('csvdb/formanting.csv', 'w+', 'formanting', DoFormant, qfrency, tmbre)
    return ({"value": Quefrency, "__type__": "update"}, {"value": Timbre, "__type__": "update"})
def get_fshift_presets():
    """List formant-shift preset files found under ``./formantshiftcfg/``.

    Returns:
        A list of ``.txt`` file paths (backslashes replaced by forward
        slashes), or the empty string when no preset file exists.
    """
    presets = [
        os.path.join(folder, fname).replace('\\', '/')
        for folder, _, fnames in os.walk("./formantshiftcfg/")
        for fname in fnames
        if fname.endswith(".txt")
    ]
    return presets if presets else ''
def formant_enabled(cbox, qfrency, tmbre, frmntapply, formantpreset, formant_refresh_button):
    """Toggle formant shifting and show/hide the related controls.

    Persists the new state to ``csvdb/formanting.csv`` and updates the
    module-level ``DoFormant`` flag.

    Args:
        cbox: Checkbox state; truthy enables formant shifting.
        qfrency: Current quefrency value (persisted unchanged).
        tmbre: Current timbre value (persisted unchanged).
        frmntapply, formantpreset, formant_refresh_button: Unused; they are
            part of the signature because the handler receives six inputs.

    Returns:
        A tuple of six Gradio update dicts: the checkbox value followed by
        five visibility updates. Both branches now return the same number of
        updates (the disabled branch used to return seven).
    """
    # Declare the flag global so the running process sees the change; the
    # original only assigned a local.
    global DoFormant
    DoFormant = True if cbox else False
    CSVutil('csvdb/formanting.csv', 'w+', 'formanting', DoFormant, qfrency, tmbre)
    return (
        {"value": DoFormant, "__type__": "update"},
        {"visible": DoFormant, "__type__": "update"},
        {"visible": DoFormant, "__type__": "update"},
        {"visible": DoFormant, "__type__": "update"},
        {"visible": DoFormant, "__type__": "update"},
        {"visible": DoFormant, "__type__": "update"},
    )
def preset_apply(preset, qfer, tmbr):
    """Load quefrency/timbre from a preset file and apply them.

    Args:
        preset: Path of the preset file; ``''`` means "no preset".
        qfer: Quefrency to return when no preset is given.
        tmbr: Timbre to return when no preset is given.

    Returns:
        A pair of Gradio update dicts with the resulting quefrency and timbre.
        With a preset, the quefrency is the first line of the file (newline
        removed) and the timbre is the second line exactly as read.
    """
    preset_path = str(preset)
    if preset_path != '':
        with open(preset_path, 'r') as handle:
            lines = handle.readlines()
        qfer = lines[0].split('\n')[0]
        tmbr = lines[1]
        formant_apply(qfer, tmbr)
    return ({"value": qfer, "__type__": "update"}, {"value": tmbr, "__type__": "update"})
def update_fshift_presets(preset, qfrency, tmbre):
    """Refresh the preset list and re-apply the selected preset.

    Args:
        preset: Path of the selected preset file, or ``''`` for none.
        qfrency: Current quefrency, kept when no preset is selected.
        tmbre: Current timbre, kept when no preset is selected.

    Returns:
        Three Gradio update dicts: the refreshed preset choices, the
        quefrency value and the timbre value.
    """
    # preset_apply returns update dicts, not raw values. The original rebound
    # qfrency/tmbre to those dicts, so with an empty preset it returned a dict
    # nested inside "value"; it also read and applied the preset a second
    # time. Unwrap the values instead.
    qfrency_update, tmbre_update = preset_apply(preset, qfrency, tmbre)
    return (
        {"choices": get_fshift_presets(), "__type__": "update"},
        {"value": qfrency_update["value"], "__type__": "update"},
        {"value": tmbre_update["value"], "__type__": "update"},
    )
# i18n = I18nAuto()
#i18n.print()
# Detect NVIDIA GPUs that can be used for training and accelerated inference.
ngpu = torch.cuda.device_count()
gpu_infos = []  # "<index>\t<name>" for every supported device
gpu_ids = []  # device indices of the supported devices, as strings
mem = []  # total memory of every supported device, in (rounded) GiB
# Substrings of device names considered supported,
# e.g. A10, A100, V100, A40, P40, M40, K80, A4500.
_SUPPORTED_GPU_TAGS = (
    "10", "16", "20", "30", "40", "A2", "A3", "A4", "P4", "A50", "A60",
    "70", "80", "90", "M4", "T4", "TITAN",
)
if_gpu_ok = False
if torch.cuda.is_available() and ngpu != 0:
    for i in range(ngpu):
        gpu_name = torch.cuda.get_device_name(i)
        # Upper-casing does not affect the purely numeric tags.
        upper_name = gpu_name.upper()
        if any(tag in upper_name for tag in _SUPPORTED_GPU_TAGS):
            if_gpu_ok = True  # at least one usable NVIDIA card
            gpu_infos.append("%s\t%s" % (i, gpu_name))
            gpu_ids.append(str(i))
            mem.append(
                int(
                    torch.cuda.get_device_properties(i).total_memory
                    / 1024
                    / 1024
                    / 1024
                    + 0.4
                )
            )
if if_gpu_ok and len(gpu_infos) > 0:
    gpu_info = "\n".join(gpu_infos)
    default_batch_size = min(mem) // 2
else:
    gpu_info = "test"
    default_batch_size = 1
# Join the real device indices. The original took the first character of each
# info string, which truncated indices >= 10 (e.g. "10" became "1").
gpus = "-".join(gpu_ids)
from lib.infer_pack.models import (
SynthesizerTrnMs256NSFsid,
SynthesizerTrnMs256NSFsid_nono,
SynthesizerTrnMs768NSFsid,
SynthesizerTrnMs768NSFsid_nono,
)
import soundfile as sf
import logging
from vc_infer_pipeline import VC
from config import Config
# Runtime configuration object; this file reads device, is_half, python_cmd
# and noparallel from it.
config = Config()
# from trainset_preprocess_pipeline import PreProcess
logging.getLogger("numba").setLevel(logging.WARNING)
# Model handles, populated on demand by load_hubert() / load_voicecraft().
hubert_model = None
voicecraft_model = None
voicecraft_config = None
phn2num = None
# Maps a file name to a username; change_choices() uses it to filter what a
# given user may see.
associated_links = {}
def load_hubert():
    """Build the HuBERT model, load its weights and publish it globally.

    The model is constructed from a default ``HubertConfig`` with hidden
    states enabled, loaded from ``hubert_base_hf_statedict.pt``, moved to
    ``config.device`` and cast to half or full precision per
    ``config.is_half``.

    The global ``hubert_model`` is only assigned once everything succeeded.
    Previously it was assigned before the weights were loaded, so a failed
    load left a randomly initialised model cached and later calls (which only
    load when the global is ``None``) would silently use it.
    """
    global hubert_model
    hubert_cfg = HubertConfig()
    hubert_cfg.output_hidden_states = True
    model = HubertModel(hubert_cfg)
    # Load onto the CPU first so a checkpoint saved from a CUDA device can
    # still be read on a machine without one; .to() below moves it.
    model.load_state_dict(torch.load('hubert_base_hf_statedict.pt', map_location="cpu"))
    model = model.to(config.device)
    model = model.half() if config.is_half else model.float()
    model.eval()
    hubert_model = model
    # Legacy fairseq loading path, kept for reference:
    # models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
    #     ["hubert_base.pt"],
    #     suffix="",
    # )
    # hubert_model = models[0]
def load_voicecraft():
    """Download (if needed) and load the VoiceCraft model into module globals.

    Sets ``voicecraft_model``, ``voicecraft_config`` and ``phn2num`` from the
    ``giga330M.pth`` checkpoint and makes sure the encodec checkpoint is
    present under ``./pretrained_models``.

    Raises:
        subprocess.CalledProcessError: If a ``wget`` download fails.
    """
    global voicecraft_model, phn2num, voicecraft_config
    from lib.voicecraft.models import voicecraft

    voicecraft_name = "giga330M.pth"
    encodec_name = "encodec_4cb2048_giga.th"
    model_dir = "./pretrained_models"
    base_url = "https://huggingface.co/pyp1/VoiceCraft/resolve/main"
    ckpt_fn = f"{model_dir}/{voicecraft_name}"
    encodec_fn = f"{model_dir}/{encodec_name}"

    # The original relied on the directory already existing and on shell
    # commands with invalid "\?" / "\=" escapes whose failures were ignored.
    os.makedirs(model_dir, exist_ok=True)
    downloads = (
        (f"{base_url}/{voicecraft_name}?download=true", ckpt_fn),
        (f"{base_url}/{encodec_name}", encodec_fn),
    )
    for url, dest in downloads:
        if os.path.exists(dest):
            continue
        partial = dest + ".part"
        # Argument list (no shell): nothing needs escaping, failures raise.
        subprocess.run(["wget", "-O", partial, url], check=True)
        # Only a completed download becomes visible under the final name.
        os.replace(partial, dest)

    ckpt = torch.load(ckpt_fn, map_location="cpu")
    voicecraft_config = ckpt["config"]
    voicecraft_model = voicecraft.VoiceCraft(ckpt["config"])
    voicecraft_model.load_state_dict(ckpt["model"])
    voicecraft_model.to(config.device)
    voicecraft_model.eval()
    phn2num = ckpt['phn2num']
# Directories holding voice model weights and retrieval indexes.
weight_root = "weights"
index_root = "logs"
# All model checkpoints directly inside the weights directory.
names = [fname for fname in os.listdir(weight_root) if fname.endswith(".pth")]
# All ".index" files below the logs directory whose name does not contain
# "trained".
index_paths = [
    "%s/%s" % (root, fname)
    for root, dirs, files in os.walk(index_root, topdown=False)
    for fname in files
    if fname.endswith(".index") and "trained" not in fname
]
def vc_single(
    sid,
    input_audio_path,
    f0_up_key,
    f0_file,
    f0_method,
    file_index,
    # file_index2,
    # file_big_npy,
    index_rate,
    filter_radius,
    resample_sr,
    rms_mix_rate,
    protect,
    crepe_hop_length,
):  # spk_item, input_audio0, vc_transform0,f0_file,f0method0
    """Convert one audio file with the currently loaded voice model.

    Args:
        sid: Speaker id passed to the pipeline.
        input_audio_path: Path of the audio to convert; ``None`` aborts.
        f0_up_key: Pitch shift; converted with ``int()``.
        f0_file: Optional f0 file forwarded to the pipeline.
        f0_method: Name of the pitch extraction method.
        file_index: Path of the index file; surrounding spaces, quotes and
            newlines are stripped and "trained" is replaced by "added".
        index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length:
            Forwarded unchanged to ``vc.pipeline``.
        resample_sr: Requested output sample rate; values below 16000 leave
            the model's sample rate in effect.

    Returns:
        ``(message, (sample_rate, audio))`` on success,
        ``(message, None)`` when no audio was given, or
        ``(traceback_text, (None, None))`` when conversion failed.
    """
    global tgt_sr, net_g, vc, hubert_model, version
    if input_audio_path is None:
        return "You need to upload an audio", None
    f0_up_key = int(f0_up_key)
    try:
        audio = load_audio(input_audio_path, 16000, DoFormant, Quefrency, Timbre)
        # Normalise only when the peak would exceed 0.95.
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max
        times = [0, 0, 0]
        if hubert_model is None:
            load_hubert()
        if_f0 = cpt.get("f0", 1)
        # Guard against copy/paste mistakes: strip stray characters and
        # automatically point at the "added" index instead of "trained".
        file_index = (
            file_index.strip(" ")
            .strip('"')
            .strip("\n")
            .strip('"')
            .strip(" ")
            .replace("trained", "added")
        )
        # file_big_npy = (
        #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        # )
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            sid,
            audio,
            input_audio_path,
            times,
            f0_up_key,
            f0_method,
            file_index,
            # file_big_npy,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            crepe_hop_length,
            f0_file=f0_file,
        )
        # Report the resampled rate for THIS call only. The original assigned
        # it to the global tgt_sr, which permanently replaced the loaded
        # model's sample rate and corrupted every later conversion.
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            output_sr = resample_sr
        else:
            output_sr = tgt_sr
        index_info = (
            "Using index:%s." % file_index
            if os.path.exists(file_index)
            else "Index not used."
        )
        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
            index_info,
            times[0],
            times[1],
            times[2],
        ), (output_sr, audio_opt)
    except Exception:
        info = traceback.format_exc()
        print(info)
        return info, (None, None)
def vc_multi(
    sid,
    dir_path,
    opt_root,
    paths,
    f0_up_key,
    f0_method,
    file_index,
    file_index2,
    # file_big_npy,
    index_rate,
    filter_radius,
    resample_sr,
    rms_mix_rate,
    protect,
    format1,
    crepe_hop_length,
):
    """Convert a batch of audio files, yielding a growing progress log.

    Args:
        sid: Speaker id forwarded to ``vc_single``.
        dir_path: Directory whose files are converted; when empty, ``paths``
            is used instead.
        opt_root: Output directory (created if missing).
        paths: Uploaded file objects; their ``name`` attribute is the path.
        format1: Output format; "wav"/"flac" are written directly, anything
            else is written as wav and then converted with ffmpeg.
        file_index2: Unused.
        Remaining arguments are forwarded to ``vc_single``.

    Yields:
        The accumulated "<file>-><status>" lines after each file and once
        more at the end, or a traceback text if the batch itself failed.
    """
    try:
        # Tolerate pasted paths wrapped in spaces, quotes or newlines.
        dir_path = dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        os.makedirs(opt_root, exist_ok=True)
        try:
            if dir_path != "":
                paths = [os.path.join(dir_path, name) for name in os.listdir(dir_path)]
            else:
                paths = [path.name for path in paths]
        except Exception:
            traceback.print_exc()
            paths = [path.name for path in paths]
        infos = []
        for path in paths:
            info, opt = vc_single(
                sid,
                path,
                f0_up_key,
                None,
                f0_method,
                file_index,
                # file_big_npy,
                index_rate,
                filter_radius,
                resample_sr,
                rms_mix_rate,
                protect,
                crepe_hop_length,
            )
            if "Success" in info:
                try:
                    tgt_sr, audio_opt = opt
                    if format1 in ["wav", "flac"]:
                        sf.write(
                            "%s/%s.%s" % (opt_root, os.path.basename(path), format1),
                            audio_opt,
                            tgt_sr,
                        )
                    else:
                        path = "%s/%s.wav" % (opt_root, os.path.basename(path))
                        sf.write(
                            path,
                            audio_opt,
                            tgt_sr,
                        )
                        if os.path.exists(path):
                            # Argument list instead of a shell string: paths
                            # containing spaces or shell metacharacters are
                            # passed through verbatim. The argument order is
                            # the same as in the original command line.
                            subprocess.run(
                                [
                                    "ffmpeg",
                                    "-i",
                                    path,
                                    "-vn",
                                    path[:-4] + ".%s" % format1,
                                    "-q:a",
                                    "2",
                                    "-y",
                                ]
                            )
                except Exception:
                    info += traceback.format_exc()
            infos.append("%s->%s" % (os.path.basename(path), info))
            yield "\n".join(infos)
        yield "\n".join(infos)
    # "except Exception" rather than a bare except: a bare except would also
    # catch GeneratorExit when the consumer closes the generator and then
    # yield again, which raises RuntimeError.
    except Exception:
        yield traceback.format_exc()
# Only one voice (timbre) can be loaded globally per tab.
def get_vc(sid):
    """Load the voice model named ``sid``, or unload the current one.

    With an empty ``sid`` ("" or []) the currently loaded globals are dropped
    and the CUDA cache is emptied. Otherwise the checkpoint
    ``<weight_root>/<sid>`` is loaded and the globals ``cpt``, ``tgt_sr``,
    ``net_g``, ``vc``, ``n_spk`` and ``version`` are set.

    Returns:
        A Gradio update dict; after loading it also carries ``maximum`` set
        to the number of speakers in the checkpoint.
    """
    global n_spk, tgt_sr, net_g, vc, cpt, version
    if sid == "" or sid == []:
        global hubert_model
        if hubert_model != None:  # Polling can call this repeatedly; only clean up when switching from a loaded model to none.
            print("clean_empty_cache")
            del net_g, n_spk, vc, hubert_model, tgt_sr  # ,cpt
            hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            # Original author's note: without the rebuild-and-delete below,
            # the cleanup is not thorough.
            if_f0 = cpt.get("f0", 1)
            version = cpt.get("version", "v1")
            if version == "v1":
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs256NSFsid(
                        *cpt["config"], is_half=config.is_half
                    )
                else:
                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
            elif version == "v2":
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs768NSFsid(
                        *cpt["config"], is_half=config.is_half
                    )
                else:
                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
            del net_g, cpt
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            cpt = None
        return {"visible": False, "__type__": "update"}
    person = "%s/%s" % (weight_root, sid)
    print("loading %s" % person)
    cpt = torch.load(person, map_location="cpu")
    # The last config entry is used as the model's sample rate.
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    version = cpt.get("version", "v1")
    # Pick the synthesizer class from the checkpoint version and whether it
    # uses f0. NOTE(review): any other version string leaves net_g unset here.
    if version == "v1":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
        else:
            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    elif version == "v2":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
        else:
            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
    # enc_q is removed before loading weights; strict=False tolerates the
    # resulting key mismatch.
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(config.device)
    if config.is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]
    return {"visible": False, "maximum": n_spk, "__type__": "update"}
def change_choices(username=None):
    """Rescan the weights and index directories for selectable files.

    Args:
        username: When given, only files whose name maps to this user in
            ``associated_links`` are listed; ``None`` lists everything.

    Returns:
        Two Gradio update dicts with the sorted model names and the sorted
        index paths as ``choices``.
    """
    print(associated_links)

    def _allowed(fname):
        # No user filter, or the file is linked to the requesting user.
        return username is None or associated_links.get(fname) == username

    model_names = [
        fname
        for fname in os.listdir(weight_root)
        if fname.endswith(".pth") and _allowed(fname)
    ]
    found_indexes = [
        "%s/%s" % (root, fname)
        for root, dirs, files in os.walk(index_root, topdown=False)
        for fname in files
        if fname.endswith(".index") and "trained" not in fname and _allowed(fname)
    ]
    models_update = {"choices": sorted(model_names), "__type__": "update"}
    indexes_update = {"choices": sorted(found_indexes), "__type__": "update"}
    return models_update, indexes_update
def clean():
    """Return a Gradio update dict that resets a component's value to ``""``."""
    update = {"value": ""}
    update["__type__"] = "update"
    return update
# Maps the sample-rate labels used by the UI to their value in Hz.
sr_dict = {
    "32k": 32000,
    "40k": 40000,
    "48k": 48000,
}
def if_done(done, p):
    """Block until process ``p`` has exited, then set ``done[0]`` to True.

    Args:
        done: One-element list used as a mutable completion flag.
        p: Object with a ``poll()`` method returning ``None`` while running.
    """
    while p.poll() is None:
        sleep(0.5)
    done[0] = True
def if_done_multi(done, ps):
    """Block until every process in ``ps`` has exited, then flag completion.

    Args:
        done: One-element list used as a mutable completion flag.
        ps: Iterable of objects with a ``poll()`` method that returns
            ``None`` while the process is still running.
    """
    # any() stops at the first process that is still running, exactly like
    # the original "break"; the whole list is then re-checked after a pause.
    while any(proc.poll() is None for proc in ps):
        sleep(0.5)
    done[0] = True
def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
    """Run the dataset preprocessing script and stream its log.

    Args:
        trainset_dir: Directory with the training audio.
        exp_dir: Experiment name; logs go to ``<now_dir>/logs/<exp_dir>``.
        sr: Sample-rate label, looked up in ``sr_dict``.
        n_p: Number of worker processes passed to the script.

    Yields:
        The current content of ``preprocess.log`` once per second while the
        script runs, and one final time after it finished.
    """
    sr = sr_dict[sr]
    log_path = "%s/logs/%s/preprocess.log" % (now_dir, exp_dir)
    os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
    # Start from an empty log file.
    open(log_path, "w").close()
    script_args = " trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s " % (
        trainset_dir,
        sr,
        n_p,
        now_dir,
        exp_dir,
    )
    # NOTE(review): the command is run through the shell with values taken
    # from the UI inserted verbatim.
    cmd = config.python_cmd + script_args + str(config.noparallel)
    print(cmd)
    proc = Popen(cmd, shell=True)  # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
    # Original author's note: with Gradio, reading the Popen output only
    # delivers everything at the end, so the log file is polled instead.
    done = [False]
    watcher = threading.Thread(target=if_done, args=(done, proc))
    watcher.start()
    while True:
        with open(log_path, "r") as f:
            yield (f.read())
        sleep(1)
        if done[0]:
            break
    with open(log_path, "r") as f:
        log = f.read()
    print(log)
    yield log
# but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2])
def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, echl):
    """Run pitch extraction (optional) and feature extraction, streaming logs.

    Args:
        gpus: Dash-separated GPU ids; one feature-extraction process is
            started per id.
        n_p: Number of processes for the f0 script.
        f0method: Pitch extraction method name.
        if_f0: When truthy, the f0 extraction script is run first.
        exp_dir: Experiment name; logs go to ``<now_dir>/logs/<exp_dir>``.
        version19: Model version string passed to the feature script.
        echl: Extra argument appended to the f0 command (named after the
            crepe hop length control).

    Yields:
        The content of ``extract_f0_feature.log`` once per second during each
        phase, plus a final read at the end of each phase.
    """
    gpus = gpus.split("-")
    log_path = "%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir)
    os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
    # Start from an empty log file.
    open(log_path, "w").close()

    def _follow_log(done):
        # Original author's note: with Gradio, reading the Popen output only
        # delivers everything at the end, so the log file is polled instead.
        while True:
            with open(log_path, "r") as f:
                yield (f.read())
            sleep(1)
            if done[0]:
                break
        with open(log_path, "r") as f:
            log = f.read()
        print(log)
        yield log

    if if_f0:
        cmd = config.python_cmd + " extract_f0_print.py %s/logs/%s %s %s %s" % (
            now_dir,
            exp_dir,
            n_p,
            f0method,
            echl,
        )
        print(cmd)
        f0_proc = Popen(cmd, shell=True, cwd=now_dir)  # , stdin=PIPE, stdout=PIPE,stderr=PIPE
        f0_done = [False]
        threading.Thread(target=if_done, args=(f0_done, f0_proc)).start()
        yield from _follow_log(f0_done)

    # Start one feature-extraction process per part. The script's arguments
    # are: device, number of parts, part index, gpu id, experiment directory
    # and version.
    leng = len(gpus)
    ps = []
    for idx, n_g in enumerate(gpus):
        cmd = config.python_cmd + " extract_feature_print.py %s %s %s %s %s/logs/%s %s" % (
            config.device,
            leng,
            idx,
            n_g,
            now_dir,
            exp_dir,
            version19,
        )
        print(cmd)
        ps.append(Popen(cmd, shell=True, cwd=now_dir))
    feature_done = [False]
    threading.Thread(target=if_done_multi, args=(feature_done, ps)).start()
    yield from _follow_log(feature_done)
def change_sr2(sr2, if_f0_3, version19):
    """Pick the pretrained G/D checkpoints for the selected sample rate.

    Args:
        sr2: Sample-rate label used in the file name (e.g. "40k").
        if_f0_3: Truthy to select the "f0" variants.
        version19: "v1" selects ``pretrained/``, anything else
            ``pretrained_v2/``.

    Returns:
        ``(generator_path, discriminator_path, update)`` where a path is
        ``""`` when the file does not exist and ``update`` makes a component
        visible.
    """
    folder_suffix = "_v2" if version19 != "v1" else ""
    f0_prefix = "f0" if if_f0_3 else ""
    gen_path = "pretrained%s/%sG%s.pth" % (folder_suffix, f0_prefix, sr2)
    dis_path = "pretrained%s/%sD%s.pth" % (folder_suffix, f0_prefix, sr2)
    have_gen = os.access(gen_path, os.F_OK)
    have_dis = os.access(dis_path, os.F_OK)
    if not have_gen:
        print(gen_path, "not exist, will not use pretrained model")
    if not have_dis:
        print(dis_path, "not exist, will not use pretrained model")
    return (
        gen_path if have_gen else "",
        dis_path if have_dis else "",
        {"visible": True, "__type__": "update"},
    )
def change_version19(sr2, if_f0_3, version19):
    """Pick the pretrained G/D checkpoints for the selected model version.

    Args:
        sr2: Sample-rate label used in the file name (e.g. "40k").
        if_f0_3: Truthy to select the "f0" variants.
        version19: "v1" selects ``pretrained/``, anything else
            ``pretrained_v2/``.

    Returns:
        ``(generator_path, discriminator_path)``; a path is ``""`` when the
        file does not exist.
    """
    folder_suffix = "_v2" if version19 != "v1" else ""
    f0_prefix = "f0" if if_f0_3 else ""
    gen_path = "pretrained%s/%sG%s.pth" % (folder_suffix, f0_prefix, sr2)
    dis_path = "pretrained%s/%sD%s.pth" % (folder_suffix, f0_prefix, sr2)
    have_gen = os.access(gen_path, os.F_OK)
    have_dis = os.access(dis_path, os.F_OK)
    if not have_gen:
        print(gen_path, "not exist, will not use pretrained model")
    if not have_dis:
        print(dis_path, "not exist, will not use pretrained model")
    return (
        gen_path if have_gen else "",
        dis_path if have_dis else "",
    )
def change_f0(if_f0_3, sr2, version19):  # f0method8,pretrained_G14,pretrained_D15
    """Pick the pretrained G/D checkpoints when the f0 option is toggled.

    Args:
        if_f0_3: Truthy to select the "f0" checkpoints, falsy for the
            plain ones.
        sr2: Sample-rate label used in the file name (e.g. "40k").
        version19: "v1" selects ``pretrained/``, anything else
            ``pretrained_v2/``.

    Returns:
        ``(update, generator_path, discriminator_path)``; ``update`` shows a
        component only when f0 is enabled, and a path is ``""`` when that
        file does not exist.
    """
    path_str = "" if version19 == "v1" else "_v2"
    f0_str = "f0" if if_f0_3 else ""
    # Check the files that will actually be returned. The original always
    # tested the "f0" checkpoints, so the non-f0 paths were returned (or
    # withheld) based on the wrong files.
    generator_path = "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
    discriminator_path = "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
    if_pretrained_generator_exist = os.access(generator_path, os.F_OK)
    if_pretrained_discriminator_exist = os.access(discriminator_path, os.F_OK)
    if not if_pretrained_generator_exist:
        print(generator_path, "not exist, will not use pretrained model")
    if not if_pretrained_discriminator_exist:
        print(discriminator_path, "not exist, will not use pretrained model")
    return (
        {"visible": True if if_f0_3 else False, "__type__": "update"},
        generator_path if if_pretrained_generator_exist else "",
        discriminator_path if if_pretrained_discriminator_exist else "",
    )
global log_interval
def set_log_interval(exp_dir, batch_size12):
    """Derive a logging interval from the number of 16k wav files.

    Args:
        exp_dir: Experiment directory containing ``1_16k_wavs``.
        batch_size12: Training batch size.

    Returns:
        ``1`` when the folder is missing or holds no ``.wav`` files;
        otherwise ``ceil(file_count / batch_size12)``, incremented by one
        whenever that quotient is greater than 1.
    """
    wav_dir = os.path.join(exp_dir, "1_16k_wavs")
    if not os.path.isdir(wav_dir):
        return 1
    wav_count = sum(1 for entry in os.listdir(wav_dir) if entry.endswith(".wav"))
    if wav_count == 0:
        return 1
    interval = math.ceil(wav_count / batch_size12)
    return interval + 1 if interval > 1 else interval
# but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16])
def click_train(
    exp_dir1,
    sr2,
    if_f0_3,
    spk_id5,
    save_epoch10,
    total_epoch11,
    batch_size12,
    if_save_latest13,
    pretrained_G14,
    pretrained_D15,
    gpus16,
    if_cache_gpu17,
    if_save_every_weights18,
    version19,
):
    """Write the training file list and run the training script to completion.

    Args:
        exp_dir1: Experiment name; data lives in ``<now_dir>/logs/<exp_dir1>``.
        sr2: Sample-rate label (e.g. "40k").
        if_f0_3: Truthy when the model uses pitch guidance.
        spk_id5: Speaker id written into every file-list entry.
        save_epoch10, total_epoch11, batch_size12: Training schedule options.
        if_save_latest13, if_cache_gpu17, if_save_every_weights18: Boolean
            options, passed to the script as 1/0.
        pretrained_G14, pretrained_D15: Pretrained checkpoint paths; ``""``
            omits the corresponding option.
        gpus16: GPU ids for ``-g``; when empty the option is omitted.
        version19: Model version ("v1" uses 256-dim features, else 768).

    Returns:
        A status message plus two Gradio visibility updates.
    """
    global PID
    CSVutil('csvdb/stop.csv', 'w+', 'formanting', False)
    # Build the file list.
    exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
    os.makedirs(exp_dir, exist_ok=True)
    gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir)
    feature_dir = (
        "%s/3_feature256" % (exp_dir)
        if version19 == "v1"
        else "%s/3_feature768" % (exp_dir)
    )
    log_interval = set_log_interval(exp_dir, batch_size12)
    # Only samples present in every required directory are used.
    names = {name.split(".")[0] for name in os.listdir(gt_wavs_dir)} & {
        name.split(".")[0] for name in os.listdir(feature_dir)
    }
    if if_f0_3:
        f0_dir = "%s/2a_f0" % (exp_dir)
        f0nsf_dir = "%s/2b-f0nsf" % (exp_dir)
        names = (
            names
            & {name.split(".")[0] for name in os.listdir(f0_dir)}
            & {name.split(".")[0] for name in os.listdir(f0nsf_dir)}
        )
    opt = []
    for name in names:
        if if_f0_3:
            opt.append(
                "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"
                % (
                    gt_wavs_dir.replace("\\", "\\\\"),
                    name,
                    feature_dir.replace("\\", "\\\\"),
                    name,
                    f0_dir.replace("\\", "\\\\"),
                    name,
                    f0nsf_dir.replace("\\", "\\\\"),
                    name,
                    spk_id5,
                )
            )
        else:
            opt.append(
                "%s/%s.wav|%s/%s.npy|%s"
                % (
                    gt_wavs_dir.replace("\\", "\\\\"),
                    name,
                    feature_dir.replace("\\", "\\\\"),
                    name,
                    spk_id5,
                )
            )
    fea_dim = 256 if version19 == "v1" else 768
    # Two "mute" entries are always appended to the list.
    if if_f0_3:
        for _ in range(2):
            opt.append(
                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
                % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5)
            )
    else:
        for _ in range(2):
            opt.append(
                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
                % (now_dir, sr2, now_dir, fea_dim, spk_id5)
            )
    shuffle(opt)
    with open("%s/filelist.txt" % exp_dir, "w") as f:
        f.write("\n".join(opt))
    print("write filelist done")
    # No config file needs to be generated.
    # cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0"
    print("use gpus:", gpus16)
    if pretrained_G14 == "":
        print("no pretrained Generator")
    if pretrained_D15 == "":
        print("no pretrained Discriminator")
    # One command template for both cases; "-g" is only present when GPUs
    # were given. Missing pretrained models contribute an empty string in
    # both cases: the original no-GPU branch inserted a literal backspace
    # character ("\b") there, which ended up in the shell command.
    gpu_option = ("-g %s " % gpus16) if gpus16 else ""
    cmd = (
        config.python_cmd
        + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s %s-te %s -se %s %s %s -l %s -c %s -sw %s -v %s -li %s"
        % (
            exp_dir1,
            sr2,
            1 if if_f0_3 else 0,
            batch_size12,
            gpu_option,
            total_epoch11,
            save_epoch10,
            ("-pg %s" % pretrained_G14) if pretrained_G14 != "" else "",
            ("-pd %s" % pretrained_D15) if pretrained_D15 != "" else "",
            1 if if_save_latest13 == True else 0,
            1 if if_cache_gpu17 == True else 0,
            1 if if_save_every_weights18 == True else 0,
            version19,
            log_interval,
        )
    )
    print(cmd)
    p = Popen(cmd, shell=True, cwd=now_dir)
    # Remember the training process id in the module-level PID.
    PID = p.pid
    p.wait()
    return ("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log", {"visible": False, "__type__": "update"}, {"visible": True, "__type__": "update"})
# but4.click(train_index, [exp_dir1], info3)
def train_index(exp_dir1, version19):
    """Build the faiss retrieval index for an experiment, yielding progress.

    Args:
        exp_dir1: Experiment name; data lives in ``<now_dir>/logs/<exp_dir1>``.
        version19: Model version ("v1" uses 256-dim features, else 768).

    Yields:
        The accumulated progress text after each stage, or a single message
        asking to run feature extraction first when no features exist.
    """
    exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
    os.makedirs(exp_dir, exist_ok=True)
    feature_dir = (
        "%s/3_feature256" % (exp_dir)
        if version19 == "v1"
        else "%s/3_feature768" % (exp_dir)
    )
    # This function is a generator, so the message must be yielded: the
    # original "return <message>" ended the generator without ever showing
    # the message to the caller.
    if not os.path.exists(feature_dir):
        yield "请先进行特征提取!"
        return
    listdir_res = list(os.listdir(feature_dir))
    if len(listdir_res) == 0:
        yield "请先进行特征提取!"
        return
    npys = []
    for name in sorted(listdir_res):
        phone = np.load("%s/%s" % (feature_dir, name))
        npys.append(phone)
    big_npy = np.concatenate(npys, 0)
    # Shuffle the rows before training the index.
    big_npy_idx = np.arange(big_npy.shape[0])
    np.random.shuffle(big_npy_idx)
    big_npy = big_npy[big_npy_idx]
    np.save("%s/total_fea.npy" % exp_dir, big_npy)
    # n_ivf = big_npy.shape[0] // 39
    n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
    infos = []
    infos.append("%s,%s" % (big_npy.shape, n_ivf))
    yield "\n".join(infos)
    index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf)
    # index = faiss.index_factory(256if version19=="v1"else 768, "IVF%s,PQ128x4fs,RFlat"%n_ivf)
    infos.append("training")
    yield "\n".join(infos)
    index_ivf = faiss.extract_index_ivf(index)  #
    index_ivf.nprobe = 1
    index.train(big_npy)
    faiss.write_index(
        index,
        "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )
    # faiss.write_index(index, '%s/trained_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
    infos.append("adding")
    yield "\n".join(infos)
    # Add the vectors in batches.
    batch_size_add = 8192
    for i in range(0, big_npy.shape[0], batch_size_add):
        index.add(big_npy[i : i + batch_size_add])
    faiss.write_index(
        index,
        "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )
    infos.append(
        "成功构建索引,added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
    )
    # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
    # infos.append("成功构建索引,added_IVF%s_Flat_FastScan_%s.index"%(n_ivf,version19))
    yield "\n".join(infos)
# but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)
def train1key(
    exp_dir1,
    sr2,
    if_f0_3,
    trainset_dir4,
    spk_id5,
    np7,
    f0method8,
    save_epoch10,
    total_epoch11,
    batch_size12,
    if_save_latest13,
    pretrained_G14,
    pretrained_D15,
    gpus16,
    if_cache_gpu17,
    if_save_every_weights18,
    version19,
    echl
):
    """Run the whole training pipeline in one go, yielding a progress log.

    Stages: dataset preprocessing, optional pitch extraction, feature
    extraction, model training and faiss index building. Every external
    script is started through the shell and waited for.

    Args:
        exp_dir1: Experiment name; data lives in ``<now_dir>/logs/<exp_dir1>``.
        sr2: Sample-rate label, looked up in ``sr_dict``.
        if_f0_3: Truthy when the model uses pitch guidance.
        trainset_dir4: Directory with the training audio.
        spk_id5: Speaker id written into every file-list entry.
        np7: Number of worker processes.
        f0method8: Pitch extraction method name.
        save_epoch10, total_epoch11, batch_size12: Training schedule options.
        if_save_latest13, if_cache_gpu17, if_save_every_weights18: Boolean
            options, passed to the training script as 1/0.
        pretrained_G14, pretrained_D15: Pretrained checkpoint paths; ``""``
            omits the corresponding option.
        gpus16: Dash-separated GPU ids.
        version19: Model version ("v1" uses 256-dim features, else 768).
        echl: Extra argument appended to the f0 command (named after the
            crepe hop length control).

    Yields:
        The accumulated progress text after every step.
    """
    infos = []

    def get_info_str(strr):
        # Append one line to the progress log and return the whole log.
        infos.append(strr)
        return "\n".join(infos)

    model_log_dir = "%s/logs/%s" % (now_dir, exp_dir1)
    preprocess_log_path = "%s/preprocess.log" % model_log_dir
    extract_f0_feature_log_path = "%s/extract_f0_feature.log" % model_log_dir
    gt_wavs_dir = "%s/0_gt_wavs" % model_log_dir
    feature_dir = (
        "%s/3_feature256" % model_log_dir
        if version19 == "v1"
        else "%s/3_feature768" % model_log_dir
    )
    os.makedirs(model_log_dir, exist_ok=True)

    ######### step 1: preprocess the dataset
    open(preprocess_log_path, "w").close()
    cmd = (
        config.python_cmd
        + " trainset_preprocess_pipeline_print.py %s %s %s %s "
        % (trainset_dir4, sr_dict[sr2], np7, model_log_dir)
        + str(config.noparallel)
    )
    yield get_info_str("step1: step 1")
    yield get_info_str(cmd)
    p = Popen(cmd, shell=True)
    p.wait()
    with open(preprocess_log_path, "r") as f:
        print(f.read())

    ######### step 2a: extract pitch
    # Truncate the log and close the handle right away; the original left
    # this file object open.
    open(extract_f0_feature_log_path, "w").close()
    if if_f0_3:
        yield get_info_str("step2a:正在提取音高")
        cmd = config.python_cmd + " extract_f0_print.py %s %s %s %s" % (
            model_log_dir,
            np7,
            f0method8,
            echl
        )
        yield get_info_str(cmd)
        p = Popen(cmd, shell=True, cwd=now_dir)
        p.wait()
        with open(extract_f0_feature_log_path, "r") as f:
            print(f.read())
    else:
        yield get_info_str("step2a:step2a")

    ####### step 2b: extract features, one process per GPU id
    yield get_info_str("step2b:step2b")
    gpus = gpus16.split("-")
    leng = len(gpus)
    ps = []
    for idx, n_g in enumerate(gpus):
        cmd = config.python_cmd + " extract_feature_print.py %s %s %s %s %s %s" % (
            config.device,
            leng,
            idx,
            n_g,
            model_log_dir,
            version19,
        )
        yield get_info_str(cmd)
        p = Popen(
            cmd, shell=True, cwd=now_dir
        )  # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
        ps.append(p)
    for p in ps:
        p.wait()
    with open(extract_f0_feature_log_path, "r") as f:
        print(f.read())

    ####### step 3a: train the model
    yield get_info_str("step3a:step3a")
    # Build the file list; only samples present in every required directory
    # are used.
    names = {name.split(".")[0] for name in os.listdir(gt_wavs_dir)} & {
        name.split(".")[0] for name in os.listdir(feature_dir)
    }
    if if_f0_3:
        f0_dir = "%s/2a_f0" % model_log_dir
        f0nsf_dir = "%s/2b-f0nsf" % model_log_dir
        names = (
            names
            & {name.split(".")[0] for name in os.listdir(f0_dir)}
            & {name.split(".")[0] for name in os.listdir(f0nsf_dir)}
        )
    opt = []
    for name in names:
        if if_f0_3:
            opt.append(
                "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"
                % (
                    gt_wavs_dir.replace("\\", "\\\\"),
                    name,
                    feature_dir.replace("\\", "\\\\"),
                    name,
                    f0_dir.replace("\\", "\\\\"),
                    name,
                    f0nsf_dir.replace("\\", "\\\\"),
                    name,
                    spk_id5,
                )
            )
        else:
            opt.append(
                "%s/%s.wav|%s/%s.npy|%s"
                % (
                    gt_wavs_dir.replace("\\", "\\\\"),
                    name,
                    feature_dir.replace("\\", "\\\\"),
                    name,
                    spk_id5,
                )
            )
    fea_dim = 256 if version19 == "v1" else 768
    # Two "mute" entries are always appended to the list.
    if if_f0_3:
        for _ in range(2):
            opt.append(
                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
                % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5)
            )
    else:
        for _ in range(2):
            opt.append(
                "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
                % (now_dir, sr2, now_dir, fea_dim, spk_id5)
            )
    shuffle(opt)
    with open("%s/filelist.txt" % model_log_dir, "w") as f:
        f.write("\n".join(opt))
    yield get_info_str("write filelist done")
    # One command template for both cases; "-g" is only present when GPUs
    # were given.
    gpu_option = ("-g %s " % gpus16) if gpus16 else ""
    cmd = (
        config.python_cmd
        + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s %s-te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
        % (
            exp_dir1,
            sr2,
            1 if if_f0_3 else 0,
            batch_size12,
            gpu_option,
            total_epoch11,
            save_epoch10,
            ("-pg %s" % pretrained_G14) if pretrained_G14 != "" else "",
            ("-pd %s" % pretrained_D15) if pretrained_D15 != "" else "",
            1 if if_save_latest13 == True else 0,
            1 if if_cache_gpu17 == True else 0,
            1 if if_save_every_weights18 == True else 0,
            version19,
        )
    )
    yield get_info_str(cmd)
    p = Popen(cmd, shell=True, cwd=now_dir)
    p.wait()
    yield get_info_str("training done, in train.log")

    ####### step 3b: train the retrieval index
    npys = []
    listdir_res = list(os.listdir(feature_dir))
    for name in sorted(listdir_res):
        phone = np.load("%s/%s" % (feature_dir, name))
        npys.append(phone)
    big_npy = np.concatenate(npys, 0)
    # Shuffle the rows before training the index.
    big_npy_idx = np.arange(big_npy.shape[0])
    np.random.shuffle(big_npy_idx)
    big_npy = big_npy[big_npy_idx]
    np.save("%s/total_fea.npy" % model_log_dir, big_npy)
    # n_ivf = big_npy.shape[0] // 39
    n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
    yield get_info_str("%s,%s" % (big_npy.shape, n_ivf))
    index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf)
    yield get_info_str("training index")
    index_ivf = faiss.extract_index_ivf(index)  #
    index_ivf.nprobe = 1
    index.train(big_npy)
    faiss.write_index(
        index,
        "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )
    yield get_info_str("adding index")
    # Add the vectors in batches.
    batch_size_add = 8192
    for i in range(0, big_npy.shape[0], batch_size_add):
        index.add(big_npy[i : i + batch_size_add])
    faiss.write_index(
        index,
        "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
    )
    yield get_info_str(
        "成功构建索引, added_IVF%s_Flat_nprobe_%s_%s_%s.index"
        % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
    )
    yield get_info_str("yes!")
def whethercrepeornah(radio):
    """Build a Gradio update that shows a component only for mangio-crepe methods.

    Args:
        radio: The currently selected pitch-extraction method name.

    Returns:
        A Gradio update dict whose ``visible`` flag is True when ``radio`` is
        ``'mangio-crepe'`` or ``'mangio-crepe-tiny'`` and False otherwise.
    """
    is_mangio_crepe = radio in ('mangio-crepe', 'mangio-crepe-tiny')
    return {"visible": is_mangio_crepe, "__type__": "update"}
# ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
def change_info_(ckpt_path):
    """Read training metadata from the ``train.log`` next to a checkpoint.

    Args:
        ckpt_path: Path to a checkpoint file. The log is looked up as
            ``train.log`` in the same directory.

    Returns:
        ``(sample_rate, str(if_f0), version)`` parsed from the last
        tab-separated field of the log's first line, where ``version`` is
        ``"v2"`` only if the log says so and ``"v1"`` otherwise. If the log is
        missing or cannot be parsed, three no-op Gradio update dicts are
        returned instead.
    """
    # Build the sibling path from the directory. The previous
    # ckpt_path.replace(basename, "train.log") rewrote *every* occurrence of
    # the basename, which breaks when a parent directory shares that name.
    log_path = os.path.join(os.path.dirname(ckpt_path), "train.log")
    if not os.path.exists(log_path):
        return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
    try:
        with open(log_path, "r") as f:
            first_line = f.read().strip("\n").split("\n")[0]
        # NOTE(review): eval() executes whatever is in train.log. This is only
        # safe while the log is produced by our own training run; consider
        # ast.literal_eval once the log format is confirmed to be a literal.
        info = eval(first_line.split("\t")[-1])
        sr, f0 = info["sample_rate"], info["if_f0"]
        version = "v2" if ("version" in info and info["version"] == "v2") else "v1"
        return sr, str(f0), version
    except Exception:
        # Malformed log: report it, but leave the UI fields untouched.
        traceback.print_exc()
        return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
from lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
def export_onnx(ModelPath, ExportedPath, MoeVS=True):
    """Export a checkpoint to an ONNX file.

    The checkpoint is loaded on CPU, a ``SynthesizerTrnMsNSFsidM`` is built
    from its ``config`` list, the weights are loaded non-strictly, and the
    network is traced with random dummy inputs via ``torch.onnx.export``
    (opset 16).

    Args:
        ModelPath: Path of the checkpoint to load with ``torch.load``.
        ExportedPath: Destination path handed to ``torch.onnx.export``.
        MoeVS: Accepted for interface compatibility; not used in this body.

    Returns:
        The string ``"Finished"``.
    """
    checkpoint = torch.load(ModelPath, map_location="cpu")
    # n_spk: taken from the number of rows of the speaker-embedding weight
    checkpoint["config"][-3] = checkpoint["weight"]["emb_g.weight"].shape[0]
    model_version = checkpoint.get("version", "v1")
    # hidden channels: 256 for v1, otherwise 768 (prepared for 768-dim vectors)
    feature_dim = 256 if model_version == "v1" else 768

    # Dummy inputs; created in the same order as before so RNG use is unchanged.
    dummy_phone = torch.rand(1, 200, feature_dim)  # hidden unit
    dummy_phone_lengths = torch.tensor([200]).long()  # hidden unit length (seems to be unused)
    dummy_pitch = torch.randint(size=(1, 200), low=5, high=255)  # fundamental frequency (in Hz)
    dummy_pitchf = torch.rand(1, 200)  # nsf fundamental frequency
    dummy_speaker = torch.LongTensor([0])  # speaker ID
    dummy_noise = torch.rand(1, 192, 200)  # noise (adds a random factor)

    export_device = "cpu"  # device used during export (does not affect how the model is used)

    # fp32 export (supporting fp16 from C++ would require manually
    # re-arranging memory, so fp16 is not used for now)
    synthesizer = SynthesizerTrnMsNSFsidM(
        *checkpoint["config"], is_half=False, version=model_version
    )
    synthesizer.load_state_dict(checkpoint["weight"], strict=False)

    traced_inputs = tuple(
        tensor.to(export_device)
        for tensor in (
            dummy_phone,
            dummy_phone_lengths,
            dummy_pitch,
            dummy_pitchf,
            dummy_speaker,
            dummy_noise,
        )
    )
    # synthesizer.construct_spkmixmap(n_speaker)  # multi-speaker mix-track export
    torch.onnx.export(
        synthesizer,
        traced_inputs,
        ExportedPath,
        dynamic_axes={
            "phone": [1],
            "pitch": [1],
            "pitchf": [1],
            "rnd": [2],
        },
        do_constant_folding=False,
        opset_version=16,
        verbose=False,
        input_names=["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"],
        output_names=["audio"],
    )
    return "Finished"
#region RVC WebUI App
def get_presets():
    """Return the ``name`` of every entry under ``presets`` in ``../inference-presets.json``.

    The path is relative to the current working directory.
    """
    with open('../inference-presets.json', 'r') as preset_file:
        presets_doc = json.load(preset_file)
    return [entry['name'] for entry in presets_doc['presets']]
def change_choices2():
    """Rescan ``./audios`` and build Gradio updates for the audio dropdown.

    Returns:
        A pair of Gradio update dicts: the first carries the sorted list of
        audio file paths (forward slashes) as ``choices``; the second is a
        no-op update.
    """
    audio_suffixes = ('.wav','.mp3','.ogg','.flac','.m4a','.aac','.mp4')
    found = [
        os.path.join('./audios', entry).replace('\\', '/')
        for entry in os.listdir("./audios")
        if entry.endswith(audio_suffixes)
    ]
    found.sort()
    return {"choices": found, "__type__": "update"}, {"__type__": "update"}
# Import-time snapshot of the audio files found in ./audios (paths use
# forward slashes); read by get_name() and the audio dropdown.
audio_files = [
    os.path.join('./audios', filename).replace('\\', '/')
    for filename in os.listdir("./audios")
    if filename.endswith(('.wav','.mp3','.ogg','.flac','.m4a','.aac','.mp4'))
]
def get_index():
    """Return the path of an ``.index`` file for the alphabetically first model.

    The model name is the part of the first entry of ``sorted(names)`` before
    its first dot; the index is searched in ``./logs/<model name>``.

    Returns:
        The path of the first ``.index`` file listed in that folder, or ``''``
        when there are no models, the folder is missing, or it holds no
        ``.index`` file. Every path now returns a string; previously one
        branch could fall through and return None.
    """
    if not names:
        return ''
    chosen_model = sorted(names)[0].split(".")[0]
    logs_path = "./logs/" + chosen_model
    # isdir (rather than exists) so a stray file of that name cannot make
    # os.listdir raise.
    if not os.path.isdir(logs_path):
        return ''
    for file in os.listdir(logs_path):
        if file.endswith(".index"):
            return os.path.join(logs_path, file)
    return ''
def get_indexes():
    """Collect every ``.index`` file found anywhere under ``./logs/``.

    Returns:
        A list of paths in ``os.walk`` order, or the empty string ``''`` when
        no index file exists.
    """
    indexes_list = [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk("./logs/")
        for filename in filenames
        if filename.endswith(".index")
    ]
    return indexes_list if indexes_list else ''
def get_name():
    """Return the alphabetically first entry of ``audio_files``, or ``''`` if it is empty."""
    return sorted(audio_files)[0] if audio_files else ''
def save_to_wav(record_button):
    """Move a recorded audio file into ``./audios`` under a timestamped name.

    Args:
        record_button: Path of the recorded file, or None when nothing was
            recorded.

    Returns:
        The new path ``./audios/<YYYY-mm-dd_HH-MM-SS>.wav``, or None when
        ``record_button`` is None.
    """
    if record_button is None:
        return None
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    new_path = './audios/' + timestamp + '.wav'
    shutil.move(record_button, new_path)
    return new_path
def save_to_wav2(dropbox):
    """Copy an uploaded file into ``./audios`` and delete the upload.

    Args:
        dropbox: Object whose ``name`` attribute is the uploaded file's path.

    Returns:
        The destination path inside ``./audios`` (same base name as the upload).
    """
    uploaded_path = dropbox.name
    destination_path = os.path.join('./audios', os.path.basename(uploaded_path))
    # copy2 followed by remove: the upload is gone only after the copy succeeded
    shutil.copy2(uploaded_path, destination_path)
    os.remove(uploaded_path)
    return destination_path
def match_index(sid0):
    """Find the ``.index`` file that belongs to the selected model.

    Args:
        sid0: Selected model file name. The part before its first dot names
            the folder under ``./logs/`` that is searched.

    Returns:
        The path of the first ``.index`` file listed in that folder, or ``''``
        when nothing is selected, the folder is missing, or it contains no
        ``.index`` file. The last case used to fall through and return None.
    """
    if not sid0:
        # Nothing selected: there is nothing to match against.
        return ''
    folder = sid0.split(".")[0]
    parent_dir = "./logs/" + folder
    if os.path.isdir(parent_dir):
        for filename in os.listdir(parent_dir):
            if filename.endswith(".index"):
                return os.path.join(parent_dir, filename)
    return ''
def check_for_name():
    """Return the alphabetically first entry of ``names``, or ``''`` if it is empty."""
    return sorted(names)[0] if names else ''
def download_from_url(url, model, associated_user=None):
    """Validate a model download request and reset the zip staging folders.

    Args:
        url: Link to a zip archive containing the model files.
        model: Name to store the model under.
        associated_user: Optional user recorded in ``associated_links`` for
            the installed files.

    Returns:
        A validation message when ``url`` or ``model`` is empty or
        whitespace-only. Otherwise None: see the NOTE in the body, the
        download itself is currently switched off by an unconditional return.
    """
    # Strip before validating so whitespace-only input is rejected as well
    # (the check used to run on the raw value).
    url = (url or '').strip()
    if url == '':
        return "URL cannot be left empty."
    if not model or model.strip() == '':
        return "You need to name your model. For example: My-Model"

    # Start from empty staging folders.
    for directory in ("zips", "unzips"):
        if os.path.exists(directory):
            shutil.rmtree(directory)
    os.makedirs("zips", exist_ok=True)
    os.makedirs("unzips", exist_ok=True)
    zipfile_path = './zips/' + model + '.zip'

    # NOTE(review): this unconditional return was already present and makes
    # everything below unreachable, so no download ever happens and the UI
    # status bar receives None. It is kept on purpose: removing it would make
    # the module-level download_from_url(...) calls hit the network at import.
    # Confirm whether downloads should be re-enabled, then delete this return.
    return

    try:
        # Both substrings must be tested against `url`; the bare string
        # literal used before was always truthy, so the Mega and wget
        # branches could never run.
        if "drive.google.com" in url or "drive.usercontent.google.com" in url:
            subprocess.run(["gdown", url, "--fuzzy", "-O", zipfile_path])
        elif "mega.nz" in url:
            m = Mega()
            m.download_url(url, './zips')
        else:
            subprocess.run(["wget", url, "-O", zipfile_path])

        # Unpack every archive that landed in ./zips (Mega picks its own file
        # name); only report failure when none was found at all.
        found_zip = False
        for filename in os.listdir("./zips"):
            if filename.endswith(".zip"):
                found_zip = True
                zipfile_path = os.path.join("./zips/", filename)
                shutil.unpack_archive(zipfile_path, "./unzips", 'zip')
        if not found_zip:
            return "No zipfile found."

        for root, dirs, files in os.walk('./unzips'):
            for file in files:
                file_path = os.path.join(root, file)
                if file.endswith(".index"):
                    # exist_ok: re-downloading a model must not fail on the
                    # folder left by a previous run.
                    os.makedirs(f'./logs/{model}', exist_ok=True)
                    shutil.copy2(file_path, f'./logs/{model}')
                    if associated_user is not None:
                        associated_links[file] = associated_user
                elif "G_" not in file and "D_" not in file and file.endswith(".pth"):
                    shutil.copy(file_path, f'./weights/{model}.pth')
                    if associated_user is not None:
                        associated_links[f'{model}.pth'] = associated_user
        shutil.rmtree("zips")
        shutil.rmtree("unzips")
        change_choices()
        return "Model downloaded, you can go back to the inference page!"
    except Exception:
        # Keep the user-facing message, but leave a trace for debugging.
        traceback.print_exc()
        return "ERROR - The download failed. Check if the link is valid."
def success_message(face):
    """Return an upload confirmation for ``face`` plus the literal string ``'None'``.

    Args:
        face: Object whose ``name`` attribute is shown in the message.
    """
    confirmation = '{} has been uploaded.'.format(face.name)
    return confirmation, 'None'
def mouth(size, face, voice, faces):
    """Run the Wav2Lip inference script to lip-sync a face video to an audio file.

    Args:
        size: ``'Half'`` selects a resize factor of 2; anything else uses 1.
        face: Uploaded file object; its ``name`` is used when ``faces`` is
            ``'None'``.
        voice: Path of the audio file passed as ``--audio``.
        faces: ``'None'`` to use ``face``, or one of the built-in presets
            (``'Ben Shapiro'``, ``'Andrew Tate'``).

    Returns:
        ``(output video path, status message)``. The exit status of the
        inference script is not checked.

    Raises:
        ValueError: If ``faces`` is neither ``'None'`` nor a known preset
            (this used to surface as an UnboundLocalError).
    """
    resize_factor = 2 if size == 'Half' else 1
    preset_faces = {
        'Ben Shapiro': '/content/wav2lip-HD/inputs/ben-shapiro-10.mp4',
        'Andrew Tate': '/content/wav2lip-HD/inputs/tate-7.mp4',
    }
    if faces == 'None':
        character = face.name
    elif faces in preset_faces:
        character = preset_faces[faces]
    else:
        raise ValueError(f"Unknown face preset: {faces!r}")

    # Pass an argument list instead of a shell string: file paths containing
    # spaces or shell metacharacters are then handed over verbatim.
    command = [
        "python", "inference.py",
        "--checkpoint_path", "checkpoints/wav2lip.pth",
        "--face", str(character),
        "--audio", str(voice),
        "--pads", "0", "20", "0", "0",
        "--outfile", "/content/wav2lip-HD/outputs/result.mp4",
        "--fps", "24",
        "--resize_factor", str(resize_factor),
    ]
    subprocess.run(command, cwd='/content/wav2lip-HD/Wav2Lip-master')
    return '/content/wav2lip-HD/outputs/result.mp4', 'Animation completed.'
def stoptraining(mim):
    """Optionally stop the training process and swap two buttons' visibility.

    Args:
        mim: Value convertible with ``int()``. When it equals 1, a stop flag
            is written to ``csvdb/stop.csv`` and SIGTERM is sent to ``PID``;
            any failure while doing so is printed and otherwise ignored.

    Returns:
        Two Gradio update dicts: the first hides its target, the second shows
        its target.
    """
    stop_requested = int(mim) == 1
    if stop_requested:
        try:
            CSVutil('csvdb/stop.csv', 'w+', 'stop', 'True')
            os.kill(PID, signal.SIGTERM)
        except Exception as e:
            print(f"Couldn't click due to {e}")
    hide_update = {"visible": False, "__type__": "update"}
    show_update = {"visible": True, "__type__": "update"}
    return hide_update, show_update
def transcribe_btn_click(audio_choice):
    """Transcribe an audio file with WhisperX and stage it under ``./demo/temp``.

    Side effects: sets the module globals ``audio_fn`` and ``transcript_fn``
    to ``./demo/temp/<stem>.wav`` and ``./demo/temp/<stem>.txt``, copies the
    audio into ``./demo/temp``, writes the transcript file and creates
    ``./demo/temp/mfa_alignments``.

    Args:
        audio_choice: Path of the audio file to transcribe.

    Returns:
        The transcript text. When both staged files already exist the stored
        transcript is returned without running the model (this path used to
        return None).
    """
    global transcript_fn
    global audio_fn
    temp_folder = "./demo/temp"
    filename = os.path.splitext(os.path.basename(audio_choice))[0]
    # NOTE(review): audio_fn always ends in .wav, but the copy below keeps the
    # source file's own name and extension. For non-wav input the staged file
    # will not be at audio_fn - confirm whether a conversion step is expected.
    audio_fn = f"{temp_folder}/{filename}.wav"
    transcript_fn = f"{temp_folder}/{filename}.txt"
    if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
        print("Audio and transcript already exist, skipping transcript")
        with open(transcript_fn, "r") as f:
            return f.read()

    batch_size = 1  # Adjust based on your GPU memory availability
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # float16 is a GPU compute type; fall back to int8 when running on CPU.
    compute_type = "float16" if device == "cuda" else "int8"
    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
    pre_result = model.transcribe(audio_choice, batch_size=batch_size)

    # The result either carries per-segment texts or a single 'text' field.
    if 'segments' in pre_result:
        result = " ".join([segment['text'] for segment in pre_result['segments']])
    else:
        result = pre_result.get('text', '')
    print("Transcribe text: " + result)

    # Free VRAM: the reference must be dropped *before* collecting, otherwise
    # gc.collect()/empty_cache() have nothing to release.
    del model
    gc.collect()
    torch.cuda.empty_cache()

    # Stage the audio and its transcript in the temp folder.
    os.makedirs(temp_folder, exist_ok=True)
    shutil.copy(audio_choice, temp_folder)
    with open(transcript_fn, "w") as f:
        f.write(result)

    # Folder reserved for MFA alignments.
    os.makedirs(f"{temp_folder}/mfa_alignments", exist_ok=True)
    return result
def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
        temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text):
    """Generate speech with VoiceCraft, using the staged audio as the prompt.

    Args:
        seed: Only used to tag the output file names in this body.
        stop_repetition, sample_batch_size, codec_audio_sr, codec_sr, top_k,
        top_p, temperature, kvcache, silence_tokens: Forwarded unchanged in
            the decode configuration given to ``inference_one_sample``.
        left_margin, right_margin: Accepted but not used in this body.
        cutoff_value: Prompt length in seconds; must be shorter than the audio.
        target_transcript: Text to synthesise; appended to ``transcribed_text``.
        transcribed_text: Transcript of the prompt audio.

    Returns:
        ``[concatenated wav path, generated wav path]``, both written under
        ``./demo/generated_tts``.

    Raises:
        ValueError: If ``cutoff_value`` is not smaller than the audio duration.
    """
    global voicecraft_model, voicecraft_config, phn2num
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    os.environ["USER"] = "USER"

    print("Transcribing the input audio")
    # NOTE(review): input_audio_fn is neither a parameter of this function nor
    # assigned in it; this only works if a module-level global of that name
    # exists elsewhere - confirm, otherwise this raises NameError.
    transcribe_btn_click(input_audio_fn)
    print("Transcription complete")

    # Part of the audio (in seconds) that serves as the prompt; the right
    # value differs per audio file.
    cut_off_sec = cutoff_value
    # Separate prompt text and new text with a space (as run_joint does) so
    # the last prompt word and the first new word are not fused together.
    target_transcript = transcribed_text + ' ' + target_transcript
    print(target_transcript)

    info = torchaudio.info(audio_fn)
    audio_dur = info.num_frames / info.sample_rate
    print("audio dur s is", audio_dur, "cutoff_sec is", cut_off_sec)
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    if not cut_off_sec < audio_dur:
        raise ValueError(f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}")
    prompt_end_frame = int(cut_off_sec * info.sample_rate)

    # Load the model only once and reuse it across calls.
    if voicecraft_model is None:
        load_voicecraft()

    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
    text_tokenizer = TextTokenizer(backend="espeak")
    audio_tokenizer = AudioTokenizer(signature=encodec_fn)  # will also put the neural codec model on gpu

    # Run the model to get the output.
    decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
                     'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
                     "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
    from lib.voicecraft.inference_tts_scale import inference_one_sample
    concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
                                                     audio_fn, target_transcript, config.device, decode_config,
                                                     prompt_end_frame)

    # Save both results for comparison.
    concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
    output_dir = "./demo/generated_tts"
    os.makedirs(output_dir, exist_ok=True)
    audio_stem = os.path.basename(audio_fn)[:-4]  # drop the 4-character extension
    seg_save_fn_gen = f"{output_dir}/{audio_stem}_gen_seed{seed}.wav"
    seg_save_fn_concat = f"{output_dir}/{audio_stem}_concat_seed{seed}.wav"
    torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
    torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))
    return [seg_save_fn_concat, seg_save_fn_gen]
def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
              temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text,
              sid,
              f0_up_key,
              f0_file,
              f0_method,
              file_index,
              index_rate,
              filter_radius,
              resample_sr,
              rms_mix_rate,
              protect,
              crepe_hop_length):
    """Generate speech with VoiceCraft, then convert it with the loaded RVC voice.

    Args:
        input_audio_fn: Path of the prompt audio; it is transcribed and staged
            first.
        seed: Only used to tag the intermediate file names in this body.
        stop_repetition, sample_batch_size, codec_audio_sr, codec_sr, top_k,
        top_p, temperature, kvcache, silence_tokens: Forwarded unchanged in
            the decode configuration given to ``inference_one_sample``.
        left_margin, right_margin: Accepted but not used in this body.
        cutoff_value: Prompt length in seconds; must be shorter than the audio.
        target_transcript: Text to synthesise; appended to ``transcribed_text``.
        transcribed_text: Transcript of the prompt audio.
        sid, f0_up_key, f0_file, f0_method, file_index, index_rate,
        filter_radius, resample_sr, rms_mix_rate, protect, crepe_hop_length:
            Voice-conversion settings forwarded to ``vc.pipeline``
            (``f0_up_key`` is cast to int; ``file_index`` is cleaned of
            surrounding quotes/whitespace and "trained" becomes "added").

    Returns:
        ``(status message, (sample rate, converted audio))`` on success, or
        ``(traceback text, (None, None))`` when the conversion stage fails.

    Raises:
        ValueError: If ``cutoff_value`` is not smaller than the audio duration.
    """
    global voicecraft_model, voicecraft_config, phn2num
    global tgt_sr, net_g, vc, hubert_model, version

    print("Transcribing the input audio")
    transcribe_btn_click(input_audio_fn)
    print("Transcription complete")

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    os.environ["USER"] = "USER"

    # Part of the audio (in seconds) that serves as the prompt; the right
    # value differs per audio file.
    cut_off_sec = cutoff_value
    target_transcript = transcribed_text + ' ' + target_transcript
    print(target_transcript)

    info = torchaudio.info(audio_fn)
    audio_dur = info.num_frames / info.sample_rate
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    if not cut_off_sec < audio_dur:
        raise ValueError(f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}")
    prompt_end_frame = int(cut_off_sec * info.sample_rate)

    # Load the model only once and reuse it across calls.
    if voicecraft_model is None:
        load_voicecraft()

    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
    text_tokenizer = TextTokenizer(backend="espeak")
    audio_tokenizer = AudioTokenizer(signature=encodec_fn)  # will also put the neural codec model on gpu

    # Run the model to get the output.
    decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
                     'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
                     "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
    from lib.voicecraft.inference_tts_scale import inference_one_sample
    concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
                                                     audio_fn, target_transcript, config.device, decode_config,
                                                     prompt_end_frame)
    print("prompt_end_frame: ", prompt_end_frame, "voicecraft_config: ", voicecraft_config, "audio_fn: ", audio_fn, "target_transcript: ", target_transcript, "config.device: ", config.device, "decode_config: ", decode_config)

    # Save both results; the generated file is the input of the VC stage.
    concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
    output_dir = "./demo/generated_tts"
    os.makedirs(output_dir, exist_ok=True)
    audio_stem = os.path.basename(audio_fn)[:-4]  # drop the 4-character extension
    seg_save_fn_gen = f"{output_dir}/{audio_stem}_gen_seed{seed}.wav"
    seg_save_fn_concat = f"{output_dir}/{audio_stem}_concat_seed{seed}.wav"
    torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
    torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))

    f0_up_key = int(f0_up_key)
    try:
        audio = load_audio(seg_save_fn_gen, 16000, DoFormant, Quefrency, Timbre).squeeze()
        # Scale down only when the peak exceeds 0.95.
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max
        times = [0, 0, 0]
        if hubert_model is None:
            load_hubert()
        if_f0 = cpt.get("f0", 1)
        # Guard against sloppy input: trim quotes/whitespace and point a
        # "trained" index name at the corresponding "added" one.
        file_index = (
            file_index.strip(" ")
            .strip('"')
            .strip("\n")
            .strip('"')
            .strip(" ")
            .replace("trained", "added")
        )
        print(f"Making VC Pipeline, device: {config.device}, audio shape: {audio.shape}")
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            sid,
            audio,
            seg_save_fn_gen,
            times,
            f0_up_key,
            f0_method,
            file_index,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            crepe_hop_length,
            f0_file=f0_file,
        )
        # Report the resample rate when one was applied. This is kept in a
        # local: assigning to the global tgt_sr (as before) silently changed
        # the model's sample rate for every later conversion.
        output_sr = tgt_sr
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            output_sr = resample_sr
        index_info = (
            "Using index:%s." % file_index
            if os.path.exists(file_index)
            else "Index not used."
        )
        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
            index_info,
            times[0],
            times[1],
            times[2],
        ), (output_sr, audio_opt)
    except Exception:
        # Surface the failure in the status textbox instead of crashing the UI.
        error_text = traceback.format_exc()
        print(error_text)
        return error_text, (None, None)
def upload_to_dataset(files, dir):
    """Copy uploaded files into a dataset directory, creating it if needed.

    Args:
        files: Iterable of objects whose ``name`` attribute is a file path.
        dir: Destination directory; an empty string means ``./dataset``.

    Returns:
        A status message with the number of copied files and the destination.
    """
    target_dir = './dataset' if dir == '' else dir
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    copied = 0
    for copied, uploaded in enumerate(files, start=1):
        shutil.copy2(uploaded.name, target_dir)
    return f' {copied} files uploaded to {target_dir}.'
def zip_downloader(model):
    """Locate a trained model's weight file and its "added" index for download.

    Args:
        model: Voice name; the weights are expected at
            ``./weights/<model>.pth`` and the index under ``./logs/<model>``.

    Returns:
        ``(files, status)`` where ``files`` is a no-op Gradio update when the
        weights are missing, ``[weights, index]`` when an index was found, or
        just the weights path when there is no index. A missing logs folder
        is treated as "no index" instead of raising.
    """
    weight_path = f'./weights/{model}.pth'
    if not os.path.exists(weight_path):
        return {"__type__": "update"}, f'Make sure the Voice Name is correct. I could not find {model}.pth'

    logs_dir = f'./logs/{model}'
    log_file = None
    if os.path.isdir(logs_dir):
        for file in os.listdir(logs_dir):
            if file.endswith('.index') and 'added' in file:
                # As before, the last matching entry wins.
                log_file = file

    if log_file is not None:
        return [weight_path, f'{logs_dir}/{log_file}'], "Done"
    return weight_path, "Could not find Index file."
#download_from_url('https://drive.google.com/uc?id=1O98vvnle_nZP8ZdpnZFLZ5TU1UZe7x0p&confirm=t', 'JVKE-main', 'jvke')
#download_from_url('https://drive.google.com/uc?id=1Wag0vPlp42kRDffccXljjjlK7QsHf2xe&confirm=t', 'JVKE-main-v2', 'jvke')
#download_from_url('https://drive.google.com/uc?id=1h810cil3YRlN4pu4oO43zKq9z3cYjItp&confirm=t', 'jvke-nighttime-v4', 'jvke')
# Voice models handed to download_from_url() at import time, in this order:
# (Google Drive URL, model name, associated user).
_STARTUP_MODELS = (
    ('https://drive.google.com/uc?id=1fa6FSLwqSQMI49NvSXOpI4pUVuKsrop5&confirm=t', 'Andoni', 'cmss60'),
    ('https://drive.google.com/uc?id=1iGhD93_szvs0xyg-U5z_jhfBECBxcTfK&confirm=t', 'Alex', 'cmss60'),
    ('https://drive.google.com/uc?id=1DwRru_WFh4LS0eqU_39qEPuwltj9ZTRr&confirm=t', 'Elaine', 'cmss60'),
    ('https://drive.google.com/uc?id=1Xen2BBRoqfF3CNO_XqEr2ZgCcITz--Je&confirm=t', 'Emily', 'cmss60'),
    ('https://drive.google.com/uc?id=1gHfrS1rnhnj3sHdnOM4vx04rcxucc74D&confirm=t', 'Justis', 'cmss60'),
    ('https://drive.google.com/uc?id=1PlQELpXawx74mEv9MYeREcyvwE7vFFe_&confirm=t', 'Kayana', 'cmss60'),
    ('https://drive.google.com/uc?id=16hJvfWAhuWWVEeXyDYt9PHJl-k_Kouxf&confirm=t', 'Prince', 'cmss60'),
    ('https://drive.google.com/uc?id=1zE1tP95_unNjVkqYb0aBt3AsBq_u9-R9&confirm=t', 'Lupe', 'cmss60'),
)
for _model_url, _model_name, _model_user in _STARTUP_MODELS:
    download_from_url(_model_url, _model_name, _model_user)
weight_root = "weights"
index_root = "logs"

# Model weight files (*.pth) present in the weights folder at import time.
names = [entry for entry in os.listdir(weight_root) if entry.endswith(".pth")]

# Every *.index file below the logs folder whose name does not contain
# "trained", walked bottom-up.
index_paths = [
    "%s/%s" % (root, name)
    for root, dirs, files in os.walk(index_root, topdown=False)
    for name in files
    if name.endswith(".index") and "trained" not in name
]
with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose"), title="HITGEN AI") as app:
with gr.Tabs():
with gr.TabItem("Inference"):
app.load(update_message)
# Other RVC stuff
with gr.Row():
sid0 = gr.Dropdown(label="1. Choose your model", choices=sorted(names), value=check_for_name())
refresh_button = gr.Button("Refresh", variant="primary")
if check_for_name() != '':
get_vc(sorted(names)[0])
vc_transform0 = gr.Number(label="Key Shift: 0 for no key shifted output; 12 f for output an octave higher and -12 for output an octave lower.", value=0)
#clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
spk_item = gr.Slider(
minimum=0,
maximum=2333,
step=1,
label="speaker id",
value=0,
visible=False,
interactive=True,
)
#clean_button.click(fn=clean, inputs=[], outputs=[sid0])
sid0.change(
fn=get_vc,
inputs=[sid0],
outputs=[spk_item],
)
but0 = gr.Button("Convert", variant="primary")
with gr.Row():
with gr.Column():
# with gr.Row():
# dropbox = gr.File(label="Drag your audio file and click refresh.")
# with gr.Row():
# record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
with gr.Row():
input_audio0 = gr.Dropdown(
label="2.Choose the audio file.",
value="./audios/Test_Audio.mp3",
choices=audio_files
)
# dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
# dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
transcribed_text = gr.Textbox(label="transcibed text + mfa",
value="The dogs sat at the door.",
info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
# record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
# record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
with gr.Row():
# with gr.Column():
# input_audio = gr.Audio(label="Input Audio", type="filepath")
# # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
# # choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
# # info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
# transcribed_text = gr.Textbox(label="transcibed text + mfa",
# info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
# transcribe_info_text = gr.TextArea(label="How to use",
# value="running everything for the first time will download necessary models (4GB for main encoder + model) \n load a voice and choose your whisper model, base works most of the time. \n transcription and mfa takes ~50s on a 3090 for a 7s audio clip, rerun this when uploading a new audio clip only\nchoose the END value of the cut off word \n")
# transcribe_btn = gr.Button(value="transcribe and create mfa")
with gr.Column():
target_transcript = gr.Textbox(label="target transcript")
output_audio_con = gr.Audio(label="Output Audio concatenated")
output_audio_gen = gr.Audio(label="Output Audio generated")
cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
run_btn = gr.Button(value="run")
run_btn_joint = gr.Button(value="run with RVC")
# transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
# outputs=[transcribed_text])
with gr.Column():
vc_output2 = gr.Audio(
label="Final Result! (Click on the three dots to download the audio)",
type='filepath',
interactive=False,
)
#with gr.Column():
with gr.Accordion("Advanced TTS Settings", open=False):
seed = gr.Number(label='seed', interactive=True, value=1)
stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
info=" not used for TTS, only for speech editing")
right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
info=" not used for TTS, only for speech editing")
codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
codec_sr = gr.Number(label='codec', interactive=True, value=50)
top_k = gr.Number(label='top_k', interactive=True, value=0)
top_p = gr.Number(label='top_p', interactive=True, value=0.8)
temperature = gr.Number(label='temperature', interactive=True, value=1)
kvcache = gr.Number(label='kvcache', interactive=True, value=1,
info='set to 0 to use less VRAM, results may be worse and slower inference')
silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
with gr.Accordion("Index Settings", open=False):
#with gr.Row():
file_index1 = gr.Dropdown(
label="3. Choose the index file (in case it wasn't automatically found.)",
choices=get_indexes(),
value=get_index(),
interactive=True,
)
sid0.change(fn=match_index, inputs=[sid0],outputs=[file_index1])
refresh_button.click(
fn=change_choices, inputs=[], outputs=[sid0, file_index1]
)
# file_big_npy1 = gr.Textbox(
# label=i18n("特征文件路径"),
# value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
# interactive=True,
# )
index_rate1 = gr.Slider(
minimum=0,
maximum=1,
label="index rate",
value=0,
interactive=True,
)
# animate_button.click(fn=mouth, inputs=[size, face, vc_output2, faces], outputs=[animation, preview])
with gr.Accordion("Advanced Options", open=False):
f0method0 = gr.Radio(
label="Optional: Change the Pitch Extraction Algorithm. Extraction methods are sorted from 'worst quality' to 'best quality'. If you don't know what you're doing, leave rmvpe.",
choices=["pm", "dio", "crepe-tiny", "mangio-crepe-tiny", "crepe", "harvest", "mangio-crepe", "rmvpe"], # Fork Feature. Add Crepe-Tiny
value="rmvpe",
interactive=True,
)
crepe_hop_length = gr.Slider(
minimum=1,
maximum=512,
step=1,
label="Mangio-Crepe Hop Length. Higher numbers will reduce the chance of extreme pitch changes but lower numbers will increase accuracy. 64-192 is a good range to experiment with.",
value=120,
interactive=True,
visible=False,
)
f0method0.change(fn=whethercrepeornah, inputs=[f0method0], outputs=[crepe_hop_length])
filter_radius0 = gr.Slider(
minimum=0,
maximum=7,
label="label",
value=3,
step=1,
interactive=True,
)
resample_sr0 = gr.Slider(
minimum=0,
maximum=48000,
label="label",
value=0,
step=1,
interactive=True,
visible=False
)
rms_mix_rate0 = gr.Slider(
minimum=0,
maximum=1,
label="label",
value=0.21,
interactive=True,
)
protect0 = gr.Slider(
minimum=0,
maximum=0.5,
label="label",
value=0,
step=0.01,
interactive=True,
)
formanting = gr.Checkbox(
value=bool(DoFormant),
label="[EXPERIMENTAL] Formant shift inference audio",
info="Used for male to female and vice-versa conversions",
interactive=True,
visible=True,
)
formant_preset = gr.Dropdown(
value='',
choices=get_fshift_presets(),
label="browse presets for formanting",
visible=bool(DoFormant),
)
formant_refresh_button = gr.Button(
value='\U0001f504',
visible=bool(DoFormant),
variant='primary',
)
#formant_refresh_button = ToolButton( elem_id='1')
#create_refresh_button(formant_preset, lambda: {"choices": formant_preset}, "refresh_list_shiftpresets")
qfrency = gr.Slider(
value=Quefrency,
info="Default value is 1.0",
label="Frequency for formant shifting",
minimum=0.0,
maximum=16.0,
step=0.1,
visible=bool(DoFormant),
interactive=True,
)
tmbre = gr.Slider(
value=Timbre,
info="Default value is 1.0",
label="Timbre for formant shifting",
minimum=0.0,
maximum=16.0,
step=0.1,
visible=bool(DoFormant),
interactive=True,
)
formant_preset.change(fn=preset_apply, inputs=[formant_preset, qfrency, tmbre], outputs=[qfrency, tmbre])
frmntbut = gr.Button("Apply", variant="primary", visible=bool(DoFormant))
formanting.change(fn=formant_enabled,inputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button],outputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button])
frmntbut.click(fn=formant_apply,inputs=[qfrency, tmbre], outputs=[qfrency, tmbre])
formant_refresh_button.click(fn=update_fshift_presets,inputs=[formant_preset, qfrency, tmbre],outputs=[formant_preset, qfrency, tmbre])
with gr.Row():
vc_output1 = gr.Textbox("")
f0_file = gr.File(label="f0 file", visible=False)
run_btn.click(fn=run,
inputs=[
seed,
stop_repitition,
sample_batch_size,
left_margin,
right_margin,
codecaudio_sr,
codec_sr,
top_k,
top_p,
temperature,
kvcache,
cutoff_value,
target_transcript,
silence_tokens,
transcribed_text],
outputs=[
output_audio_con,
output_audio_gen
])
but0.click(
vc_single,
[
spk_item,
input_audio0,
vc_transform0,
f0_file,
f0method0,
file_index1,
# file_index2,
# file_big_npy1,
index_rate1,
filter_radius0,
resample_sr0,
rms_mix_rate0,
protect0,
crepe_hop_length
],
[vc_output1, vc_output2],
)
run_btn_joint.click(
fn=run_joint,
inputs=[
input_audio0,
seed,
stop_repitition,
sample_batch_size,
left_margin,
right_margin,
codecaudio_sr,
codec_sr,
top_k,
top_p,
temperature,
kvcache,
cutoff_value,
target_transcript,
silence_tokens,
transcribed_text,
spk_item,
vc_transform0,
f0_file,
f0method0,
file_index1,
# file_index2,
# file_big_npy1,
index_rate1,
filter_radius0,
resample_sr0,
rms_mix_rate0,
protect0,
crepe_hop_length
],
outputs=[vc_output1, vc_output2])
with gr.Accordion("Batch Conversion",open=False, visible=False):
with gr.Row():
with gr.Column():
vc_transform1 = gr.Number(
label="speaker id", value=0
)
opt_input = gr.Textbox(label="opt", value="opt")
f0method1 = gr.Radio(
label="f0 method",
choices=["pm", "harvest", "crepe", "rmvpe"],
value="rmvpe",
interactive=True,
)
filter_radius1 = gr.Slider(
minimum=0,
maximum=7,
label="harvest",
value=3,
step=1,
interactive=True,
)
with gr.Column():
file_index3 = gr.Textbox(
label="file index",
value="",
interactive=True,
)
file_index4 = gr.Dropdown(
label="index path (dropdown)",
choices=sorted(index_paths),
interactive=True,
)
refresh_button.click(
fn=lambda username: change_choices(username)[1],
inputs=[gr.State('username')],
outputs=file_index4,
)
# file_big_npy2 = gr.Textbox(
# label=i18n("特征文件路径"),
# value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
# interactive=True,
# )
index_rate2 = gr.Slider(
minimum=0,
maximum=1,
label="index rate 2",
value=1,
interactive=True,
)
with gr.Column():
resample_sr1 = gr.Slider(
minimum=0,
maximum=48000,
label="resample rate",
value=0,
step=1,
interactive=True,
)
rms_mix_rate1 = gr.Slider(
minimum=0,
maximum=1,
label="rms mix rate",
value=1,
interactive=True,
)
protect1 = gr.Slider(
minimum=0,
maximum=0.5,
label="protection rate",
value=0.33,
step=0.01,
interactive=True,
)
with gr.Column():
dir_input = gr.Textbox(
label="directory input",
value="E:\codes\py39\\test-20230416b\\todo-songs",
)
inputs = gr.File(
file_count="multiple", label="input"
)
with gr.Row():
format1 = gr.Radio(
label="output format",
choices=["wav", "flac", "mp3", "m4a"],
value="flac",
interactive=True,
)
but1 = gr.Button("primary", variant="primary")
vc_output3 = gr.Textbox(label="label")
but1.click(
vc_multi,
[
spk_item,
dir_input,
opt_input,
inputs,
vc_transform1,
f0method1,
file_index3,
file_index4,
# file_big_npy2,
index_rate2,
filter_radius1,
resample_sr1,
rms_mix_rate1,
protect1,
format1,
crepe_hop_length,
],
[vc_output3],
)
but1.click(fn=lambda: easy_uploader.clear())
with gr.TabItem("Download Voice Models"):
with gr.Row():
url=gr.Textbox(label="Huggingface Link:")
with gr.Row():
model = gr.Textbox(label="Name of the model (without spaces):")
download_button=gr.Button("Download")
with gr.Row():
status_bar=gr.Textbox(label="Download Status")
download_button.click(fn=download_from_url, inputs=[url, model], outputs=[status_bar])
def has_two_files_in_pretrained_folder():
    """Return True when ``./pretrained/`` exists and contains at least two entries."""
    pretrained_folder = "./pretrained/"
    return os.path.exists(pretrained_folder) and len(os.listdir(pretrained_folder)) >= 2
if has_two_files_in_pretrained_folder():
print("Pretrained weights are downloaded. Training tab enabled!\n-------------------------------")
with gr.TabItem("Train", visible=False):
with gr.Row():
with gr.Column():
exp_dir1 = gr.Textbox(label="Voice Name:", value="My-Voice")
sr2 = gr.Radio(
label="sample rate",
choices=["40k", "48k"],
value="40k",
interactive=True,
visible=False
)
if_f0_3 = gr.Radio(
label="extract f0",
choices=[True, False],
value=True,
interactive=True,
visible=False
)
version19 = gr.Radio(
label="RVC version",
choices=["v1", "v2"],
value="v2",
interactive=True,
visible=False,
)
np7 = gr.Slider(
minimum=0,
maximum=config.n_cpu,
step=1,
label="# of CPUs for data processing (Leave as it is)",
value=config.n_cpu,
interactive=True,
visible=True
)
trainset_dir4 = gr.Textbox(label="Path to your dataset (audios, not zip):", value="./dataset")
easy_uploader = gr.Files(label='OR Drop your audios here. They will be uploaded in your dataset path above.',file_types=['audio'])
but1 = gr.Button("1. Process The Dataset", variant="primary")
info1 = gr.Textbox(label="Status (wait until it says 'end preprocess'):", value="")
easy_uploader.upload(fn=upload_to_dataset, inputs=[easy_uploader, trainset_dir4], outputs=[info1])
but1.click(
preprocess_dataset, [trainset_dir4, exp_dir1, sr2, np7], [info1]
)
with gr.Column():
spk_id5 = gr.Slider(
minimum=0,
maximum=4,
step=1,
label="speaker id",
value=0,
interactive=True,
visible=False
)
with gr.Accordion('GPU Settings', open=False, visible=False):
gpus6 = gr.Textbox(
label="0-1-2",
value=gpus,
interactive=True,
visible=False
)
gpu_info9 = gr.Textbox(label="GPU", value=gpu_info)
f0method8 = gr.Radio(
label="f0 method",
choices=["harvest","crepe", "mangio-crepe", "rmvpe"], # Fork feature: Crepe on f0 extraction for training.
value="rmvpe",
interactive=True,
)
extraction_crepe_hop_length = gr.Slider(
minimum=1,
maximum=512,
step=1,
label="crepe_hop_length",
value=128,
interactive=True,
visible=False,
)
f0method8.change(fn=whethercrepeornah, inputs=[f0method8], outputs=[extraction_crepe_hop_length])
but2 = gr.Button("2. Pitch Extraction", variant="primary")
info2 = gr.Textbox(label="Status(Check the Colab Notebook's cell output):", value="", max_lines=8)
but2.click(
extract_f0_feature,
[gpus6, np7, f0method8, if_f0_3, exp_dir1, version19, extraction_crepe_hop_length],
[info2],
)
with gr.Row():
with gr.Column():
total_epoch11 = gr.Slider(
minimum=1,
maximum=5000,
step=10,
label="Total # of training epochs (IF you choose a value too high, your model will sound horribly overtrained.):",
value=250,
interactive=True,
)
butstop = gr.Button(
"Stop Training",
variant='primary',
visible=False,
)
but3 = gr.Button("3. Train Model", variant="primary", visible=True)
but3.click(fn=stoptraining, inputs=[gr.Number(value=0, visible=False)], outputs=[but3, butstop])
butstop.click(fn=stoptraining, inputs=[gr.Number(value=1, visible=False)], outputs=[butstop, but3])
but4 = gr.Button("4.Train Index", variant="primary")
info3 = gr.Textbox(label="Status(Check the Colab Notebook's cell output):", value="", max_lines=10)
# Collapsed-by-default accordion with the optional training settings, followed
# by the model download button and its file output.
with gr.Accordion("Training Preferences (You can leave these as they are)", open=False):
# Disabled heading kept from upstream; the Chinese text reads
# "step3: fill in the training settings, then start training the model and index".
#gr.Markdown(value=i18n("step3: 填写训练设置, 开始训练模型和索引"))
with gr.Column():
# How often (in epochs) a backup is saved; default 10.
save_epoch10 = gr.Slider(
minimum=1,
maximum=200,
step=1,
label="Backup every X amount of epochs:",
value=10,
interactive=True,
)
# Training batch size; the default comes from `default_batch_size`.
batch_size12 = gr.Slider(
minimum=1,
maximum=40,
step=1,
label="Batch Size (LEAVE IT unless you know what you're doing!):",
value=default_batch_size,
interactive=True,
)
# Keep only the most recent checkpoint file; enabled by default.
if_save_latest13 = gr.Checkbox(
label="Save only the latest '.ckpt' file to save disk space.",
value=True,
interactive=True,
)
# Cache the training set in GPU memory; disabled by default.
if_cache_gpu17 = gr.Checkbox(
label="Cache all training sets to GPU memory. Caching small datasets (less than 10 minutes) can speed up training, but caching large datasets will consume a lot of GPU memory and may not provide much speed improvement.",
value=False,
interactive=True,
)
# Also write a small final model to 'weights' at each save point; enabled by default.
if_save_every_weights18 = gr.Checkbox(
label="Save a small final model to the 'weights' folder at each save point.",
value=True,
interactive=True,
)
# Step 5: clicking zip_model calls zip_downloader with the experiment name and
# sends its two return values to the file list (zipped_model) and the status
# box (info3).
zip_model = gr.Button('5. Download Model')
zipped_model = gr.Files(label='Your Model and Index file can be downloaded here:')
zip_model.click(fn=zip_downloader, inputs=[exp_dir1], outputs=[zipped_model, info3])
# Group holding the pretrained base-model paths and the GPU list used for
# training. The accordion is created closed and with visible=False, so these
# fields act as hidden state that the handlers read and update.
with gr.Group():
with gr.Accordion("Base Model Locations:", open=False, visible=False):
# Path of the pretrained generator weights (default: v2, f0, 40k).
pretrained_G14 = gr.Textbox(
label="G PATH",
value="pretrained_v2/f0G40k.pth",
interactive=True,
)
# Path of the pretrained discriminator weights (default: v2, f0, 40k).
pretrained_D15 = gr.Textbox(
label="D PATH",
value="pretrained_v2/f0D40k.pth",
interactive=True,
)
# GPU selection passed to training; pre-filled from `gpus`.
gpus16 = gr.Textbox(
label="GPU NUM",
value=gpus,
interactive=True,
)
# Keep the pretrained G/D path fields consistent with the current settings.
# Each handler receives the sample rate, the f0 flag and the model version
# (in the order its callback expects) and writes back the listed components.

# Sample-rate change: refreshes both paths and the version selector.
sr2.change(
    fn=change_sr2,
    inputs=[sr2, if_f0_3, version19],
    outputs=[pretrained_G14, pretrained_D15, version19],
)

# Version change: refreshes both paths.
version19.change(
    fn=change_version19,
    inputs=[sr2, if_f0_3, version19],
    outputs=[pretrained_G14, pretrained_D15],
)

# f0 flag change: refreshes the f0 method selector and both paths.
if_f0_3.change(
    fn=change_f0,
    inputs=[if_f0_3, sr2, version19],
    outputs=[f0method8, pretrained_G14, pretrained_D15],
)
# Button created with visible=False; it is never shown in the UI.
but5 = gr.Button(
    "label",
    variant="primary",
    visible=False,
)

# "3. Train Model": run click_train with the full set of training settings.
# Its return values go to the status box and to the stop / train buttons.
but3.click(
    fn=click_train,
    inputs=[
        exp_dir1,
        sr2,
        if_f0_3,
        spk_id5,
        save_epoch10,
        total_epoch11,
        batch_size12,
        if_save_latest13,
        pretrained_G14,
        pretrained_D15,
        gpus16,
        if_cache_gpu17,
        if_save_every_weights18,
        version19,
    ],
    outputs=[info3, butstop, but3],
)

# "4.Train Index": run train_index for the experiment/version and report
# into the status box.
but4.click(
    fn=train_index,
    inputs=[exp_dir1, version19],
    outputs=info3,
)
# One-click pipeline: clicking but5 runs train1key with the dataset,
# extraction and training settings in one call; the result is written to the
# info3 status box.
but5.click(
    fn=train1key,
    inputs=[
        exp_dir1,
        sr2,
        if_f0_3,
        trainset_dir4,
        spk_id5,
        np7,
        f0method8,
        save_epoch10,
        total_epoch11,
        batch_size12,
        if_save_latest13,
        pretrained_G14,
        pretrained_D15,
        gpus16,
        if_cache_gpu17,
        if_save_every_weights18,
        version19,
        extraction_crepe_hop_length,
    ],
    outputs=info3,
)
# Fallback branch; the matching `if` is earlier in the file. It builds no UI
# and only prints a console notice that the training tab is disabled because
# the pretrained weights were not downloaded.
else:
print(
"Pretrained weights not downloaded. Disabling training tab.\n"
"Wondering how to train a voice? Join AI HUB Discord Server! https://discord.gg/aihub\n"
"-------------------------------\n"
)
# Start the Gradio server with a request queue and username/password login.
#
# Credentials can be supplied through the APP_AUTH environment variable as a
# comma-separated list of "user:password" pairs (split on the first ':' of
# each entry, so passwords may contain ':' but not ','). Entries without both
# a user and a password are ignored. This lets deployments keep secrets out
# of source control.
#
# When APP_AUTH is unset or yields no valid pair, the previously built-in
# accounts are used so existing deployments behave exactly as before.
# SECURITY NOTE(review): the fallback accounts are hard-coded and readable by
# anyone with access to this file; rotate them and remove the fallback once
# APP_AUTH is configured everywhere.
_auth_from_env = [
    (user, password)
    for user, _, password in (
        entry.strip().partition(":")
        for entry in os.environ.get("APP_AUTH", "").split(",")
    )
    if user and password
]
app.queue(concurrency_count=511, max_size=1022).launch(
    share=False,
    quiet=False,
    auth=_auth_from_env or [('jvke', 'thisfeelslikeai'), ('cmss60', 'yourseedislate')],
)
#endregion