Update app.py
app.py
CHANGED
@@ -1,25 +1,27 @@
-import sys
-
+import sys
+import os
 import torch
-
 from omegaconf import OmegaConf
 from pitch import load_csv_pitch
 from grad.utils import fix_len_compatibility
 from grad.model import GradTTS
 from bigvgan.model.generator import Generator
-
 import gradio as gr
 import numpy as np
 import soundfile
 import librosa
 import logging
 
+# Set logging levels to suppress unnecessary warnings
 logging.getLogger('numba').setLevel(logging.WARNING)
 logging.getLogger('markdown_it').setLevel(logging.WARNING)
 logging.getLogger('urllib3').setLevel(logging.WARNING)
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
+# Append current working directory to system path
+sys.path.append(os.getcwd())
 
+# Function to load Grad-TTS model checkpoint
 def load_gvc_model(checkpoint_path, model):
     assert os.path.isfile(checkpoint_path)
     checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
@@ -30,12 +32,12 @@ def load_gvc_model(checkpoint_path, model):
         try:
             new_state_dict[k] = saved_state_dict[k]
         except:
-            print("
+            print(f"{k} is not in the checkpoint")
             new_state_dict[k] = v
     model.load_state_dict(new_state_dict)
     return model
 
-
+# Function to load BigVGAN model checkpoint
 def load_bigv_model(checkpoint_path, model):
     assert os.path.isfile(checkpoint_path)
     checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
@@ -46,12 +48,12 @@ def load_bigv_model(checkpoint_path, model):
         try:
             new_state_dict[k] = saved_state_dict[k]
         except:
-            print("
+            print(f"{k} is not in the checkpoint")
             new_state_dict[k] = v
     model.load_state_dict(new_state_dict)
     return model
 
-
+# Main Grad-TTS inference function
 @torch.no_grad()
 def gvc_main(device, model, _vec, _pit, spk, rature=1.015):
     l_vec = _vec.shape[0]
@@ -67,9 +69,8 @@ def gvc_main(device, model, _vec, _pit, spk, rature=1.015):
     y_dec = y_dec[:, :l_vec]
     return y_dec
 
-
+# Function to process input audio and extract features
 def svc_change(argswav, argsspk):
-
     argsvec = "svc_tmp.ppg.npy"
     os.system(f"python hubert/inference.py -w {argswav} -v {argsvec}")
     argspit = "svc_tmp.pit.npy"
@@ -79,11 +80,13 @@ def svc_change(argswav, argsspk):
     hps = OmegaConf.load('configs/base.yaml')
 
     print('Initializing Grad-TTS...')
-    model = GradTTS(
-
-
-
-
+    model = GradTTS(
+        hps.grad.n_mels, hps.grad.n_vecs, hps.grad.n_pits, hps.grad.n_spks,
+        hps.grad.n_embs, hps.grad.n_enc_channels, hps.grad.filter_channels,
+        hps.grad.dec_dim, hps.grad.beta_min, hps.grad.beta_max, hps.grad.pe_scale
+    )
+    print(f'Number of encoder parameters = {model.encoder.nparams/1e6:.2f}m')
+    print(f'Number of decoder parameters = {model.decoder.nparams/1e6:.2f}m')
 
     load_gvc_model('grad_pretrain/gvc.pretrain.pth', model)
     model.eval()
@@ -108,22 +111,21 @@ def svc_change(argswav, argsspk):
 
     with torch.no_grad():
         spk = spk.unsqueeze(0).to(device)
-
         all_frame = len_min
         hop_frame = 8
-        out_chunk = 2400 # 24
+        out_chunk = 2400 # 24 seconds
         out_index = 0
         mel = None
 
-        while
-            if
+        while out_index < all_frame:
+            if out_index == 0: # Start frame
                 cut_s = 0
                 cut_s_out = 0
             else:
                 cut_s = out_index - hop_frame
                 cut_s_out = hop_frame
 
-            if
+            if out_index + out_chunk + hop_frame > all_frame: # End frame
                 cut_e = all_frame
                 cut_e_out = -1
             else:
@@ -135,9 +137,9 @@ def svc_change(argswav, argsspk):
 
             sub_out = gvc_main(device, model, sub_vec, sub_pit, spk, 0.95)
             sub_out = sub_out[:, cut_s_out:cut_e_out]
-
+
             out_index = out_index + out_chunk
-            if mel
+            if mel is None:
                 mel = sub_out
             else:
                 mel = torch.cat((mel, sub_out), -1)
@@ -175,41 +177,55 @@ def svc_change(argswav, argsspk):
 
     return audio
 
-
+# Main function to handle audio input and conversion
def svc_main(sid, input_audio):
     if input_audio is None:
-        return "You need to upload an audio", None
+        return "You need to upload an audio file", None
+
+
     sampling_rate, audio = input_audio
     audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
     if len(audio.shape) > 1:
         audio = librosa.to_mono(audio.transpose(1, 0))
     if sampling_rate != 16000:
         audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
-    if
-        audio = audio[:16000*100]
-
+    if len(audio) > 16000 * 100:
+        audio = audio[:16000 * 100]
+
+    separator = Separator()
+    separator.load_model()
+    output_names = {
+        "Vocals": "vocals_output",
+        "Instrumental": "instrumental_output",
+    }
+    output_files = separator.separate(audio, output_names)
+    wav_path = "vocals_output.wav"
     soundfile.write(wav_path, audio, 16000, format="wav")
     out_audio = svc_change(wav_path, f"configs/singers/singer00{sid}.npy")
-    return "
-
+    return "Conversion Successful", (32000, out_audio)
 
+# Gradio WebUI setup
 app = gr.Blocks()
 with app:
     with gr.Tabs():
         with gr.TabItem("Grad-SVC"):
             gr.Markdown(
-                "
-
-
-
-
-
-
-
-
-
-
+                """
+                Based on Grad-TTS from HUAWEI Noah's Ark Lab
+
+                This project is named Grad-SVC, or GVC for short. Its core technology is diffusion, but it is very different from other diffusion-based SVC models.
+
+                <video id='video' controls='' preload='yes'>
+                    <source id='mp4' src='https://github.com/PlayVoice/Grad-SVC/assets/16432329/f9b66af7-b5b5-4efb-b73d-adb0dc84a0ae' type='video/mp4'>
+                </video>
+                """
+            )
+            sid = gr.Dropdown(label="Voice Tone", choices=["22", "33", "47", "51"], value="47")
+            vc_input3 = gr.Audio(label="Upload Audio")
+            vc_submit = gr.Button("Convert", variant="primary")
+            vc_output1 = gr.Textbox(label="Status Information")
+            vc_output2 = gr.Audio(label="Converted Audio")
         vc_submit.click(svc_main, [sid, vc_input3], [vc_output1, vc_output2])
 
-
+# Launch the Gradio app
+app.launch(share=True)
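In this revision, `load_gvc_model` and `load_bigv_model` are the same partial-loading pattern written out twice: copy every tensor the checkpoint provides, and keep the model's freshly initialized weight (with a warning) for any key it lacks. A shared helper is sketched below, with the bare `except:` narrowed to `KeyError`; the `"model"` checkpoint key is an assumption, since the diff does not show which keys the two loaders read.

```python
import os
import torch

def load_partial_checkpoint(checkpoint_path, model, key="model"):
    # Copy matching tensors from the checkpoint; fall back to the model's
    # own initialization for any parameter the checkpoint lacks.
    assert os.path.isfile(checkpoint_path)
    saved_state_dict = torch.load(checkpoint_path, map_location="cpu")[key]  # "model" key is assumed
    new_state_dict = {}
    for k, v in model.state_dict().items():
        try:
            new_state_dict[k] = saved_state_dict[k]
        except KeyError:
            print(f"{k} is not in the checkpoint")
            new_state_dict[k] = v
    model.load_state_dict(new_state_dict)
    return model
```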
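The restored loop in `svc_change` streams long inputs through the model in `out_chunk`-frame windows with a `hop_frame` overlap on each side, then trims the overlap from each window's output before concatenation, so chunk seams do not produce audible artifacts. The same windowing arithmetic as a standalone sketch (function and variable names are illustrative; note the app's `cut_e_out = -1` on the final chunk drops one frame, where `None` below keeps it):

```python
import torch

def chunked_apply(process, x, out_chunk=2400, hop_frame=8):
    """Run `process` over x (frames along dim 0) in overlapping chunks.

    Each chunk carries hop_frame extra context frames on either side,
    which are cut from the output so the pieces concatenate seamlessly.
    """
    all_frame = x.shape[0]
    out_index = 0
    out = None
    while out_index < all_frame:
        if out_index == 0:  # first chunk: no left context
            cut_s, cut_s_out = 0, 0
        else:
            cut_s, cut_s_out = out_index - hop_frame, hop_frame
        if out_index + out_chunk + hop_frame > all_frame:  # last chunk
            cut_e, cut_e_out = all_frame, None
        else:
            cut_e, cut_e_out = out_index + out_chunk + hop_frame, -hop_frame
        sub_out = process(x[cut_s:cut_e])[cut_s_out:cut_e_out]
        out = sub_out if out is None else torch.cat((out, sub_out), 0)
        out_index += out_chunk
    return out
```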
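`svc_main` divides by `np.iinfo(audio.dtype).max` because `gr.Audio` in numpy mode delivers integer PCM; if a float array ever arrives, `np.iinfo` raises. A defensive variant of the preprocessing chain, as an illustrative sketch of what the handler does before writing the 16 kHz mono file:

```python
import librosa
import numpy as np

def to_mono_float_16k(sampling_rate, audio):
    # Normalize integer PCM to [-1, 1]; pass float audio through unchanged.
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    else:
        audio = audio.astype(np.float32)
    if audio.ndim > 1:  # stereo arrives as (samples, channels)
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    return audio
```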
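Two problems remain in the new separation block: `Separator` is never imported, and the unconditional `soundfile.write(wav_path, audio, 16000, ...)` right after `separator.separate(...)` overwrites `vocals_output.wav` with the unseparated mix. A minimal sketch of the intended flow, assuming the `audio-separator` package is the dependency behind `Separator` (its `separate()` takes a file path rather than an array; check the Space's requirements.txt and the installed version's API before relying on this):

```python
# Hedged sketch: assumes `pip install audio-separator` provides Separator.
import soundfile
from audio_separator.separator import Separator  # assumed import

def extract_vocals(audio, sr=16000):
    # Persist the mix first, since separate() works on files.
    mix_path = "input_mix.wav"
    soundfile.write(mix_path, audio, sr, format="wav")

    separator = Separator()
    separator.load_model()  # loads the package's default separation model
    output_files = separator.separate(mix_path)  # paths of separated stems

    # Feed the vocals stem onward instead of rewriting it with the raw mix.
    vocals = [f for f in output_files if "Vocals" in f]
    return vocals[0] if vocals else mix_path
```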