R-Kentaren committed on
Commit 8a50537 · verified · 1 Parent(s): d56ba5f

Update app.py

Files changed (1)
1. app.py +58 -42
app.py CHANGED
@@ -1,25 +1,27 @@
-import sys,os
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+import sys
+import os
 import torch
-
 from omegaconf import OmegaConf
 from pitch import load_csv_pitch
 from grad.utils import fix_len_compatibility
 from grad.model import GradTTS
 from bigvgan.model.generator import Generator
-
 import gradio as gr
 import numpy as np
 import soundfile
 import librosa
 import logging
 
+# Set logging levels to suppress unnecessary warnings
 logging.getLogger('numba').setLevel(logging.WARNING)
 logging.getLogger('markdown_it').setLevel(logging.WARNING)
 logging.getLogger('urllib3').setLevel(logging.WARNING)
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
+# Append current working directory to system path
+sys.path.append(os.getcwd())
 
+# Function to load Grad-TTS model checkpoint
 def load_gvc_model(checkpoint_path, model):
     assert os.path.isfile(checkpoint_path)
     checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
@@ -30,12 +32,12 @@ def load_gvc_model(checkpoint_path, model):
         try:
             new_state_dict[k] = saved_state_dict[k]
         except:
-            print("%s is not in the checkpoint" % k)
+            print(f"{k} is not in the checkpoint")
             new_state_dict[k] = v
     model.load_state_dict(new_state_dict)
     return model
 
-
+# Function to load BigVGAN model checkpoint
 def load_bigv_model(checkpoint_path, model):
     assert os.path.isfile(checkpoint_path)
     checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
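
Both loaders share one tolerant-loading pattern: copy each parameter from the checkpoint when present, otherwise keep the model's freshly initialized value. A minimal standalone sketch of that pattern, assuming the checkpoint stores its weights under a "model" key (the line defining saved_state_dict falls outside these hunks), and using an explicit membership test in place of the try/except:

import torch

def load_partial(checkpoint_path, model):
    # Copy matching keys; fall back to the model's current weights otherwise
    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
    saved_state_dict = checkpoint_dict["model"]  # storage key assumed
    new_state_dict = {}
    for k, v in model.state_dict().items():
        if k in saved_state_dict:
            new_state_dict[k] = saved_state_dict[k]
        else:
            print(f"{k} is not in the checkpoint")
            new_state_dict[k] = v  # keep the freshly initialized weight
    model.load_state_dict(new_state_dict)
    return model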
@@ -46,12 +48,12 @@ def load_bigv_model(checkpoint_path, model):
         try:
             new_state_dict[k] = saved_state_dict[k]
         except:
-            print("%s is not in the checkpoint" % k)
+            print(f"{k} is not in the checkpoint")
             new_state_dict[k] = v
     model.load_state_dict(new_state_dict)
     return model
 
-
+# Main Grad-TTS inference function
 @torch.no_grad()
 def gvc_main(device, model, _vec, _pit, spk, rature=1.015):
     l_vec = _vec.shape[0]
@@ -67,9 +69,8 @@ def gvc_main(device, model, _vec, _pit, spk, rature=1.015):
     y_dec = y_dec[:, :l_vec]
     return y_dec
 
-
+# Function to process input audio and extract features
 def svc_change(argswav, argsspk):
-
     argsvec = "svc_tmp.ppg.npy"
     os.system(f"python hubert/inference.py -w {argswav} -v {argsvec}")
     argspit = "svc_tmp.pit.npy"
@@ -79,11 +80,13 @@ def svc_change(argswav, argsspk):
     hps = OmegaConf.load('configs/base.yaml')
 
     print('Initializing Grad-TTS...')
-    model = GradTTS(hps.grad.n_mels, hps.grad.n_vecs, hps.grad.n_pits, hps.grad.n_spks, hps.grad.n_embs,
-                    hps.grad.n_enc_channels, hps.grad.filter_channels,
-                    hps.grad.dec_dim, hps.grad.beta_min, hps.grad.beta_max, hps.grad.pe_scale)
-    print('Number of encoder parameters = %.2fm' % (model.encoder.nparams/1e6))
-    print('Number of decoder parameters = %.2fm' % (model.decoder.nparams/1e6))
+    model = GradTTS(
+        hps.grad.n_mels, hps.grad.n_vecs, hps.grad.n_pits, hps.grad.n_spks,
+        hps.grad.n_embs, hps.grad.n_enc_channels, hps.grad.filter_channels,
+        hps.grad.dec_dim, hps.grad.beta_min, hps.grad.beta_max, hps.grad.pe_scale
+    )
+    print(f'Number of encoder parameters = {model.encoder.nparams/1e6:.2f}m')
+    print(f'Number of decoder parameters = {model.decoder.nparams/1e6:.2f}m')
 
     load_gvc_model('grad_pretrain/gvc.pretrain.pth', model)
     model.eval()
@@ -108,22 +111,21 @@ def svc_change(argswav, argsspk):
 
     with torch.no_grad():
        spk = spk.unsqueeze(0).to(device)
-
        all_frame = len_min
        hop_frame = 8
-       out_chunk = 2400  # 24 S
+       out_chunk = 2400  # 24 seconds
        out_index = 0
        mel = None
 
-       while (out_index < all_frame):
-           if (out_index == 0):  # start frame
+       while out_index < all_frame:
+           if out_index == 0:  # Start frame
                cut_s = 0
                cut_s_out = 0
            else:
                cut_s = out_index - hop_frame
                cut_s_out = hop_frame
 
-           if (out_index + out_chunk + hop_frame > all_frame):  # end frame
+           if out_index + out_chunk + hop_frame > all_frame:  # End frame
                cut_e = all_frame
                cut_e_out = -1
            else:
@@ -135,9 +137,9 @@ def svc_change(argswav, argsspk):
 
            sub_out = gvc_main(device, model, sub_vec, sub_pit, spk, 0.95)
            sub_out = sub_out[:, cut_s_out:cut_e_out]
 
            out_index = out_index + out_chunk
-           if mel == None:
+           if mel is None:
                mel = sub_out
            else:
                mel = torch.cat((mel, sub_out), -1)
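
The loop reworked in the two hunks above is overlapped chunked inference: the mel is generated in out_chunk-frame pieces, each carrying hop_frame extra context frames on either side that are trimmed from the output so chunk boundaries stay artifact-free. A self-contained sketch of the scheme; process_chunk is a hypothetical stand-in for gvc_main, and the non-boundary cut_e/cut_e_out values are reconstructed since they fall outside the hunk:

import torch

def chunked_infer(vec, pit, process_chunk, out_chunk=2400, hop_frame=8):
    all_frame = vec.shape[0]
    out_index = 0
    mel = None
    while out_index < all_frame:
        if out_index == 0:  # first chunk has no left context
            cut_s, cut_s_out = 0, 0
        else:
            cut_s, cut_s_out = out_index - hop_frame, hop_frame
        if out_index + out_chunk + hop_frame > all_frame:  # last chunk
            cut_e, cut_e_out = all_frame, None
        else:
            cut_e, cut_e_out = out_index + out_chunk + hop_frame, -hop_frame
        sub_out = process_chunk(vec[cut_s:cut_e], pit[cut_s:cut_e])
        # Trim the overlap so consecutive chunks butt together cleanly
        sub_out = sub_out[:, cut_s_out:cut_e_out]
        out_index += out_chunk
        mel = sub_out if mel is None else torch.cat((mel, sub_out), -1)
    return mel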
@@ -175,41 +177,55 @@ def svc_change(argswav, argsspk):
 
    return audio
 
-
+# Main function to handle audio input and conversion
 def svc_main(sid, input_audio):
    if input_audio is None:
-       return "You need to upload an audio", None
+       return "You need to upload an audio file", None
+
    sampling_rate, audio = input_audio
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
-   if (len(audio) > 16000*100):
-       audio = audio[:16000*100]
-   wav_path = "temp.wav"
-   soundfile.write(wav_path, audio, 16000, format="wav")
+   if len(audio) > 16000 * 100:
+       audio = audio[:16000 * 100]
+
+   # Separate vocals before conversion
+   # NOTE: assumes the python-audio-separator package provides Separator
+   from audio_separator.separator import Separator
+
+   temp_path = "temp.wav"
+   soundfile.write(temp_path, audio, 16000, format="wav")
+   separator = Separator()
+   separator.load_model()
+   output_names = {
+       "Vocals": "vocals_output",
+       "Instrumental": "instrumental_output",
+   }
+   # separate() takes a file path; stems are written as <name>.wav
+   output_files = separator.separate(temp_path, output_names)
+   wav_path = "vocals_output.wav"
    out_audio = svc_change(wav_path, f"configs/singers/singer00{sid}.npy")
-   return "Success", (32000, out_audio)
-
+   return "Conversion Successful", (32000, out_audio)
 
+# Gradio WebUI setup
 app = gr.Blocks()
 with app:
    with gr.Tabs():
        with gr.TabItem("Grad-SVC"):
            gr.Markdown(
-               "Based on Grad-TTS from HUAWEI Noah's Ark Lab\n\n"
-               "This project is named as [Grad-SVC](), or [GVC]() for short. Its core technology is diffusion, but so different from other diffusion based SVC models.\n\n"
-               "<video id='video' controls='' preload='yes'>\n\n"
-               "<source id='mp4' src='https://github.com/PlayVoice/Grad-SVC/assets/16432329/f9b66af7-b5b5-4efb-b73d-adb0dc84a0ae' type='video/mp4'>\n\n"
-               "</videos>\n\n"
-           )
-           sid = gr.Dropdown(label="音色", choices=[
-               "22", "33", "47", "51"], value="47")
-           vc_input3 = gr.Audio(label="上传音频")
-           vc_submit = gr.Button("转换", variant="primary")
-           vc_output1 = gr.Textbox(label="状态信息")
-           vc_output2 = gr.Audio(label="转换音频")
+               """
+               Based on Grad-TTS from HUAWEI Noah's Ark Lab
+
+               This project is named Grad-SVC, or GVC for short. Its core technology is diffusion, but it is very different from other diffusion-based SVC models.
+
+               <video id='video' controls='' preload='yes'>
+               <source id='mp4' src='https://github.com/PlayVoice/Grad-SVC/assets/16432329/f9b66af7-b5b5-4efb-b73d-adb0dc84a0ae' type='video/mp4'>
+               </video>
+               """
+           )
+           sid = gr.Dropdown(label="Voice Tone", choices=["22", "33", "47", "51"], value="47")
+           vc_input3 = gr.Audio(label="Upload Audio")
+           vc_submit = gr.Button("Convert", variant="primary")
+           vc_output1 = gr.Textbox(label="Status Information")
+           vc_output2 = gr.Audio(label="Converted Audio")
            vc_submit.click(svc_main, [sid, vc_input3], [vc_output1, vc_output2])
 
+# Launch the Gradio app
 app.launch(share=True)
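
The separation step added to svc_main assumes the python-audio-separator package (pip install audio-separator), whose documented pattern the added lines mirror. A minimal standalone sketch of that assumed dependency; the input path and output names below are illustrative, not part of the commit:

from audio_separator.separator import Separator

# Load the package's default separation model
separator = Separator()
separator.load_model()

# Map each stem to a custom output file name (files are written as <name>.wav)
output_names = {
    "Vocals": "vocals_output",
    "Instrumental": "instrumental_output",
}

# separate() expects an audio file path and returns the written file paths
output_files = separator.separate("temp.wav", output_names)
print(f"Separated stems: {output_files}")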