Ryouko65777 committed
Commit ad0df31 · verified · 1 parent: eb1c669

Update vc_infer_pipeline.py

Files changed (1):
  1. vc_infer_pipeline.py +648 -646
vc_infer_pipeline.py CHANGED
@@ -1,646 +1,648 @@
import numpy as np, parselmouth, torch, pdb, sys, os
from time import time as ttime
import torch.nn.functional as F
import torchcrepe  # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
from torch import Tensor
import scipy.signal as signal
import pyworld, os, traceback, faiss, librosa, torchcrepe
from scipy import signal
from functools import lru_cache
+ import gc, re
+ import random

now_dir = os.getcwd()
sys.path.append(now_dir)

bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

input_audio_path2wav = {}


@lru_cache
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
    audio = input_audio_path2wav[input_audio_path]
    f0, t = pyworld.harvest(
        audio,
        fs=fs,
        f0_ceil=f0max,
        f0_floor=f0min,
        frame_period=frame_period,
    )
    f0 = pyworld.stonemask(audio, f0, t, fs)
    return f0


def change_rms(data1, sr1, data2, sr2, rate):  # 1 is the input audio, 2 is the output audio, rate is the weight given to 2
    # print(data1.max(),data2.max())
    rms1 = librosa.feature.rms(
        y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
    )  # one RMS point every half second
    rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
    rms1 = torch.from_numpy(rms1)
    rms1 = F.interpolate(
        rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
    ).squeeze()
    rms2 = torch.from_numpy(rms2)
    rms2 = F.interpolate(
        rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
    ).squeeze()
    rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
    data2 *= (
        torch.pow(rms1, torch.tensor(1 - rate))
        * torch.pow(rms2, torch.tensor(rate - 1))
    ).numpy()
    return data2


class VC(object):
    def __init__(self, tgt_sr, config):
        self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
            config.x_pad,
            config.x_query,
            config.x_center,
            config.x_max,
            config.is_half,
        )
        self.sr = 16000  # hubert input sample rate
        self.window = 160  # samples per frame
        self.t_pad = self.sr * self.x_pad  # padding before and after each segment
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sr * self.x_query  # search window around each cut point
        self.t_center = self.sr * self.x_center  # spacing between cut-point queries
        self.t_max = self.sr * self.x_max  # duration threshold below which no cut-point search is done
        self.device = config.device

    # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device)
    def get_optimal_torch_device(self, index: int = 0) -> torch.device:
        # Get cuda device
        if torch.cuda.is_available():
            return torch.device(
                f"cuda:{index % torch.cuda.device_count()}"
            )  # Very fast
        elif torch.backends.mps.is_available():
            return torch.device("mps")
        # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library
        # Otherwise return "cpu" as the torch device
        return torch.device("cpu")

    # Fork Feature: Compute f0 with the crepe method
    def get_f0_crepe_computation(
        self,
        x,
        f0_min,
        f0_max,
        p_len,
        hop_length=160,  # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
        model="full",  # Either use crepe-tiny "tiny" or crepe "full". Default is full
    ):
        x = x.astype(
            np.float32
        )  # fixes the F.conv2D exception. We needed to convert double to float.
        x /= np.quantile(np.abs(x), 0.999)
        torch_device = self.get_optimal_torch_device()
        audio = torch.from_numpy(x).to(torch_device, copy=True)
        audio = torch.unsqueeze(audio, dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True).detach()
        audio = audio.detach()
        print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
        pitch: Tensor = torchcrepe.predict(
            audio,
            self.sr,
            hop_length,
            f0_min,
            f0_max,
            model,
            batch_size=hop_length * 2,
            device=torch_device,
            pad=True,
        )
        p_len = p_len or x.shape[0] // hop_length
        # Resize the pitch for final f0
        source = np.array(pitch.squeeze(0).cpu().float().numpy())
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * p_len, len(source)) / p_len,
            np.arange(0, len(source)),
            source,
        )
        f0 = np.nan_to_num(target)
        return f0  # Resized f0

    def get_f0_official_crepe_computation(
        self,
        x,
        f0_min,
        f0_max,
        model="full",
    ):
        # Pick a batch size that doesn't cause memory errors on your gpu
        batch_size = 512
        # Compute pitch using first gpu
        audio = torch.tensor(np.copy(x))[None].float()
        f0, pd = torchcrepe.predict(
            audio,
            self.sr,
            self.window,
            f0_min,
            f0_max,
            model,
            batch_size=batch_size,
            device=self.device,
            return_periodicity=True,
        )
        pd = torchcrepe.filter.median(pd, 3)
        f0 = torchcrepe.filter.mean(f0, 3)
        f0[pd < 0.1] = 0
        f0 = f0[0].cpu().numpy()
        return f0

    # Fork Feature: Compute pYIN f0 method
    def get_f0_pyin_computation(self, x, f0_min, f0_max):
        y, sr = librosa.load("saudio/Sidney.wav", self.sr, mono=True)
        f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max)
        f0 = f0[1:]  # Get rid of extra first frame
        return f0

    # Fork Feature: Acquire median hybrid f0 estimation calculation
    def get_f0_hybrid_computation(
        self,
        methods_str,
        input_audio_path,
        x,
        f0_min,
        f0_max,
        p_len,
        filter_radius,
        crepe_hop_length,
        time_step,
    ):
        # Get various f0 methods from input to use in the computation stack
        s = methods_str
        s = s.split("hybrid")[1]
        s = s.replace("[", "").replace("]", "")
        methods = s.split("+")
        f0_computation_stack = []

        print("Calculating f0 pitch estimations for methods: %s" % str(methods))
        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)
        # Get f0 calculations for all methods specified
        for method in methods:
            f0 = None
            if method == "pm":
                f0 = (
                    parselmouth.Sound(x, self.sr)
                    .to_pitch_ac(
                        time_step=time_step / 1000,
                        voicing_threshold=0.6,
                        pitch_floor=f0_min,
                        pitch_ceiling=f0_max,
                    )
                    .selected_array["frequency"]
                )
                pad_size = (p_len - len(f0) + 1) // 2
                if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                    f0 = np.pad(
                        f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
                    )
            elif method == "crepe":
                f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
                f0 = f0[1:]  # Get rid of extra first frame
            elif method == "crepe-tiny":
                f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
                f0 = f0[1:]  # Get rid of extra first frame
            elif method == "mangio-crepe":
                f0 = self.get_f0_crepe_computation(
                    x, f0_min, f0_max, p_len, crepe_hop_length
                )
-             elif method == "mangio-crepe-tiny":
+             elif method == "crepe-lite":
                f0 = self.get_f0_crepe_computation(
                    x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
                )
            elif method == "harvest":
                f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
                if filter_radius > 2:
                    f0 = signal.medfilt(f0, 3)
                f0 = f0[1:]  # Get rid of first frame.
            elif method == "dio":  # Potentially buggy?
                f0, t = pyworld.dio(
                    x.astype(np.double),
                    fs=self.sr,
                    f0_ceil=f0_max,
                    f0_floor=f0_min,
                    frame_period=10,
                )
                f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
                f0 = signal.medfilt(f0, 3)
                f0 = f0[1:]
            # elif method == "pyin": Not Working just yet
            #     f0 = self.get_f0_pyin_computation(x, f0_min, f0_max)
            # Push method to the stack
            f0_computation_stack.append(f0)

        for fc in f0_computation_stack:
            print(len(fc))

        print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
        f0_median_hybrid = None
        if len(f0_computation_stack) == 1:
            f0_median_hybrid = f0_computation_stack[0]
        else:
            f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
        return f0_median_hybrid

    def get_f0(
        self,
        input_audio_path,
        x,
        p_len,
        f0_up_key,
        f0_method,
        filter_radius,
        crepe_hop_length,
        inp_f0=None,
    ):
        global input_audio_path2wav
        time_step = self.window / self.sr * 1000
        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        if f0_method == "pm":
            f0 = (
                parselmouth.Sound(x, self.sr)
                .to_pitch_ac(
                    time_step=time_step / 1000,
                    voicing_threshold=0.6,
                    pitch_floor=f0_min,
                    pitch_ceiling=f0_max,
                )
                .selected_array["frequency"]
            )
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                f0 = np.pad(
                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
                )
        elif f0_method == "harvest":
            input_audio_path2wav[input_audio_path] = x.astype(np.double)
            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
            if filter_radius > 2:
                f0 = signal.medfilt(f0, 3)
        elif f0_method == "dio":  # Potentially Buggy?
            f0, t = pyworld.dio(
                x.astype(np.double),
                fs=self.sr,
                f0_ceil=f0_max,
                f0_floor=f0_min,
                frame_period=10,
            )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
            f0 = signal.medfilt(f0, 3)
        elif f0_method == "crepe":
            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
        elif f0_method == "crepe-tiny":
            f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
        elif f0_method == "mangio-crepe":
            f0 = self.get_f0_crepe_computation(
                x, f0_min, f0_max, p_len, crepe_hop_length
            )
-         elif f0_method == "mangio-crepe-tiny":
+         elif f0_method == "crepe-lite":
            f0 = self.get_f0_crepe_computation(
                x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
            )
        elif f0_method == "rmvpe":
            if hasattr(self, "model_rmvpe") == False:
                from rmvpe import RMVPE

                print("loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "rmvpe.pt", is_half=self.is_half, device=self.device
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)

        elif "hybrid" in f0_method:
            # Perform hybrid median pitch estimation
            input_audio_path2wav[input_audio_path] = x.astype(np.double)
            f0 = self.get_f0_hybrid_computation(
                f0_method,
                input_audio_path,
                x,
                f0_min,
                f0_max,
                p_len,
                filter_radius,
                crepe_hop_length,
                time_step,
            )

        f0 *= pow(2, f0_up_key / 12)
        # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        tf0 = self.sr // self.window  # f0 points per second
        if inp_f0 is not None:
            delta_t = np.round(
                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
            ).astype("int16")
            replace_f0 = np.interp(
                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
            )
            shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
            f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
                :shape
            ]
        # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        f0bak = f0.copy()
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(np.int)

        return f0_coarse, f0bak  # 1-0

    def vc(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        times,
        index,
        big_npy,
        index_rate,
        version,
        protect,
    ):  # ,file_index,file_big_npy
        feats = torch.from_numpy(audio0)
        if self.is_half:
            feats = feats.half()
        else:
            feats = feats.float()
        if feats.dim() == 2:  # double channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)

        inputs = {
            "source": feats.to(self.device),
            "padding_mask": padding_mask,
            "output_layer": 9 if version == "v1" else 12,
        }
        t0 = ttime()
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
        if protect < 0.5 and pitch != None and pitchf != None:
            feats0 = feats.clone()
        if (
            isinstance(index, type(None)) == False
            and isinstance(big_npy, type(None)) == False
            and index_rate != 0
        ):
            npy = feats[0].cpu().numpy()
            if self.is_half:
                npy = npy.astype("float32")

            # _, I = index.search(npy, 1)
            # npy = big_npy[I.squeeze()]

            score, ix = index.search(npy, k=8)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

            if self.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        if protect < 0.5 and pitch != None and pitchf != None:
            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                0, 2, 1
            )
        t1 = ttime()
        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
            if pitch != None and pitchf != None:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]

        if protect < 0.5 and pitch != None and pitchf != None:
            pitchff = pitchf.clone()
            pitchff[pitchf > 0] = 1
            pitchff[pitchf < 1] = protect
            pitchff = pitchff.unsqueeze(-1)
            feats = feats * pitchff + feats0 * (1 - pitchff)
            feats = feats.to(feats0.dtype)
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            if pitch != None and pitchf != None:
                audio1 = (
                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
                    .data.cpu()
                    .float()
                    .numpy()
                )
            else:
                audio1 = (
                    (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
                )
        del feats, p_len, padding_mask
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        t2 = ttime()
        times[0] += t1 - t0
        times[2] += t2 - t1
        return audio1

    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,
        input_audio_path,
        times,
        f0_up_key,
        f0_method,
        file_index,
        # file_big_npy,
        index_rate,
        if_f0,
        filter_radius,
        tgt_sr,
        resample_sr,
        rms_mix_rate,
        version,
        protect,
        crepe_hop_length,
        f0_file=None,
    ):
        if (
            file_index != ""
            # and file_big_npy != ""
            # and os.path.exists(file_big_npy) == True
            and os.path.exists(file_index) == True
            and index_rate != 0
        ):
            try:
                index = faiss.read_index(file_index)
                # big_npy = np.load(file_big_npy)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except:
                traceback.print_exc()
                index = big_npy = None
        else:
            index = big_npy = None
        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for t in range(self.t_center, audio.shape[0], self.t_center):
                opt_ts.append(
                    t
                    - self.t_query
                    + np.where(
                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                    )[0][0]
                )
        s = 0
        audio_opt = []
        t = None
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        inp_f0 = None
        if hasattr(f0_file, "name") == True:
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                inp_f0 = []
                for line in lines:
                    inp_f0.append([float(i) for i in line.split(",")])
                inp_f0 = np.array(inp_f0, dtype="float32")
            except:
                traceback.print_exc()
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        pitch, pitchf = None, None
        if if_f0 == 1:
            pitch, pitchf = self.get_f0(
                input_audio_path,
                audio_pad,
                p_len,
                f0_up_key,
                f0_method,
                filter_radius,
                crepe_hop_length,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if self.device == "mps":
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        t2 = ttime()
        times[1] += t2 - t1
        for t in opt_ts:
            t = t // self.window * self.window
            if if_f0 == 1:
                audio_opt.append(
                    self.vc(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            else:
                audio_opt.append(
                    self.vc(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        None,
                        None,
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            s = t
        if if_f0 == 1:
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    pitch[:, t // self.window :] if t is not None else pitch,
                    pitchf[:, t // self.window :] if t is not None else pitchf,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        else:
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    None,
                    None,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        audio_opt = np.concatenate(audio_opt)
        if rms_mix_rate != 1:
            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            audio_opt = librosa.resample(
                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
            )
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt
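
For reference, the hybrid option handled by get_f0_hybrid_computation above expects its methods_str argument in the form hybrid[method1+method2+...], and parses it by splitting on "hybrid", stripping the square brackets, and splitting on "+". The snippet below is a minimal standalone sketch of just that parsing step; the example string, which combines the harvest option with the renamed crepe-lite option, is illustrative and not part of this commit.

# Minimal sketch of the method-string parsing done in get_f0_hybrid_computation.
# Only the string format ("hybrid[...]" with "+" separators) comes from the file above;
# the chosen methods are example values.
methods_str = "hybrid[harvest+crepe-lite]"
s = methods_str.split("hybrid")[1]       # "[harvest+crepe-lite]"
s = s.replace("[", "").replace("]", "")  # "harvest+crepe-lite"
methods = s.split("+")                   # ["harvest", "crepe-lite"]
print(methods)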