Adorable-Qin
committed on
Commit
·
2d4dca5
1
Parent(s):
7c4481d
Change F0 extractor to Crepe
Browse files
ckpts/svc/vocalist_l1_contentvec+whisper/args.json
CHANGED
|
@@ -140,7 +140,7 @@
|
|
| 140 |
"pin_memory": true,
|
| 141 |
"pitch_bin": 256,
|
| 142 |
"pitch_dir": "pitches",
|
| 143 |
-
"pitch_extractor": "parselmouth"
|
| 144 |
"pitch_max": 1100.0,
|
| 145 |
"pitch_min": 50.0,
|
| 146 |
"processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
|
|
|
|
| 140 |
"pin_memory": true,
|
| 141 |
"pitch_bin": 256,
|
| 142 |
"pitch_dir": "pitches",
|
| 143 |
+
"pitch_extractor": "crepe", // "parselmouth"
|
| 144 |
"pitch_max": 1100.0,
|
| 145 |
"pitch_min": 50.0,
|
| 146 |
"processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
|
utils/f0.py
CHANGED
|
@@ -207,7 +207,7 @@ def get_f0_features_using_harvest(audio, mel_len, fs, hop_length, f0_min, f0_max
|
|
| 207 |
return f0
|
| 208 |
|
| 209 |
|
| 210 |
-
def
|
| 211 |
audio, mel_len, fs, hop_length, hop_length_new, f0_min, f0_max, threshold=0.3
|
| 212 |
):
|
| 213 |
"""Using torchcrepe to extract the f0 feature.
|
|
@@ -259,6 +259,25 @@ def get_f0_features_using_crepe(
|
|
| 259 |
f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
|
| 260 |
return f0
|
| 261 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
def get_f0(audio, cfg):
|
| 264 |
if cfg.pitch_extractor == "dio":
|
|
@@ -267,6 +286,8 @@ def get_f0(audio, cfg):
|
|
| 267 |
f0 = get_f0_features_using_pyin(audio, cfg)
|
| 268 |
elif cfg.pitch_extractor == "parselmouth":
|
| 269 |
f0, _ = get_f0_features_using_parselmouth(audio, cfg)
|
|
|
|
|
|
|
| 270 |
# elif cfg.data.f0_extractor == 'cwt': # todo
|
| 271 |
|
| 272 |
return f0
|
|
|
|
| 207 |
return f0
|
| 208 |
|
| 209 |
|
| 210 |
+
def get_f0_features_using_crepe_legacy(
|
| 211 |
audio, mel_len, fs, hop_length, hop_length_new, f0_min, f0_max, threshold=0.3
|
| 212 |
):
|
| 213 |
"""Using torchcrepe to extract the f0 feature.
|
|
|
|
| 259 |
f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
|
| 260 |
return f0
|
| 261 |
|
| 262 |
+
def get_f0_features_using_crepe(audio, cfg):
|
| 263 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 264 |
+
audio_torch = torch.FloatTensor(audio).unsqueeze(0).to(device)
|
| 265 |
+
|
| 266 |
+
crepe_pitch, pd = torchcrepe.predict(audio_torch, cfg.sample_rate, cfg.hop_size, fmin=cfg.f0_min, fmax=cfg.f0_max, return_periodicity=True)
|
| 267 |
+
|
| 268 |
+
threshold = 0.3
|
| 269 |
+
|
| 270 |
+
# Filter, de-silence, set up threshold for unvoiced part
|
| 271 |
+
pd = torchcrepe.filter.median(pd, 3)
|
| 272 |
+
pd = torchcrepe.threshold.Silence(-60.0)(pd, audio_torch, cfg.sample_rate, 256)
|
| 273 |
+
crepe_pitch = torchcrepe.threshold.At(threshold)(crepe_pitch, pd)
|
| 274 |
+
crepe_pitch = torchcrepe.filter.mean(crepe_pitch, 3)
|
| 275 |
+
|
| 276 |
+
# Convert unvoiced part to 0hz
|
| 277 |
+
crepe_pitch = torch.where(torch.isnan(crepe_pitch), torch.full_like(crepe_pitch, 0), crepe_pitch)
|
| 278 |
+
|
| 279 |
+
return crepe_pitch[0].cpu().numpy()
|
| 280 |
+
|
| 281 |
|
| 282 |
def get_f0(audio, cfg):
|
| 283 |
if cfg.pitch_extractor == "dio":
|
|
|
|
| 286 |
f0 = get_f0_features_using_pyin(audio, cfg)
|
| 287 |
elif cfg.pitch_extractor == "parselmouth":
|
| 288 |
f0, _ = get_f0_features_using_parselmouth(audio, cfg)
|
| 289 |
+
elif cfg.pitch_extractor == "crepe":
|
| 290 |
+
f0 = get_f0_features_using_crepe(audio, cfg)
|
| 291 |
# elif cfg.data.f0_extractor == 'cwt': # todo
|
| 292 |
|
| 293 |
return f0
|