Spaces:
Running
Running
import dataclasses | |
import pathlib | |
import audioflux | |
import librosa | |
import numpy as np | |
import parselmouth | |
import pyworld as pw | |
import resampy | |
import torch | |
import torchcrepe | |
import torchfcpe | |
from anyf0.rmvpe import RMVPE | |
def hz_to_cents(F, F_ref=55.0): | |
""" | |
Converts frequency in Hz to cents. | |
Parameters | |
---------- | |
F : float or ndarray | |
Frequency value in Hz | |
F_ref : float | |
Reference frequency in Hz (Default value = 55.0) | |
Returns | |
------- | |
F_cents : float or ndarray | |
Frequency in cents | |
""" | |
# Avoid division by 0 | |
F_temp = np.array(F).astype(float) | |
F_temp[F_temp == 0] = np.nan | |
F_cents = 1200 * np.log2(F_temp / F_ref) | |
return F_cents | |
class F0Extractor: | |
wav_path: pathlib.Path | |
sample_rate: int = 44100 | |
hop_length: int = 512 | |
f0_min: int = 50 | |
f0_max: int = 1600 | |
method: str = "praat_ac" | |
x: np.ndarray = dataclasses.field(init=False) | |
def __post_init__(self): | |
self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate) | |
def hop_size(self) -> float: | |
return self.hop_length / self.sample_rate | |
def wav16k(self) -> np.ndarray: | |
return resampy.resample(self.x, self.sample_rate, 16000) | |
def extract_f0(self) -> np.ndarray: | |
f0 = None | |
match self.method: | |
case "dio": | |
_f0, t = pw.dio( | |
self.x.astype("double"), | |
self.sample_rate, | |
f0_floor=self.f0_min, | |
f0_ceil=self.f0_max, | |
channels_in_octave=2, | |
frame_period=(1000 * self.hop_size), | |
) | |
f0 = pw.stonemask(self.x.astype("double"), _f0, t, self.sample_rate) | |
f0 = f0.astype("float") | |
case "harvest": | |
f0, _ = pw.harvest( | |
self.x.astype("double"), | |
self.sample_rate, | |
f0_floor=self.f0_min, | |
f0_ceil=self.f0_max, | |
frame_period=(1000 * self.hop_size), | |
) | |
f0 = f0.astype("float") | |
case "pyin": | |
f0, _, _ = librosa.pyin( | |
y=self.wav16k, | |
fmin=self.f0_min, | |
fmax=self.f0_max, | |
sr=16000, | |
hop_length=80, | |
) | |
case "piptrack": | |
pitches, magnitudes = librosa.piptrack( | |
y=self.wav16k, | |
fmin=self.f0_min, | |
fmax=self.f0_max, | |
sr=16000, | |
hop_length=80, | |
) | |
max_indexes = np.argmax(magnitudes, axis=0) | |
f0 = pitches[max_indexes, range(magnitudes.shape[1])] | |
case "cep" | "hps" | "lhs" | "ncf" | "pef": | |
f0 = { | |
"cep": audioflux.PitchCEP, | |
"hps": audioflux.PitchHPS, | |
"lhs": audioflux.PitchLHS, | |
"ncf": audioflux.PitchNCF, | |
"pef": audioflux.PitchPEF, | |
}[self.method]( | |
16000, | |
low_fre=self.f0_min, | |
high_fre=self.f0_max, | |
slide_length=80, | |
).pitch(np.pad(self.wav16k, (2048, 2048))) | |
case "stft": | |
f0, _ = audioflux.PitchSTFT( | |
16000, | |
low_fre=self.f0_min, | |
high_fre=self.f0_max, | |
slide_length=80, | |
).pitch(np.pad(self.wav16k, (2048, 2048))) | |
case "yin": | |
f0, _, _ = audioflux.PitchYIN( | |
16000, | |
low_fre=self.f0_min, | |
high_fre=self.f0_max, | |
slide_length=80, | |
).pitch(np.pad(self.wav16k, (2048, 2048))) | |
case "torchcrepe": | |
device = "cuda" if torch.cuda.is_available() else "cpu" | |
wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(device) | |
f0 = torchcrepe.predict( | |
wav16k_torch, | |
sample_rate=16000, | |
hop_length=80, | |
batch_size=1024, | |
fmin=self.f0_min, | |
fmax=self.f0_max, | |
device=device, | |
) | |
f0 = f0[0].cpu().numpy() | |
case "torchfcpe": | |
device = "cuda" if torch.cuda.is_available() else "cpu" | |
audio = librosa.to_mono(self.x) | |
audio_length = len(audio) | |
f0_target_length = (audio_length // self.hop_length) + 1 | |
audio = torch.from_numpy(audio).float().unsqueeze(0).unsqueeze(-1).to(device) | |
model = torchfcpe.spawn_bundled_infer_model(device=device) | |
f0 = model.infer( | |
audio, | |
sr=self.sample_rate, | |
decoder_mode='local_argmax', | |
threshold=0.006, | |
f0_min=self.f0_min, | |
f0_max=self.f0_max, | |
interp_uv=False, | |
output_interp_target_length=f0_target_length, | |
) | |
f0 = f0.squeeze().cpu().numpy() | |
case "rmvpe": | |
device = "cuda" if torch.cuda.is_available() else "cpu" | |
model_rmvpe = RMVPE( | |
"rmvpe.pt", | |
is_half=True, | |
device=device, | |
hop_length=80 | |
) | |
f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03) | |
case "praat_ac" | "praat_cc": | |
l_pad = int(np.ceil(1.5 / self.f0_min * self.sample_rate)) | |
r_pad = int(self.hop_size * ((len(self.x) - 1) // self.hop_size + 1) - len(self.x) + l_pad + 1) | |
f0 = ( | |
getattr( | |
parselmouth.Sound(np.pad(self.x, (l_pad, r_pad)), self.sample_rate), | |
"to_pitch_" + self.method.rpartition('_')[-1] | |
)( | |
time_step=self.hop_size, | |
voicing_threshold=0.6, | |
pitch_floor=self.f0_min, | |
pitch_ceiling=self.f0_max, | |
) | |
.selected_array["frequency"] | |
) | |
case "praat_shs": | |
l_pad = int(np.ceil(1.5 / self.f0_min * self.sample_rate)) | |
r_pad = int(self.hop_size * ((len(self.x) - 1) // self.hop_size + 1) - len(self.x) + l_pad + 1) | |
f0 = parselmouth.Sound( | |
np.pad(self.x, (l_pad, r_pad)), self.sample_rate | |
).to_pitch_shs( | |
time_step=self.hop_size, | |
minimum_pitch=self.f0_min, | |
maximum_frequency_component=self.f0_max, | |
).selected_array["frequency"] | |
case _: | |
raise ValueError(f"Unknown method: {self.method}") | |
return hz_to_cents(f0, librosa.midi_to_hz(0)) | |
def plot_f0(self, f0): | |
from matplotlib import pyplot as plt | |
plt.figure(figsize=(10, 4)) | |
plt.plot(f0) | |
plt.title(self.method) | |
plt.xlabel("Time (frames)") | |
plt.ylabel("F0 (cents)") | |
plt.show() |