# Spaces:
# Running
# Running
"""
| Description: libf0 SWIPE slim implementation
| Contributors: Sebastian Rosenzweig, Simon Schwär, Meinard Müller
| License: The MIT license, https://opensource.org/licenses/MIT
| This file is part of libf0.
"""
import numpy as np | |
import librosa | |
from .yin import parabolic_interpolation | |
from scipy.interpolate import interp1d | |
def swipe_slim(x, Fs=22050, H=256, F_min=55.0, F_max=1760.0, R=10, strength_threshold=0):
    """
    Slim and didactical implementation of a sawtooth waveform inspired pitch estimator (SWIPE).
    This version uses a log-frequency spectrogram instead of ERB filters. Furthermore, it is implemented more
    efficiently. See `swipe()` for the original implementation.

    .. [#] A. Camacho and J. G. Harris,
       "A sawtooth waveform inspired pitch estimator for speech and music."
       The Journal of the Acoustical Society of America, vol. 124, no. 3, pp. 1638–1652, Sep. 2008

    Parameters
    ----------
    x : ndarray
        Audio signal
    Fs : int
        Sampling rate
    H : int
        Hop size (in samples) of the returned trajectory
    F_min : float or int
        Minimal frequency
    F_max : float or int
        Maximal frequency (exclusive: candidates cover the range [F_min : F_max))
    R : float
        resolution of the pitch candidate bins in cents (default = 10)
    strength_threshold : float
        confidence threshold [0, 1] for the pitch detection (default value = 0)

    Returns
    -------
    f0 : ndarray
        Estimated F0-trajectory; set to 0 where the confidence is below `strength_threshold`
    t : ndarray
        Time axis (sample positions 0, H, 2H, ... divided by Fs)
    conf : ndarray
        Confidence / Pitch Strength
    """
    # compute time and frequency axis
    t = np.arange(0, len(x), H) / Fs  # time axis
    F_coef_log = np.arange(0, np.log2(Fs / 2 / F_min), R / 1200)  # octaves above F_min, in steps of R cents
    F_coef_log_hz = F_min * 2 ** F_coef_log  # log-spaced frequency axis from F_min up to (below) Fs/2

    # pre-compute kernels, one kernel for each pitch candidate in range [F_min : F_max]
    F_min_idx = np.argmin(np.abs(F_coef_log_hz - F_min))
    F_max_idx = np.argmin(np.abs(F_coef_log_hz - F_max))
    B = F_max_idx - F_min_idx  # Number of pitch candidates
    candidates_hz = F_coef_log_hz[F_min_idx:F_max_idx]
    kernels = np.zeros((B, len(F_coef_log_hz)))
    for i, f in enumerate(candidates_hz):
        kernels[i, :] = compute_kernel(f, F_coef_log_hz)

    # determine optimal window length for each candidate
    L_opt = np.log2(Fs * 8 / np.array([F_min, F_max]))  # exponents for optimal window sizes 2^L, see paper Section II.G
    L_rnd = np.arange(np.round(L_opt[1]), np.round(L_opt[0]) + 1).astype(np.int32)  # range of rounded exponents
    N_pow2 = 2 ** L_rnd  # Compute rounded power-2 windows sizes (ascending)

    # Quantization error between optimal window size (see paper Section II.G) and rounded power-2 windows size
    # Using only the largest N here, since errors for other N can be derived from err by subtracting exponent (cyclic)
    err = np.abs(np.log2(8 * Fs / candidates_hz) - np.log2(np.max(N_pow2)))

    S = np.zeros((B, len(t)))  # "pitch-strength" matrix

    # Loop through all window sizes, largest first: `err` counts octaves relative to the LARGEST window,
    # so loop index `octave` must correspond to the window max(N_pow2) / 2**octave.
    for octave, N in enumerate(N_pow2[::-1]):
        # Compute STFT
        x_pad = np.pad(x, (0, N))  # to avoid problems during time axis interpolation
        hop = N // 2  # analysis hop of this window; kept separate from the output hop size H
        X = librosa.stft(x_pad, n_fft=N, hop_length=hop, win_length=N, window='hann', pad_mode='constant',
                         center=True)
        Y = np.abs(X)
        T_coef_lin_s = np.arange(0, X.shape[1]) * hop / Fs
        F_coef_lin_hz = np.arange(N // 2 + 1) * Fs / N

        # Resample to log-frequency axis
        compute_Y_log = interp1d(F_coef_lin_hz, Y, kind='cubic', axis=0)
        Y_log = compute_Y_log(F_coef_log_hz)

        # Normalize magnitudes (eps avoids division by zero for all-zero frames)
        Y_log /= np.sqrt(np.sum(Y_log ** 2, axis=0)) + np.finfo(float).eps

        # Correlate kernels with log-spectrum for pitch candidates where N is optimal
        S_N = np.matmul(kernels, Y_log)

        # Resample time axis onto the output time axis t
        compute_S_N_res = interp1d(T_coef_lin_s, S_N, kind='linear', axis=1)
        S_N_res = compute_S_N_res(t)

        # Weight pitch strength according to quantization error
        in_range = (err > octave - 1) & (err < octave + 1)  # consider pitches +/- 1 octave from current window
        mu = 1 - np.abs(err[in_range] - octave)
        S[in_range, :] += np.multiply(mu.reshape(-1, 1), S_N_res[in_range, :])

    # Obtain pitch estimates and corresponding confidence
    max_indices = np.argmax(S, axis=0)
    conf = np.max(S, axis=0)

    # Parabolic Interpolation of pitch estimates for refinement.
    # A peak on the first/last candidate has no neighbor on one side: index -1 would silently wrap around to the
    # last row and index B would raise an IndexError. Neighbors are therefore clipped into range and the
    # refinement is skipped (shift = 0) for such boundary peaks.
    time_idx = np.arange(S.shape[1])
    has_both_neighbors = (max_indices > 0) & (max_indices < B - 1)
    lower_idx = np.clip(max_indices - 1, 0, B - 1)
    upper_idx = np.clip(max_indices + 1, 0, B - 1)
    index_shift, _ = parabolic_interpolation(S[lower_idx, time_idx],
                                             S[max_indices, time_idx],
                                             S[upper_idx, time_idx])
    index_shift = np.where(has_both_neighbors, index_shift, 0.0)

    # Map (fractional) candidate indices back to frequency; keep them inside the interpolation domain because
    # interp1d raises a ValueError for out-of-range inputs by default.
    compute_f0_log = interp1d(np.arange(len(F_coef_log)), F_coef_log, kind='linear')
    refined_idx = np.clip(max_indices + index_shift, 0, len(F_coef_log) - 1)
    f0_hz = F_min * 2 ** compute_f0_log(refined_idx)

    # Thresholding
    f0_hz[conf < strength_threshold] = 0  # discard estimates where confidence is low

    return f0_hz, t, conf
def compute_kernel(f, F_coef_log_hz):
    """
    Build the SWIPE' kernel of a single pitch candidate on the given frequency axis.

    Parameters
    ----------
    f : float
        Frequency of the pitch candidate in Hz
    F_coef_log_hz : ndarray
        Logarithmic frequency axis in Hz

    Returns
    -------
    kernel : ndarray
        Kernel with one value per bin of `F_coef_log_hz`
    """
    # frequency axis expressed in multiples of the candidate frequency
    harmonic_pos = F_coef_log_hz / f

    # Take the first `n_entries` values of [1, 2, 3, 5, 7, ...]; only 1 and prime harmonics produce kernel peaks.
    # Note that the slice limits the NUMBER of entries (the list itself ends at 97), not their value.
    n_entries = np.floor(F_coef_log_hz[-1] / f).astype(np.int32)
    harmonics = prime_and_one(100)[:n_entries]

    kernel = np.zeros(len(F_coef_log_hz))
    for harmonic in harmonics:
        # normalized distance of every bin to the current harmonic
        dist = np.abs(harmonic_pos - harmonic)

        # main lobe around the harmonic: overwrite with the cosine
        peak = dist < 0.25
        kernel[peak] = np.cos(2 * np.pi * harmonic_pos[peak])

        # flanks next to the main lobe: accumulate the cosine at half weight
        valley = (0.25 < dist) & (dist < 0.75)
        kernel[valley] += np.cos(2 * np.pi * harmonic_pos[valley]) / 2

    # Apply decay (1 / sqrt(frequency) envelope)
    kernel = kernel * np.sqrt(1.0 / F_coef_log_hz)

    # K+-normalize kernel: scale by the norm of the positive part only
    positive_norm = np.linalg.norm(kernel[kernel > 0])
    return kernel / positive_norm
def prime_and_one(upto=1000000):
    """
    Return 1, 2 and all odd prime numbers up to `upto`, adapted from http://rebrained.com/?p=458

    The odd numbers 3, 5, ..., `upto` are sieved; the values 1 and 2 are prepended unconditionally.

    Parameters
    ----------
    upto : int
        Find prime numbers up to this number (inclusive)

    Returns
    -------
    primes : ndarray
        Ascending array of prime numbers including 1 & 2
    """
    # odd candidates; candidate value v is stored at index (v - 3) / 2 == (v - 2) // 2
    primes = np.arange(3, upto + 1, 2)
    # builtin `bool` instead of the alias `np.bool8`, which is no longer available in NumPy >= 2.0
    isprime = np.ones((upto - 1) // 2, dtype=bool)
    for factor in primes[:int(np.sqrt(upto)) // 2]:
        if isprime[(factor - 2) // 2]:
            # strike out the odd multiples 3*factor, 5*factor, ... (index step `factor` == value step 2*factor)
            isprime[(factor * 3 - 2) // 2::factor] = False
    return np.concatenate((np.array([1, 2]), primes[isprime]))