""" | Description: libf0 SWIPE slim implementation | Contributors: Sebastian Rosenzweig, Simon Schwär, Meinard Müller | License: The MIT license, https://opensource.org/licenses/MIT | This file is part of libf0. """ import numpy as np import librosa from .yin import parabolic_interpolation from scipy.interpolate import interp1d def swipe_slim(x, Fs=22050, H=256, F_min=55.0, F_max=1760.0, R=10, strength_threshold=0): """ Slim and didactical implementation of a sawtooth waveform inspired pitch estimator (SWIPE). This version uses a log-frequency spectrogram instead of ERB filters. Furthermore, it is implemented more efficiently. See `swipe()` for the original implementation. .. [#] A. Camacho and J. G. Harris, "A sawtooth waveform inspired pitch estimator for speech and music." The Journal of the Acoustical Society of America, vol. 124, no. 3, pp. 1638–1652, Sep. 2008 Parameters ---------- x : ndarray Audio signal Fs : int Sampling rate H : int Hop size F_min : float or int Minimal frequency F_max : float or int Maximal frequency R : float resolution of the pitch candidate bins in cents (default = 10) strength_threshold : float confidence threshold [0, 1] for the pitch detection (default value = 0) Returns ------- f0 : ndarray Estimated F0-trajectory t : ndarray Time axis conf : ndarray Confidence / Pitch Strength """ # compute time and frequency axis t = np.arange(0, len(x), H) / Fs # time axis F_coef_log = np.arange(0, np.log2(Fs/2/F_min), R/1200) F_coef_log_hz = F_min * 2 ** F_coef_log # pitch candidates # pre-compute kernels, one kernel for each pitch candidate in range [F_min : F_max] F_min_idx = np.argmin(np.abs(F_coef_log_hz - F_min)) F_max_idx = np.argmin(np.abs(F_coef_log_hz - F_max)) B = F_max_idx - F_min_idx # Number of pitch candidates kernels = np.zeros((B, len(F_coef_log_hz))) for i, f in enumerate(F_coef_log_hz[F_min_idx:F_max_idx]): kernels[i, :] = compute_kernel(f, F_coef_log_hz) # determine optimal window length for each candidate L_opt = np.log2(Fs * 8 / np.array([F_min, F_max])) # exponents for optimal window sizes 2^L, see paper Section II.G L_rnd = np.arange(np.round(L_opt[1]), np.round(L_opt[0])+1).astype(np.int32) # range of rounded exponents N_pow2 = 2 ** L_rnd # Compute rounded power-2 windows sizes # Quantization error between optimal window size (see paper Section II.G) and rounded power-2 windows size # Using only the largest N here, since errors for other N can be derived from err by subtracting exponent (cyclic) err = np.abs(np.log2(8 * Fs / F_coef_log_hz[F_min_idx:F_max_idx]) - np.log2(np.max(N_pow2))) S = np.zeros((B, len(t))) # "pitch-strength" matrix # loop through all window sizes for octave, N in enumerate(N_pow2): # Compute STFT x_pad = np.pad(x, (0, N)) # to avoid problems during time axis interpolation H = N // 2 X = librosa.stft(x_pad, n_fft=N, hop_length=H, win_length=N, window='hann', pad_mode='constant', center=True) Y = np.abs(X) T_coef_lin_s = np.arange(0, X.shape[1]) * H / Fs F_coef_lin_hz = np.arange(N // 2 + 1) * Fs / N # Resample to log-frequency axis compute_Y_log = interp1d(F_coef_lin_hz, Y, kind='cubic', axis=0) Y_log = compute_Y_log(F_coef_log_hz) # Normalize magnitudes Y_log /= np.sqrt(np.sum(Y_log ** 2, axis=0)) + np.finfo(float).eps # Correlate kernels with log-spectrum for pitch candidates where N is optimal S_N = np.matmul(kernels, Y_log) # Resample time axis compute_S_N_res = interp1d(T_coef_lin_s, S_N, kind='linear', axis=1) S_N_res = compute_S_N_res(t) # Weight pitch strength according to quantization error candidates = (err > octave - 1) & (err < octave + 1) # consider pitches +/- 1 octave from current window mu = 1 - np.abs(err[candidates] - octave) S[candidates, :] += np.multiply(mu.reshape(-1, 1), S_N_res[candidates, :]) # Obtain pitch estimates and corresponding confidence max_indices = np.argmax(S, axis=0) conf = np.max(S, axis=0) # Parabolic Interpolation of pitch estimates for refinement time_idx = np.arange(S.shape[1]) indeces_shift, _ = parabolic_interpolation(S[max_indices-1, time_idx], S[max_indices, time_idx], S[max_indices+1, time_idx]) compute_f0_log = interp1d(np.arange(len(F_coef_log)), F_coef_log, kind='linear') f0_hz = F_min * 2 ** compute_f0_log(max_indices+indeces_shift) # Thresholding f0_hz[conf < strength_threshold] = 0 # discard estimates where confidence is low return f0_hz, t, conf def compute_kernel(f, F_coef_log_hz): """ Compute a SWIPE' kernel. Parameters ---------- f : float Frequency in Hz F_coef_log_hz : Logarithmic frequency axis in Hz Returns ------- k : ndarray Kernel """ k = np.zeros(len(F_coef_log_hz)) n_harmonics = np.floor(F_coef_log_hz[-1] / f).astype(np.int32) prime_numbers = prime_and_one(100)[:n_harmonics] # only consider prime harmonics for kernel peaks ratio = F_coef_log_hz / f # loop through all prime harmonics for p in prime_numbers: a = np.abs(ratio - p) # normalized distance between harmonic and current pitch candidate main_peak_bins = a < 0.25 k[main_peak_bins] = np.cos(np.dot(np.array(2 * np.pi).reshape(-1, 1), ratio[main_peak_bins].reshape(1, -1))).flatten() valley_bins = np.logical_and(0.25 < a, a < 0.75) k[valley_bins] += np.cos(np.dot(np.array(2 * np.pi).reshape(-1, 1), ratio[valley_bins].reshape(1, -1))).flatten() / 2 # Apply decay k = np.multiply(k, np.sqrt(1.0 / F_coef_log_hz)) # K+-normalize kernel k = k / np.linalg.norm(k[k > 0]) return k def prime_and_one(upto=1000000): """ Returns a set of prime numbers, adapted from http://rebrained.com/?p=458 Parameters ---------- upto : int Find prime numbers up to this number Returns ------- A set of prime numbers including 1 & 2 """ primes = np.arange(3, upto+1, 2) isprime = np.ones((upto-1)//2, dtype=np.bool8) for factor in primes[:int(np.sqrt(upto))//2]: if isprime[(factor-2)//2]: isprime[(factor*3-2)//2::factor] = 0 return np.concatenate((np.array([1, 2]), primes[isprime]))