Spaces:
Running
Running
File size: 6,772 Bytes
9791162 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
"""
| Description: libf0 SWIPE slim implementation
| Contributors: Sebastian Rosenzweig, Simon Schwär, Meinard Müller
| License: The MIT license, https://opensource.org/licenses/MIT
| This file is part of libf0.
"""
import numpy as np
import librosa
from .yin import parabolic_interpolation
from scipy.interpolate import interp1d
def swipe_slim(x, Fs=22050, H=256, F_min=55.0, F_max=1760.0, R=10, strength_threshold=0):
    """
    Slim and didactical implementation of a sawtooth waveform inspired pitch estimator (SWIPE).

    This version uses a log-frequency spectrogram instead of ERB filters. Furthermore, it is implemented more
    efficiently. See `swipe()` for the original implementation.

    .. [#] A. Camacho and J. G. Harris,
       "A sawtooth waveform inspired pitch estimator for speech and music."
       The Journal of the Acoustical Society of America, vol. 124, no. 3, pp. 1638–1652, Sep. 2008

    Parameters
    ----------
    x : ndarray
        Audio signal
    Fs : int
        Sampling rate
    H : int
        Hop size (in samples) of the returned time axis
    F_min : float or int
        Minimal frequency
    F_max : float or int
        Maximal frequency
    R : float
        resolution of the pitch candidate bins in cents (default = 10)
    strength_threshold : float
        confidence threshold [0, 1] for the pitch detection (default value = 0)

    Returns
    -------
    f0 : ndarray
        Estimated F0-trajectory, set to 0 where the confidence is below `strength_threshold`
    t : ndarray
        Time axis
    conf : ndarray
        Confidence / Pitch Strength
    """
    # compute time and frequency axis
    t = np.arange(0, len(x), H) / Fs  # time axis
    F_coef_log = np.arange(0, np.log2(Fs/2/F_min), R/1200)  # octaves above F_min in steps of R cents
    F_coef_log_hz = F_min * 2 ** F_coef_log  # pitch candidates

    # pre-compute kernels, one kernel for each pitch candidate in range [F_min : F_max]
    F_min_idx = np.argmin(np.abs(F_coef_log_hz - F_min))
    F_max_idx = np.argmin(np.abs(F_coef_log_hz - F_max))
    B = F_max_idx - F_min_idx  # Number of pitch candidates
    kernels = np.zeros((B, len(F_coef_log_hz)))
    for i, f in enumerate(F_coef_log_hz[F_min_idx:F_max_idx]):
        kernels[i, :] = compute_kernel(f, F_coef_log_hz)

    # determine optimal window length for each candidate
    L_opt = np.log2(Fs * 8 / np.array([F_min, F_max]))  # exponents for optimal window sizes 2^L, see paper Section II.G
    L_rnd = np.arange(np.round(L_opt[1]), np.round(L_opt[0])+1).astype(np.int32)  # range of rounded exponents
    N_pow2 = 2 ** L_rnd  # Compute rounded power-2 windows sizes (ascending)

    # Quantization error between optimal window size (see paper Section II.G) and rounded power-2 windows size
    # Using only the largest N here, since errors for other N can be derived from err by subtracting exponent (cyclic)
    err = np.abs(np.log2(8 * Fs / F_coef_log_hz[F_min_idx:F_max_idx]) - np.log2(np.max(N_pow2)))

    S = np.zeros((B, len(t)))  # "pitch-strength" matrix

    # loop through all window sizes, largest first: err is measured relative to the largest window, so `octave`
    # has to count how many octaves the current window lies below it (err - octave is then the error for N)
    for octave, N in enumerate(N_pow2[::-1]):
        # Compute STFT
        x_pad = np.pad(x, (0, N))  # to avoid problems during time axis interpolation
        H_N = N // 2  # analysis hop for this window size, kept separate from the output hop size H
        X = librosa.stft(x_pad, n_fft=N, hop_length=H_N, win_length=N, window='hann', pad_mode='constant',
                         center=True)
        Y = np.abs(X)
        T_coef_lin_s = np.arange(0, X.shape[1]) * H_N / Fs
        F_coef_lin_hz = np.arange(N // 2 + 1) * Fs / N

        # Resample to log-frequency axis
        compute_Y_log = interp1d(F_coef_lin_hz, Y, kind='cubic', axis=0)
        Y_log = compute_Y_log(F_coef_log_hz)

        # Normalize magnitudes (eps avoids a division by zero for silent frames)
        Y_log /= np.sqrt(np.sum(Y_log ** 2, axis=0)) + np.finfo(float).eps

        # Correlate kernels with log-spectrum for pitch candidates where N is optimal
        S_N = np.matmul(kernels, Y_log)

        # Resample time axis to the output time axis t
        compute_S_N_res = interp1d(T_coef_lin_s, S_N, kind='linear', axis=1)
        S_N_res = compute_S_N_res(t)

        # Weight pitch strength according to quantization error
        candidates = (err > octave - 1) & (err < octave + 1)  # consider pitches +/- 1 octave from current window
        mu = 1 - np.abs(err[candidates] - octave)
        S[candidates, :] += np.multiply(mu.reshape(-1, 1), S_N_res[candidates, :])

    # Obtain pitch estimates and corresponding confidence
    max_indices = np.argmax(S, axis=0)
    conf = np.max(S, axis=0)

    # Parabolic Interpolation of pitch estimates for refinement.
    # Neighbour indices are clipped to the valid candidate range so that a maximum in the last row does not
    # raise an IndexError and a maximum in the first row does not wrap around to the last row.
    time_idx = np.arange(S.shape[1])
    last_idx = S.shape[0] - 1
    lower_idx = np.maximum(max_indices - 1, 0)
    upper_idx = np.minimum(max_indices + 1, last_idx)
    indices_shift, _ = parabolic_interpolation(S[lower_idx, time_idx],
                                               S[max_indices, time_idx],
                                               S[upper_idx, time_idx])
    # A maximum at the border has only one real neighbour, so no refinement is applied there
    at_edge = (max_indices == 0) | (max_indices == last_idx)
    indices_shift = np.where(at_edge, 0.0, indices_shift)

    # Map (fractional) candidate indices back to Hz; candidate row 0 corresponds to F_coef_log[0], i.e. F_min
    compute_f0_log = interp1d(np.arange(len(F_coef_log)), F_coef_log, kind='linear')
    f0_hz = F_min * 2 ** compute_f0_log(max_indices + indices_shift)

    # Thresholding
    f0_hz[conf < strength_threshold] = 0  # discard estimates where confidence is low

    return f0_hz, t, conf
def compute_kernel(f, F_coef_log_hz):
    """
    Compute a SWIPE' kernel.

    The kernel has cosine-shaped positive lobes around the first and prime harmonics of `f`, half-weighted
    cosine valleys next to them, a 1/sqrt(frequency) decay, and is normalized by the norm of its positive part.

    Parameters
    ----------
    f : float
        Frequency in Hz
    F_coef_log_hz : ndarray
        Logarithmic frequency axis in Hz

    Returns
    -------
    k : ndarray
        Kernel
    """
    # position of every frequency bin expressed in multiples of f
    harmonic_pos = F_coef_log_hz / f

    # number of harmonics of f below the highest bin; used to truncate the list [1, 2, 3, 5, 7, ...]
    harmonic_count = int(np.floor(F_coef_log_hz[-1] / f))
    harmonics = prime_and_one(100)[:harmonic_count]  # only consider prime harmonics for kernel peaks

    kernel = np.zeros(len(F_coef_log_hz))
    two_pi = 2 * np.pi

    for harmonic in harmonics:
        # normalized distance between the harmonic and every bin
        distance = np.abs(harmonic_pos - harmonic)

        # main lobe: overwrite with the full cosine
        in_peak = distance < 0.25
        kernel[in_peak] = np.cos(two_pi * harmonic_pos[in_peak])

        # neighbouring valleys: accumulate half of the (negative) cosine
        in_valley = (distance > 0.25) & (distance < 0.75)
        kernel[in_valley] += np.cos(two_pi * harmonic_pos[in_valley]) / 2

    # Apply decay
    kernel = kernel * np.sqrt(1.0 / F_coef_log_hz)

    # K+-normalize kernel (norm of the positive part only)
    positive_norm = np.linalg.norm(kernel[kernel > 0])
    return kernel / positive_norm
def prime_and_one(upto=1000000):
    """
    Returns 1 followed by the prime numbers up to `upto`, adapted from http://rebrained.com/?p=458

    Parameters
    ----------
    upto : int
        Find prime numbers up to this number

    Returns
    -------
    primes : ndarray
        Sorted array starting with 1 and 2, followed by the odd primes <= `upto`
    """
    # odd sieve: entry i of both arrays refers to the odd number 2 * i + 3
    primes = np.arange(3, upto+1, 2)
    # builtin bool instead of np.bool8, which was removed in NumPy 2.0
    isprime = np.ones((upto-1)//2, dtype=bool)
    for factor in primes[:int(np.sqrt(upto))//2]:
        if isprime[(factor-2)//2]:
            # cross out the odd multiples 3 * factor, 5 * factor, ... (a step of `factor` entries is 2 * factor)
            isprime[(factor*3-2)//2::factor] = False
    return np.concatenate((np.array([1, 2]), primes[isprime]))
|