# sovits-test / pitch/core/swipe_slim.py
# Uploaded by atsushieee ("Upload folder using huggingface_hub"), revision 9791162, 6.77 kB
"""
| Description: libf0 SWIPE slim implementation
| Contributors: Sebastian Rosenzweig, Simon Schwär, Meinard Müller
| License: The MIT license, https://opensource.org/licenses/MIT
| This file is part of libf0.
"""
import numpy as np
import librosa
from .yin import parabolic_interpolation
from scipy.interpolate import interp1d
def swipe_slim(x, Fs=22050, H=256, F_min=55.0, F_max=1760.0, R=10, strength_threshold=0):
    """
    Slim and didactical implementation of a sawtooth waveform inspired pitch estimator (SWIPE).
    This version uses a log-frequency spectrogram instead of ERB filters. Furthermore, it is implemented more
    efficiently. See `swipe()` for the original implementation.

    .. [#] A. Camacho and J. G. Harris,
       "A sawtooth waveform inspired pitch estimator for speech and music."
       The Journal of the Acoustical Society of America, vol. 124, no. 3, pp. 1638–1652, Sep. 2008

    Parameters
    ----------
    x : ndarray
        Audio signal
    Fs : int
        Sampling rate
    H : int
        Hop size of the returned trajectory (the internal STFTs use their own hop of half a window)
    F_min : float or int
        Minimal frequency
    F_max : float or int
        Maximal frequency
    R : float
        resolution of the pitch candidate bins in cents (default = 10)
    strength_threshold : float
        confidence threshold [0, 1] for the pitch detection (default value = 0)

    Returns
    -------
    f0 : ndarray
        Estimated F0-trajectory
    t : ndarray
        Time axis
    conf : ndarray
        Confidence / Pitch Strength

    Notes
    -----
    Frames whose strongest candidate is the first or the last pitch candidate are returned without parabolic
    refinement, because the refinement needs a neighbour on both sides.
    """
    # compute time and frequency axis
    t = np.arange(0, len(x), H) / Fs  # time axis
    F_coef_log = np.arange(0, np.log2(Fs / 2 / F_min), R / 1200)  # candidate grid in octaves above F_min
    F_coef_log_hz = F_min * 2 ** F_coef_log  # pitch candidates

    # pre-compute kernels, one kernel for each pitch candidate in range [F_min : F_max]
    F_min_idx = np.argmin(np.abs(F_coef_log_hz - F_min))
    F_max_idx = np.argmin(np.abs(F_coef_log_hz - F_max))
    B = F_max_idx - F_min_idx  # Number of pitch candidates
    candidates_hz = F_coef_log_hz[F_min_idx:F_max_idx]
    kernels = np.zeros((B, len(F_coef_log_hz)))
    for i, f in enumerate(candidates_hz):
        kernels[i, :] = compute_kernel(f, F_coef_log_hz)

    # determine optimal window length for each candidate
    L_opt = np.log2(Fs * 8 / np.array([F_min, F_max]))  # exponents for optimal window sizes 2^L, see paper Section II.G
    L_rnd = np.arange(np.round(L_opt[1]), np.round(L_opt[0]) + 1).astype(np.int32)  # range of rounded exponents
    N_pow2 = 2 ** L_rnd  # Compute rounded power-2 windows sizes (ascending)

    # Quantization error between optimal window size (see paper Section II.G) and rounded power-2 windows size
    # Using only the largest N here, since errors for other N can be derived from err by subtracting exponent (cyclic)
    err = np.abs(np.log2(8 * Fs / candidates_hz) - np.log2(np.max(N_pow2)))

    S = np.zeros((B, len(t)))  # "pitch-strength" matrix

    # Loop through all window sizes, largest first: `err` counts octaves *below the largest window*, so the
    # loop index `octave` has to stand for the window N_max / 2**octave.
    for octave, N in enumerate(N_pow2[::-1].tolist()):
        # Compute STFT
        x_pad = np.pad(x, (0, N))  # to avoid problems during time axis interpolation
        hop = N // 2  # analysis hop of this window; kept separate from the output hop size H
        X = librosa.stft(x_pad, n_fft=N, hop_length=hop, win_length=N, window='hann', pad_mode='constant',
                         center=True)
        Y = np.abs(X)
        T_coef_lin_s = np.arange(0, X.shape[1]) * hop / Fs
        F_coef_lin_hz = np.arange(N // 2 + 1) * Fs / N

        # Resample to log-frequency axis
        compute_Y_log = interp1d(F_coef_lin_hz, Y, kind='cubic', axis=0)
        Y_log = compute_Y_log(F_coef_log_hz)

        # Normalize magnitudes
        Y_log /= np.sqrt(np.sum(Y_log ** 2, axis=0)) + np.finfo(float).eps

        # Correlate kernels with log-spectrum for pitch candidates where N is optimal
        S_N = np.matmul(kernels, Y_log)

        # Resample time axis
        compute_S_N_res = interp1d(T_coef_lin_s, S_N, kind='linear', axis=1)
        S_N_res = compute_S_N_res(t)

        # Weight pitch strength according to quantization error
        selected = (err > octave - 1) & (err < octave + 1)  # consider pitches +/- 1 octave from current window
        mu = 1 - np.abs(err[selected] - octave)
        S[selected, :] += np.multiply(mu.reshape(-1, 1), S_N_res[selected, :])

    # Obtain pitch estimates and corresponding confidence
    max_indices = np.argmax(S, axis=0)
    conf = np.max(S, axis=0)

    # Parabolic Interpolation of pitch estimates for refinement.
    # Neighbour rows are clamped to the valid range: without this, a maximum on the last row indexes past the
    # end of S and a maximum on row 0 wraps around to the last row.
    time_idx = np.arange(S.shape[1])
    lower = np.maximum(max_indices - 1, 0)
    upper = np.minimum(max_indices + 1, B - 1)
    indices_shift, _ = parabolic_interpolation(S[lower, time_idx],
                                               S[max_indices, time_idx],
                                               S[upper, time_idx])
    # A parabola through a clamped (duplicated) neighbour is meaningless, so border frames are left unrefined
    has_both_neighbours = (max_indices > 0) & (max_indices < B - 1)
    indices_shift = np.where(has_both_neighbours, indices_shift, 0.0)

    # Map (fractional) candidate rows back to the log-frequency grid; row i of S is entry F_min_idx + i of the grid.
    # Clipping keeps the lookup inside the range interp1d accepts.
    compute_f0_log = interp1d(np.arange(len(F_coef_log)), F_coef_log, kind='linear')
    grid_pos = np.clip(F_min_idx + max_indices + indices_shift, 0, len(F_coef_log) - 1)
    f0_hz = F_min * 2 ** compute_f0_log(grid_pos)

    # Thresholding
    f0_hz[conf < strength_threshold] = 0  # discard estimates where confidence is low

    return f0_hz, t, conf
def compute_kernel(f, F_coef_log_hz):
    """
    Build the SWIPE' correlation kernel of a single pitch candidate.

    The kernel lives on the given log-frequency axis. Around the first and the prime harmonics of `f` it carries
    the positive lobe of a cosine, between harmonics the negative lobe at half weight. It is then attenuated
    with the square root of the inverse frequency and scaled by the Euclidean norm of its positive part.

    Parameters
    ----------
    f : float
        Candidate frequency in Hz
    F_coef_log_hz : ndarray
        Logarithmic frequency axis in Hz

    Returns
    -------
    k : ndarray
        Kernel, same length as `F_coef_log_hz`
    """
    # Position of every frequency bin in units of the candidate frequency (1.0 = fundamental, 2.0 = octave, ...)
    harmonic_pos = F_coef_log_hz / f
    # Number of harmonics of f that fit below the top of the frequency axis
    harmonic_count = np.floor(F_coef_log_hz[-1] / f).astype(np.int32)
    # One-element float64 array so that the phase product below is always evaluated in double precision
    two_pi = np.array([2 * np.pi])

    kernel = np.zeros(len(F_coef_log_hz))
    # only consider prime harmonics (and the fundamental) for kernel peaks
    for harmonic in prime_and_one(100)[:harmonic_count]:
        distance = np.abs(harmonic_pos - harmonic)  # normalized distance between harmonic and each bin

        # Main lobe: bins closer than a quarter harmonic take the cosine value directly
        on_peak = distance < 0.25
        kernel[on_peak] = np.cos(two_pi * harmonic_pos[on_peak])

        # Side valleys: bins between a quarter and three quarters away accumulate half the cosine value
        in_valley = (distance > 0.25) & (distance < 0.75)
        kernel[in_valley] += np.cos(two_pi * harmonic_pos[in_valley]) / 2

    # Apply decay
    kernel = kernel * np.sqrt(1.0 / F_coef_log_hz)

    # K+-normalize kernel: only the positive entries enter the norm
    positive_norm = np.linalg.norm(kernel[kernel > 0])
    return kernel / positive_norm
def prime_and_one(upto=1000000):
    """
    Returns the number 1 followed by the prime numbers up to `upto`, adapted from http://rebrained.com/?p=458

    Parameters
    ----------
    upto : int
        Find prime numbers up to this number (inclusive)

    Returns
    -------
    ndarray
        Ascending integer array that starts with 1 and 2 and continues with the odd primes <= `upto`.
        The leading entries 1 and 2 are always part of the result.
    """
    # The sieve only tracks odd numbers 3, 5, 7, ...; the odd value v is stored at index (v - 2) // 2
    odd_numbers = np.arange(3, upto + 1, 2)
    # Builtin `bool` as dtype: the `np.bool8` alias is deprecated and no longer available in NumPy 2.x
    isprime = np.ones((upto - 1) // 2, dtype=bool)
    # Sieving with factors up to about sqrt(upto) is sufficient to cross out every composite
    for factor in odd_numbers[:int(np.sqrt(upto)) // 2]:
        if isprime[(factor - 2) // 2]:
            # Cross out 3*factor, 5*factor, ...: an index step of `factor` is a value step of 2*factor
            isprime[(factor * 3 - 2) // 2::factor] = False
    return np.concatenate((np.array([1, 2]), odd_numbers[isprime]))