Spaces:

atsushieee
/

sovits-test

Running

App Files Files Community

sovits-test / pitch /core /swipe_slim.py

atsushieee

Upload folder using huggingface_hub

9791162 3 months ago

raw

history blame

6.77 kB

	"""
	| Description: libf0 SWIPE slim implementation
	| Contributors: Sebastian Rosenzweig, Simon Schwär, Meinard Müller
	| License: The MIT license, https://opensource.org/licenses/MIT
	| This file is part of libf0.
	"""
	import numpy as np
	import librosa
	from .yin import parabolic_interpolation
	from scipy.interpolate import interp1d


	def swipe_slim(x, Fs=22050, H=256, F_min=55.0, F_max=1760.0, R=10, strength_threshold=0):
	    """
	    Slim and didactical implementation of a sawtooth waveform inspired pitch estimator (SWIPE).
	    This version uses a log-frequency spectrogram instead of ERB filters. Furthermore, it is implemented more
	    efficiently. See `swipe()` for the original implementation.

	    .. [#] A. Camacho and J. G. Harris,
	       "A sawtooth waveform inspired pitch estimator for speech and music."
	       The Journal of the Acoustical Society of America, vol. 124, no. 3, pp. 1638–1652, Sep. 2008

	    Parameters
	    ----------
	    x : ndarray
	        Audio signal
	    Fs : int
	        Sampling rate
	    H : int
	        Hop size of the returned time axis. The STFTs computed internally use a hop of half the
	        respective window size; their results are resampled onto this time axis.
	    F_min : float or int
	        Minimal frequency
	    F_max : float or int
	        Maximal frequency
	    R : float
	        resolution of the pitch candidate bins in cents (default = 10)
	    strength_threshold : float
	        confidence threshold [0, 1] for the pitch detection (default value = 0)

	    Returns
	    -------
	    f0 : ndarray
	        Estimated F0-trajectory. Frames with confidence below `strength_threshold` are set to 0.
	        Frames whose strongest candidate is the first or last pitch bin are returned without
	        parabolic refinement.
	    t : ndarray
	        Time axis
	    conf : ndarray
	        Confidence / Pitch Strength
	    """

	    # compute time and frequency axis
	    t = np.arange(0, len(x), H) / Fs  # time axis
	    F_coef_log = np.arange(0, np.log2(Fs/2/F_min), R/1200)
	    F_coef_log_hz = F_min * 2 ** F_coef_log  # pitch candidates

	    # pre-compute kernels, one kernel for each pitch candidate in range [F_min : F_max]
	    F_min_idx = np.argmin(np.abs(F_coef_log_hz - F_min))
	    F_max_idx = np.argmin(np.abs(F_coef_log_hz - F_max))
	    B = F_max_idx - F_min_idx  # Number of pitch candidates
	    kernels = np.zeros((B, len(F_coef_log_hz)))
	    for i, f in enumerate(F_coef_log_hz[F_min_idx:F_max_idx]):
	        kernels[i, :] = compute_kernel(f, F_coef_log_hz)

	    # determine optimal window length for each candidate
	    L_opt = np.log2(Fs * 8 / np.array([F_min, F_max]))  # exponents for optimal window sizes 2^L, see paper Section II.G
	    L_rnd = np.arange(np.round(L_opt[1]), np.round(L_opt[0])+1).astype(np.int32)  # range of rounded exponents
	    N_pow2 = 2 ** L_rnd  # Compute rounded power-2 windows sizes
	    # Quantization error between optimal window size (see paper Section II.G) and rounded power-2 windows size
	    # Using only the largest N here, since errors for other N can be derived from err by subtracting exponent (cyclic)
	    err = np.abs(np.log2(8 * Fs / F_coef_log_hz[F_min_idx:F_max_idx]) - np.log2(np.max(N_pow2)))

	    S = np.zeros((B, len(t)))  # "pitch-strength" matrix

	    # loop through all window sizes, largest first: err is measured in octaves relative to the
	    # largest window, so `octave` must count how many octaves N lies below np.max(N_pow2)
	    for octave, N in enumerate(N_pow2[::-1]):
	        # Compute STFT
	        x_pad = np.pad(x, (0, N))  # to avoid problems during time axis interpolation
	        hop_N = N // 2  # per-window hop, kept separate from the output hop size H
	        X = librosa.stft(x_pad, n_fft=N, hop_length=hop_N, win_length=N, window='hann', pad_mode='constant',
	                         center=True)
	        Y = np.abs(X)
	        T_coef_lin_s = np.arange(0, X.shape[1]) * hop_N / Fs
	        F_coef_lin_hz = np.arange(N // 2 + 1) * Fs / N

	        # Resample to log-frequency axis
	        compute_Y_log = interp1d(F_coef_lin_hz, Y, kind='cubic', axis=0)
	        Y_log = compute_Y_log(F_coef_log_hz)

	        # Normalize magnitudes (eps keeps all-zero frames from dividing by zero)
	        Y_log /= np.sqrt(np.sum(Y_log ** 2, axis=0)) + np.finfo(float).eps

	        # Correlate kernels with log-spectrum for pitch candidates where N is optimal
	        S_N = np.matmul(kernels, Y_log)

	        # Resample time axis
	        compute_S_N_res = interp1d(T_coef_lin_s, S_N, kind='linear', axis=1)
	        S_N_res = compute_S_N_res(t)

	        # Weight pitch strength according to quantization error
	        candidates = (err > octave - 1) & (err < octave + 1)  # consider pitches +/- 1 octave from current window
	        mu = 1 - np.abs(err[candidates] - octave)

	        S[candidates, :] += np.multiply(mu.reshape(-1, 1), S_N_res[candidates, :])

	    # Obtain pitch estimates and corresponding confidence
	    max_indices = np.argmax(S, axis=0)
	    conf = np.max(S, axis=0)

	    # Parabolic Interpolation of pitch estimates for refinement.
	    # Only maxima with a neighbour on both sides are refined: row -1 would wrap around to the
	    # last candidate and row B would raise an IndexError.
	    time_idx = np.arange(S.shape[1])
	    indeces_shift = np.zeros(len(max_indices))
	    interior = (max_indices > 0) & (max_indices < B - 1)
	    if np.any(interior):
	        rows = max_indices[interior]
	        cols = time_idx[interior]
	        shift, _ = parabolic_interpolation(S[rows - 1, cols],
	                                           S[rows, cols],
	                                           S[rows + 1, cols])
	        indeces_shift[interior] = shift

	    compute_f0_log = interp1d(np.arange(len(F_coef_log)), F_coef_log, kind='linear')
	    # keep the refined (fractional) bin index inside the range interp1d was built on
	    refined_idx = np.clip(max_indices + indeces_shift, 0, len(F_coef_log) - 1)
	    f0_hz = F_min * 2 ** compute_f0_log(refined_idx)

	    # Thresholding
	    f0_hz[conf < strength_threshold] = 0  # discard estimates where confidence is low

	    return f0_hz, t, conf


	def compute_kernel(f, F_coef_log_hz):
	    """
	    Compute a SWIPE' kernel for a single pitch candidate.

	    The kernel is a cosine over the candidate-relative frequency axis, kept only around
	    the harmonics taken from `prime_and_one`: full weight within a quarter harmonic of each
	    of them and half weight in the adjacent valleys. It is then scaled by 1/sqrt(frequency)
	    and divided by the norm of its positive part.

	    Parameters
	    ----------
	    f : float
	        Frequency in Hz
	    F_coef_log_hz : ndarray
	        Logarithmic frequency axis in Hz

	    Returns
	    -------
	    k : ndarray
	        Kernel, one value per entry of `F_coef_log_hz`
	    """
	    # Frequency axis expressed in multiples of the candidate frequency
	    harmonic_pos = F_coef_log_hz / f

	    # Number of harmonics of f that fit below the top of the frequency axis
	    harmonic_count = np.floor(F_coef_log_hz[-1] / f).astype(np.int32)
	    # only consider prime harmonics (and the fundamental) for kernel peaks
	    harmonics = prime_and_one(100)[:harmonic_count]

	    two_pi = np.full(1, 2 * np.pi)
	    kernel = np.zeros(len(F_coef_log_hz))

	    for harmonic in harmonics:
	        # normalized distance between this harmonic and every frequency bin
	        dist = np.abs(harmonic_pos - harmonic)

	        # main peak: cosine lobe within a quarter harmonic (overwrites earlier content)
	        peak = dist < 0.25
	        kernel[peak] = np.cos(two_pi * harmonic_pos[peak])

	        # valleys on both sides of the peak: half-weight cosine, accumulated
	        valley = (dist > 0.25) & (dist < 0.75)
	        kernel[valley] += np.cos(two_pi * harmonic_pos[valley]) / 2

	    # Apply decay
	    kernel = kernel * np.sqrt(1.0 / F_coef_log_hz)

	    # K+-normalize kernel: scale by the norm of the positive entries only
	    positive_norm = np.linalg.norm(kernel[kernel > 0])
	    return kernel / positive_norm


	def prime_and_one(upto=1000000):
	    """
	    Return 1 followed by the prime numbers up to `upto`, adapted from http://rebrained.com/?p=458

	    Only odd numbers are sieved; 1 and 2 are always prepended to the result.

	    Parameters
	    ----------
	    upto : int
	        Find prime numbers up to this number

	    Returns
	    -------
	    ndarray
	        Ascending integer array: 1, 2, then every odd prime <= `upto`
	    """
	    primes = np.arange(3, upto+1, 2)  # odd candidates 3, 5, 7, ...
	    # np.bool8 was removed in NumPy 2.0; the builtin bool is the portable dtype.
	    # One flag per odd candidate; the odd number m lives at index (m-2)//2.
	    isprime = np.ones(primes.size, dtype=bool)
	    for factor in primes[:int(np.sqrt(upto))//2]:
	        if isprime[(factor-2)//2]:
	            # strike out the odd multiples 3*factor, 5*factor, ...
	            isprime[(factor*3-2)//2::factor] = False
	    return np.concatenate((np.array([1, 2]), primes[isprime]))