Spaces:

atsushieee
/

sovits-test

Running

App Files Files Community

sovits-test / pitch /core /yin.py

atsushieee

Upload folder using huggingface_hub

9791162 2 months ago

raw

history blame

7.72 kB

	"""
	| Description: libf0 YIN implementation
	| Contributors: Sebastian Rosenzweig, Simon Schwär, Edgar Suárez, Meinard Müller
	| License: The MIT license, https://opensource.org/licenses/MIT
	| This file is part of libf0.
	"""
	import numpy as np
	from numba import njit


	def yin(x, Fs=22050, N=2048, H=256, F_min=55.0, F_max=1760.0, threshold=0.15, verbose=False):
	    """
	    Implementation of the YIN algorithm.

	    .. [#] Alain De Cheveigné and Hideki Kawahara.
	        "YIN, a fundamental frequency estimator for speech and music."
	        The Journal of the Acoustical Society of America 111.4 (2002): 1917-1930.

	    The signal is zero-padded by N//2 samples on both sides and cut into frames of
	    N samples with hop size H. For every frame the CMNDF is computed, a first lag
	    is picked by absolute thresholding, and the pick is then refined by a second
	    search restricted to +/- 25 cents around the first estimate.

	    Parameters
	    ----------
	    x : ndarray [shape=(L, )], real-valued
	        Audio signal
	    Fs : int
	        Sampling frequency
	    N : int
	        Window size
	    H : int
	        Hop size
	    F_min : float
	        Minimal frequency
	    F_max : float
	        Maximal frequency
	    threshold : float
	        Threshold for cumulative mean normalized difference function
	    verbose : bool
	        Switch to activate/deactivate status bar

	    Returns
	    -------
	    f0 : ndarray
	        Estimated F0-trajectory
	    t : ndarray
	        Time axis
	    ap: ndarray
	        Aperiodicity (indicator for voicing: the lower, the more reliable the estimate)

	    Raises
	    ------
	    ValueError
	        If F_min > F_max, or if F_min < Fs/N (the window is too short to hold
	        one period of the lowest frequency).
	    """

	    if F_min > F_max:
	        raise ValueError("F_min must be smaller than F_max!")

	    if F_min < Fs/N:
	        raise ValueError(
	            f"The condition (F_min >= Fs/N) was not met. "
	            f"With Fs = {Fs}, N = {N} and F_min = {F_min} you have the following options: "
	            f"\n1) Set F_min >= {np.ceil(Fs/N)} Hz. "
	            f"\n2) Set N >= {np.ceil(Fs/F_min).astype(int)}. "
	            f"\n3) Set Fs <= {np.floor(F_min * N)} Hz."
	        )

	    x_pad = np.concatenate((np.zeros(N//2), x, np.zeros(N//2)))  # Add zeros for centered estimates
	    M = int(np.floor((len(x_pad) - N) / H)) + 1  # Compute number of estimates that will be generated
	    f0 = np.zeros(M)  # Estimated fundamental frequencies (0 for unspecified frames)
	    t = np.arange(M)*H/Fs  # Time axis
	    ap = np.zeros(M)  # Aperiodicity

	    lag_min = max(int(np.ceil(Fs / F_max)), 1)  # lag of maximal frequency in samples
	    lag_max = int(np.ceil(Fs / F_min))  # lag of minimal frequency in samples

	    # Width of the refinement search around the first estimate (loop-invariant)
	    tol_cents = 25

	    for m in range(M):
	        if verbose:
	            print(f"YIN Progress: {np.ceil(100*m/M).astype(int)}%", end='\r')

	        # Take a frame from input signal: frame m starts at sample m*H
	        frame = x_pad[m*H:m*H + N]

	        # Cumulative Mean Normalized Difference Function
	        cmndf = cumulative_mean_normalized_difference_function(frame, lag_max)

	        # Absolute Thresholding
	        lag_est = absolute_thresholding(cmndf, threshold, lag_min, lag_max, parabolic_interp=True)

	        # Refine estimate by constraining search to vicinity of best local estimate (+/- tol_cents),
	        # clamped to the globally admissible lag range
	        lag_min_local = int(np.round(Fs / ((Fs / lag_est) * 2 ** (tol_cents/1200))))
	        if lag_min_local < lag_min:
	            lag_min_local = lag_min
	        lag_max_local = int(np.round(Fs / ((Fs / lag_est) * 2 ** (-tol_cents/1200))))
	        if lag_max_local > lag_max:
	            lag_max_local = lag_max

	        # threshold=np.inf: every value counts as "below threshold", so the first local minimum
	        # inside the narrowed range is selected
	        lag_new = absolute_thresholding(cmndf, threshold=np.inf, lag_min=lag_min_local, lag_max=lag_max_local,
	                                        parabolic_interp=True)

	        # Compute Fundamental Frequency Estimate
	        f0[m] = Fs / lag_new

	        # Compute Aperiodicity
	        ap[m] = aperiodicity(frame, lag_new)

	    return f0, t, ap


	@njit
	def cumulative_mean_normalized_difference_function(frame, lag_max):
	    """
	    Computes Cumulative Mean Normalized Difference Function (CMNDF).

	    For every lag tau in 1..lag_max the squared difference between the frame and
	    its copy shifted by tau is summed up and divided by the running mean of all
	    difference values seen so far. The value at lag 0 is fixed to 1.

	    Parameters
	    ----------
	    frame : ndarray
	        Audio frame
	    lag_max : int
	        Maximum expected lag in the CMNDF

	    Returns
	    -------
	    cmndf : ndarray
	        Cumulative Mean Normalized Difference Function (length lag_max + 1)
	    """

	    eps = np.finfo(np.float64).eps  # guards the division for all-zero frames

	    cmndf = np.zeros(lag_max + 1)
	    cmndf[0] = 1
	    running_mean = 0

	    for tau in range(1, lag_max + 1):
	        # Difference function: compare the frame with itself shifted by tau samples
	        delta = frame[:-tau] - frame[tau:]
	        d_tau = np.sum(delta ** 2)

	        # Incremental update of the mean over d(1), ..., d(tau)
	        running_mean = running_mean*(tau-1)/tau + d_tau/tau

	        cmndf[tau] = d_tau / (running_mean + eps)

	    return cmndf


	def absolute_thresholding(cmndf, threshold, lag_min, lag_max, parabolic_interp=True):
	    """
	    Absolute thresholding:
	    Set an absolute threshold and choose the smallest value of tau that gives a minimum of d' deeper than that
	    threshold. If none is found, the global minimum is chosen instead.

	    The search interval is half-open: lags lag_min, ..., lag_max - 1 are considered.

	    Note: when parabolic interpolation is applied, cmndf[lag] is overwritten in place
	    with the interpolated minimum value, so the caller's array is modified.

	    Parameters
	    ----------
	    cmndf : ndarray
	        Cumulative Mean Normalized Difference Function
	    threshold : float
	        Threshold
	    lag_min : int
	        Minimal lag (used as slice index)
	    lag_max : int
	        Maximal lag (used as slice index, exclusive)
	    parabolic_interp : bool
	        Switch to activate/deactivate parabolic interpolation

	    Returns
	    -------
	    lag : int or float
	        Estimated lag in samples. An integer when the shortcut or the global-minimum
	        fallback is taken or interpolation is disabled; otherwise the integer lag plus
	        the fractional correction from parabolic interpolation.
	    """

	    # take shortcut if search range only allows for one possible lag
	    if lag_min == lag_max:
	        return lag_min

	    # strict local minima of the whole CMNDF (first and last sample can never qualify)
	    local_min_idxs = (np.argwhere((cmndf[1:-1] < cmndf[0:-2]) & (cmndf[1:-1] < cmndf[2:]))).flatten() + 1
	    # lags inside [lag_min:lag_max] whose value lies below the absolute threshold
	    below_thr_idxs = np.argwhere(cmndf[lag_min:lag_max] < threshold).flatten() + lag_min
	    # sorted, unique intersection of both index sets
	    min_idxs = np.intersect1d(local_min_idxs, below_thr_idxs)

	    # if no local minima below threshold are found, return global minimum
	    if not min_idxs.size:
	        return np.argmin(cmndf[lag_min:lag_max]) + lag_min

	    # min_idxs is sorted ascending, so the first entry is the smallest qualifying lag
	    lag = min_idxs[0]

	    # Optional: Parabolic Interpolation of local minima
	    if parabolic_interp:
	        lag_corr, cmndf[lag] = parabolic_interpolation(cmndf[lag-1], cmndf[lag], cmndf[lag+1])
	        lag += lag_corr

	    return lag


	@njit
	def parabolic_interpolation(y1, y2, y3):
	    """
	    Fit a parabola through three equally spaced samples and return its extremum.
	    The middle value y2 is assumed to be the extremal sample of the three.

	    Parameters
	    ----------
	    y1: f(x1)
	    y2: f(x2)
	    y3: f(x3)

	    Returns
	    -------
	    x_interp: Interpolated x-value (offset from x2, in units of the sample spacing x3-x2)
	    y_interp: Interpolated y-value, f(x_interp)
	    """

	    # Quadratic coefficient; the added eps keeps the divisions finite for collinear samples
	    curvature = np.finfo(np.float64).eps + (y1 + y3 - 2 * y2) / 2
	    # Linear coefficient (central difference)
	    slope = (y3 - y1) / 2

	    # Vertex of the fitted parabola
	    return -slope / (2 * curvature), y2 - (slope ** 2) / (4 * curvature)


	def aperiodicity(frame, lag_est):
	    """
	    Compute aperiodicity of given frame (serves as indicator for reliability or voicing detection).

	    The frame is compared with a copy of itself shifted by the estimated lag. The
	    result is the residual power of the difference divided by the average power of
	    the original and the shifted frame.

	    Parameters
	    ----------
	    frame : ndarray
	        Frame
	    lag_est : float
	        Estimated lag

	    Returns
	    -------
	    ap: float
	        Aperiodicity (the lower, the more reliable the estimate)
	    """

	    n = len(frame)
	    lag_int = int(np.floor(lag_est))  # uncorrected period estimate
	    frac = lag_est - lag_int  # residual

	    # Pad frame to ensure constant size
	    frame_pad = np.concatenate((frame, np.flip(frame)))  # mirror padding

	    # Shift frame by estimated period
	    frame_shift = frame_pad[lag_int:lag_int + n]
	    if frac != 0:
	        # linear interpolation between adjacent shifts
	        frame_shift = (1 - frac) * frame_shift + \
	                      frac * frame_pad[lag_int + 1:lag_int + 1 + n]

	    pwr = (np.mean(frame ** 2) + np.mean(frame_shift ** 2)) / 2  # average power over fixed and shifted frame
	    res = np.mean((frame - frame_shift) ** 2) / 2  # residual power
	    ap = res / (pwr + np.finfo(np.float64).eps)  # eps avoids 0/0 for silent frames

	    return ap