File size: 7,715 Bytes
9791162 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 |
"""
| Description: libf0 YIN implementation
| Contributors: Sebastian Rosenzweig, Simon Schwär, Edgar Suárez, Meinard Müller
| License: The MIT license
| This file is part of libf0.
"""
import numpy as np
from numba import njit
def yin(x, Fs=22050, N=2048, H=256, F_min=55.0, F_max=1760.0, threshold=0.15, verbose=False):
    """
    Implementation of the YIN algorithm.

    .. [#] Alain De Cheveigné and Hideki Kawahara.
        "YIN, a fundamental frequency estimator for speech and music."
        The Journal of the Acoustical Society of America 111.4 (2002): 1917-1930.

    Parameters
    ----------
    x : ndarray [shape=(L, )], real - valued
        Audio signal
    Fs : int
        Sampling frequency
    N : int
        Window size
    H : int
        Hop size
    F_min : float
        Minimal frequency
    F_max : float
        Maximal frequency
    threshold : float
        Threshold for cumulative mean normalized difference function
    verbose : bool
        Switch to activate/deactivate status bar

    Returns
    -------
    f0 : ndarray
        Estimated F0-trajectory
    t : ndarray
        Time axis
    ap: ndarray
        Aperiodicity (indicator for voicing: the lower, the more reliable the estimate)

    Raises
    ------
    ValueError
        If ``F_min > F_max`` or if ``F_min < Fs/N`` (the window is too short to hold one period
        of the lowest frequency).
    """
    if F_min > F_max:
        raise ValueError("F_min must be smaller than F_max!")

    if F_min < Fs/N:
        raise ValueError(f"The condition (F_min >= Fs/N) was not met. With Fs = {Fs}, N = {N} and F_min = {F_min} you have the following options: \n1) Set F_min >= {np.ceil(Fs/N)} Hz. \n2) Set N >= {np.ceil(Fs/F_min).astype(int)}. \n3) Set Fs <= {np.floor(F_min * N)} Hz.")

    x_pad = np.concatenate((np.zeros(N//2), x, np.zeros(N//2)))  # Add zeros for centered estimates
    M = int(np.floor((len(x_pad) - N) / H)) + 1  # Compute number of estimates that will be generated
    f0 = np.zeros(M)  # Estimated fundamental frequencies (0 for unspecified frames)
    t = np.arange(M)*H/Fs  # Time axis
    ap = np.zeros(M)  # Aperiodicity

    lag_min = max(int(np.ceil(Fs / F_max)), 1)  # lag of maximal frequency in samples
    lag_max = int(np.ceil(Fs / F_min))  # lag of minimal frequency in samples

    # Tolerance of the local refinement search (+/- 25 cents); the two frequency
    # ratios do not depend on the frame, so they are computed once.
    tol_cents = 25
    ratio_up = 2 ** (tol_cents/1200)
    ratio_down = 2 ** (-tol_cents/1200)

    for m in range(M):
        if verbose:
            print(f"YIN Progress: {np.ceil(100*m/M).astype(int)}%", end='\r')

        # Take a frame from input signal
        frame = x_pad[m*H:m*H + N]

        # Cumulative Mean Normalized Difference Function
        cmndf = cumulative_mean_normalized_difference_function(frame, lag_max)

        # Absolute Thresholding
        lag_est = absolute_thresholding(cmndf, threshold, lag_min, lag_max, parabolic_interp=True)

        # Refine estimate by constraining search to vicinity of best local estimate,
        # clipped to the global lag range
        lag_min_local = max(int(np.round(Fs / ((Fs / lag_est) * ratio_up))), lag_min)
        lag_max_local = min(int(np.round(Fs / ((Fs / lag_est) * ratio_down))), lag_max)

        # NOTE(review): the continuation line of this call is missing from the reviewed source.
        # parabolic_interp=True equals the callee's default, i.e. what simply closing the call
        # would do - TODO confirm against upstream.
        lag_new = absolute_thresholding(cmndf, threshold=np.inf, lag_min=lag_min_local, lag_max=lag_max_local,
                                        parabolic_interp=True)

        # Compute Fundamental Frequency Estimate
        f0[m] = Fs / lag_new

        # Compute Aperiodicity
        ap[m] = aperiodicity(frame, lag_new)

    return f0, t, ap
def cumulative_mean_normalized_difference_function(frame, lag_max):
    """
    Computes Cumulative Mean Normalized Difference Function (CMNDF).

    Parameters
    ----------
    frame : ndarray
        Audio frame
    lag_max : int
        Maximum expected lag in the CMNDF

    Returns
    -------
    cmndf : ndarray
        Cumulative Mean Normalized Difference Function, length ``lag_max + 1``; entry 0 is fixed to 1
    """
    eps = np.finfo(np.float64).eps  # guards the division below; constant, so looked up once

    cmndf = np.zeros(lag_max+1)  # Initialize CMNDF
    cmndf[0] = 1
    diff_mean = 0.0

    for tau in range(1, lag_max+1):
        # Difference function: squared difference between the frame and its copy shifted by tau
        diff = np.sum((frame[0:-tau] - frame[0 + tau:]) ** 2)

        # Iterative mean of the difference function over lags 1..tau
        diff_mean = diff_mean*(tau-1)/tau + diff/tau

        cmndf[tau] = diff / (diff_mean + eps)

    return cmndf
def absolute_thresholding(cmndf, threshold, lag_min, lag_max, parabolic_interp=True):
    """
    Absolute thresholding:
    Set an absolute threshold and choose the smallest value of tau that gives a minimum of d' deeper than that
    threshold. If none is found, the global minimum is chosen instead.

    Parameters
    ----------
    cmndf : ndarray
        Cumulative Mean Normalized Difference Function. When parabolic interpolation is applied, the entry at
        the selected lag is overwritten in place with the interpolated minimum value.
    threshold : float
        Threshold for cumulative mean normalized difference function
    lag_min : int
        Minimal lag (inclusive; used as a slice index)
    lag_max : int
        Maximal lag (exclusive in the search, because it is used as a slice end)
    parabolic_interp : bool
        Switch to activate/deactivate parabolic interpolation

    Returns
    -------
    lag : int or float
        Selected lag; fractional if parabolic interpolation refined a local minimum
    """
    # take shortcut if search range only allows for one possible lag
    if lag_min == lag_max:
        return lag_min

    # Strict local minima below the absolute threshold, restricted to [lag_min, lag_max).
    # A single vectorised mask replaces the pairwise comparison of two index lists.
    # NOTE(review): the module imports numba.njit; only plain NumPy operations are used here so the
    # function stays a candidate for JIT compilation - TODO confirm numba support if it is decorated.
    inner = cmndf[1:-1]
    is_candidate = (inner < cmndf[0:-2]) & (inner < cmndf[2:]) & (inner < threshold)
    candidates = np.flatnonzero(is_candidate) + 1  # +1: `inner` starts at index 1 of cmndf
    candidates = candidates[(candidates >= lag_min) & (candidates < lag_max)]

    # if no local minima below threshold are found, return global minimum
    if not candidates.size:
        return np.argmin(cmndf[lag_min:lag_max]) + lag_min

    # candidates are in ascending order, so the first entry is the smallest qualifying lag
    lag = candidates[0]

    # Optional: Parabolic Interpolation of local minima
    if parabolic_interp:
        lag_corr, cmndf[lag] = parabolic_interpolation(cmndf[lag-1], cmndf[lag], cmndf[lag+1])
        lag = lag + lag_corr

    return lag
def parabolic_interpolation(y1, y2, y3):
    """
    Parabolic interpolation of an extremal value given three samples with equal spacing on the x-axis.
    The middle value y2 is assumed to be the extremal sample of the three.

    Parameters
    ----------
    y1: f(x1)
    y2: f(x2)
    y3: f(x3)

    Returns
    -------
    x_interp: Interpolated x-value (relative to x3-x2)
    y_interp: Interpolated y-value, f(x_interp)
    """
    # Coefficients of the parabola a*x^2 + b*x + y2 through the three samples (x2 taken as origin).
    # The machine epsilon keeps `a` away from zero when the samples are collinear.
    a = np.finfo(np.float64).eps + (y1 + y3 - 2 * y2) / 2
    b = (y3 - y1) / 2

    # Vertex of the parabola
    x_interp = -b / (2 * a)
    y_interp = y2 - (b ** 2) / (4 * a)

    return x_interp, y_interp
def aperiodicity(frame, lag_est):
    """
    Compute aperiodicity of given frame (serves as indicator for reliability or voicing detection).

    Parameters
    ----------
    frame : ndarray
        Audio frame
    lag_est : float
        Estimated lag

    Returns
    -------
    ap: float
        Aperiodicity (the lower, the more reliable the estimate)
    """
    lag_int = int(np.floor(lag_est))  # uncorrected period estimate
    frac = lag_est - lag_int  # residual

    # Pad frame to ensure constant size of the shifted frame
    frame_pad = np.concatenate((frame, np.flip(frame)))  # mirror padding

    # Shift frame by estimated period
    if frac == 0:
        frame_shift = frame_pad[lag_int:lag_int+len(frame)]
    else:
        # linear interpolation between adjacent shifts
        frame_shift = (1 - frac) * frame_pad[lag_int:lag_int+len(frame)] + \
                      frac * frame_pad[lag_int+1:lag_int+1+len(frame)]

    pwr = (np.mean(frame ** 2) + np.mean(frame_shift ** 2)) / 2  # average power over fixed and shifted frame
    res = np.mean((frame - frame_shift) ** 2) / 2  # residual power
    ap = res / (pwr + np.finfo(np.float64).eps)  # epsilon avoids division by zero for silent frames

    return ap