import warnings

import librosa
import numpy as np
import resampy
import torch

import crepe


###############################################################################
# Constants
###############################################################################


# Floor applied to the weighted spectrum, in decibels
MIN_DB = -100.

# Reference level subtracted from the A-weighting curve, in decibels
REF_DB = 20.


###############################################################################
# A-weighted loudness
###############################################################################


def a_weighted(audio, sample_rate, hop_length=None, pad=True):
    """Compute A-weighted loudness for every analysis frame.

    Arguments
        audio
            Torch tensor holding the signal. Its leading dimension is removed
            with ``squeeze(0)`` after conversion to numpy, so that dimension
            must have size 1 (presumably shape (1, samples) - confirm with
            callers).
        sample_rate
            Sampling rate of ``audio``. When it differs from
            ``crepe.SAMPLE_RATE`` the signal is resampled and the hop length
            is rescaled accordingly.
        hop_length
            Number of samples between frames at ``sample_rate``. ``None``
            selects ``sample_rate // 100`` (10 ms).
        pad
            Forwarded to ``librosa.stft`` as ``center``; padding uses
            ``pad_mode='constant'``.

    Returns
        Float tensor on the same device as ``audio`` containing the
        frequency-averaged, A-weighted level of each frame, with a leading
        dimension of size 1 prepended.
    """
    # Remember where the input lives so the result can be moved back there
    target_device = audio.device

    # Fall back to a 10 ms hop when the caller did not choose one
    if hop_length is None:
        hop_length = sample_rate // 100

    # librosa and resampy operate on numpy arrays
    signal = audio.detach().cpu().numpy().squeeze(0)

    # Bring the signal (and the hop, in proportion) to the model sample rate
    if sample_rate != crepe.SAMPLE_RATE:
        signal = resampy.resample(signal, sample_rate, crepe.SAMPLE_RATE)
        hop_length = int(hop_length * crepe.SAMPLE_RATE / sample_rate)

    # The weighting curve is computed once and stored on the function object
    try:
        weights = a_weighted.weights
    except AttributeError:
        weights = a_weighted.weights = perceptual_weights()

    # Short-time Fourier transform
    spectrum = librosa.stft(
        signal,
        n_fft=crepe.WINDOW_SIZE,
        hop_length=hop_length,
        win_length=crepe.WINDOW_SIZE,
        center=pad,
        pad_mode='constant')

    # Magnitude in decibels, offset by the per-bin perceptual weights
    magnitude_db = librosa.amplitude_to_db(np.abs(spectrum))
    loudness_db = magnitude_db + weights

    # Clamp everything below the floor up to MIN_DB
    loudness_db = np.maximum(loudness_db, MIN_DB)

    # Collapse the frequency axis to obtain one value per frame
    per_frame = torch.from_numpy(loudness_db.mean(axis=0))
    return per_frame.float().to(target_device).unsqueeze(0)


def perceptual_weights():
    """Build the A-weighting curve used by ``a_weighted``.

    Returns
        Column array (one row per FFT bin of a ``crepe.WINDOW_SIZE``-point
        transform at ``crepe.SAMPLE_RATE``) holding the A-weighting in
        decibels with ``REF_DB`` subtracted.
    """
    bin_frequencies = librosa.fft_frequencies(
        sr=crepe.SAMPLE_RATE,
        n_fft=crepe.WINDOW_SIZE)

    # librosa raises a RuntimeWarning for nearly inaudible frequencies and
    # substitutes a floor value for them. That floor is fine for our
    # purposes, so the warning is silenced.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        curve = librosa.A_weighting(bin_frequencies)

    return curve[:, None] - REF_DB