import os
import argparse
import csv
import json
import glob
from typing import Any, Optional, Union, Collection

import tqdm
import numpy as np
import librosa
from librosa.core.spectrum import _spectrogram
import musdb
import essentia
import essentia.standard
import pyloudnorm as pyln

from utils import str2bool, db2linear


def spectral_crest(
    *,
    y: Optional[np.ndarray] = None,
    S: Optional[np.ndarray] = None,
    n_fft: int = 2048,
    hop_length: int = 512,
    win_length: Optional[int] = None,
    window: str = "hann",
    center: bool = True,
    pad_mode: str = "constant",
    amin: float = 1e-10,
    power: float = 2.0,
) -> np.ndarray:
    """Compute spectral crest

    Spectral crest (or tonality coefficient) is a measure of the ratio of
    the maximum of the spectrum to the arithmetic mean of the spectrum.

    A higher spectral crest => more tonality,
    a lower spectral crest => more noisy.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        audio time series. Multi-channel is supported.
    S : np.ndarray [shape=(..., d, t)] or None
        (optional) pre-computed spectrogram magnitude
    n_fft : int > 0 [scalar]
        FFT window size
    hop_length : int > 0 [scalar]
        hop length for STFT. See `librosa.stft` for details.
    win_length : int <= n_fft [scalar]
        Each frame of audio is windowed by `window()`.
        The window will be of length `win_length` and then padded
        with zeros to match ``n_fft``.
        If unspecified, defaults to ``win_length = n_fft``.
    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a vector or array of length ``n_fft``

        .. see also:: `librosa.filters.get_window`
    center : boolean
        - If `True`, the signal ``y`` is padded so that frame
          ``t`` is centered at ``y[t * hop_length]``.
        - If `False`, then frame `t` begins at ``y[t * hop_length]``
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the
        signal. By default, STFT uses zero padding.
    amin : float > 0 [scalar]
        minimum threshold for ``S`` (=added noise floor for numerical
        stability)
    power : float > 0 [scalar]
        Exponent applied to the magnitude spectrogram before the crest is
        computed, e.g., 1 for magnitude, 2 for power.

    Returns
    -------
    crest : np.ndarray [shape=(..., 1, t)]
        spectral crest for each frame.

    Raises
    ------
    ValueError
        If ``amin`` is not strictly positive.
    """
    # The floor keeps the arithmetic mean away from zero; without it a
    # silent frame would produce 0 / 0.
    if amin <= 0:
        raise ValueError("amin must be strictly positive")

    # NOTE: `_spectrogram` is a private librosa helper. It is always asked
    # for the magnitude (power=1.0); the caller's `power` is applied below.
    S, _ = _spectrogram(
        y=y,
        S=S,
        n_fft=n_fft,
        hop_length=hop_length,
        power=1.0,
        win_length=win_length,
        window=window,
        center=center,
        pad_mode=pad_mode,
    )

    S_thresh = np.maximum(amin, S**power)
    # Reduce over the frequency axis (-2), keeping it as a length-1 axis.
    peak = np.max(S_thresh, axis=-2, keepdims=True)
    amean = np.mean(S_thresh, axis=-2, keepdims=True)
    crest: np.ndarray = peak / amean
    return crest


parser = argparse.ArgumentParser(description="model test.py")

parser.add_argument(
    "--target",
    type=str,
    default="all",
    help="target source. all, vocals, drums, bass, other",
)
parser.add_argument("--root", type=str, default="/path/to/musdb18hq_loudnorm")
parser.add_argument("--exp_name", type=str, default="delimit_6_s")
parser.add_argument(
    "--output_directory",
    type=str,
    default="/path/to/results",
)
parser.add_argument(
    "--calc_results",
    type=str2bool,
    default=True,
    help="calculate results or musdb-hq or musdb-XL test dataset",
)

args, _ = parser.parse_known_args()

args.sample_rate = 44100
args.test_output_dir = f"{args.output_directory}/test/{args.exp_name}"

# Choose which files to analyse: the experiment's outputs, or the reference
# dataset under --root (where the "all" target is stored as mixture.wav).
if args.calc_results:
    track_pattern = f"{args.test_output_dir}/*/{args.target}.wav"
elif args.target == "all":
    track_pattern = f"{args.root}/*/mixture.wav"
else:
    track_pattern = f"{args.root}/*/{args.target}.wav"

# glob returns files in arbitrary order; sort so that the JSON output is
# reproducible between runs.
track_list = sorted(glob.glob(track_pattern))
if not track_list:
    # Abort rather than print NaN means and overwrite the score file with {}.
    raise SystemExit(f"No tracks found matching {track_pattern}")

dynamic_complexity = essentia.standard.DynamicComplexity()
loudness_range = essentia.standard.LoudnessEBUR128()
spectral_centroid = essentia.standard.SpectralCentroidTime()

dict_song_score = {}
# Raw per-track values, keyed by the label printed in the summary below.
summary_values = {
    "rms": [],
    "crest_factor": [],
    "dynamic_complexity_score": [],
    "lra_score": [],
    "sc_hertz": [],
    "sf_score": [],
    "spectral_crest_score": [],
}

for track in tqdm.tqdm(track_list):
    # Each track lives in its own directory; the directory name is the song.
    audio_name = os.path.basename(os.path.dirname(track))

    gt_source_librosa = librosa.load(track, sr=args.sample_rate, mono=False)[
        0
    ]  # (nb_channels, nb_samples)
    gt_source_librosa_mono = librosa.to_mono(gt_source_librosa)  # (nb_samples)

    gt_source_essentia = essentia.standard.AudioLoader(filename=track)()[
        0
    ]  # (nb_samples, nb_channels)
    gt_source_essentia_cat = np.concatenate(
        [gt_source_essentia[:, 0], gt_source_essentia[:, 1]]
    )  # (nb_samples * nb_channels)
    gt_source_essentia_mono = np.mean(gt_source_essentia, axis=1)  # (nb_samples)

    # Level statistics are taken over both channels' samples together.
    rms = np.sqrt(np.mean(gt_source_essentia_cat**2))
    crest_factor = np.max(np.abs(gt_source_essentia_cat)) / rms

    dc_score, _ = dynamic_complexity(gt_source_essentia_mono)
    _, _, _, lra_score = loudness_range(gt_source_essentia)
    sc_hertz = spectral_centroid(gt_source_essentia_mono)

    # librosa >= 0.10 accepts the signal only as the keyword argument `y`.
    sf_score = np.mean(librosa.feature.spectral_flatness(y=gt_source_librosa_mono))
    spectral_crest_score = np.mean(spectral_crest(y=gt_source_librosa_mono))

    # Plain floats so that the dictionary is JSON-serialisable.
    dict_song_score[audio_name] = {
        "rms": float(rms),
        "crest_factor": float(crest_factor),
        "dynamic_complexity_score": float(dc_score),
        "lra_score": float(lra_score),
        "spectral_centroid_hertz": float(sc_hertz),
        "spectral_flatness_score": float(sf_score),
        "spectral_crest_score": float(spectral_crest_score),
    }

    summary_values["rms"].append(rms)
    summary_values["crest_factor"].append(crest_factor)
    summary_values["dynamic_complexity_score"].append(dc_score)
    summary_values["lra_score"].append(lra_score)
    summary_values["sc_hertz"].append(sc_hertz)
    summary_values["sf_score"].append(sf_score)
    summary_values["spectral_crest_score"].append(spectral_crest_score)

if args.calc_results:
    print(f"{args.exp_name} on {args.target}")
else:
    print(f"{os.path.basename(args.root)} on {args.target}")

# Dataset-level summary: mean of every feature over all tracks.
for label, values in summary_values.items():
    print(f"{label}: {np.mean(values)}")

# save dict_song_score to json file
if args.target == "all":
    file_name = "score_features"
else:
    file_name = f"score_feature_{args.target}"

# Scores are written next to the audio they describe.
score_dir = args.test_output_dir if args.calc_results else args.root
with open(f"{score_dir}/{file_name}.json", "w") as f:
    json.dump(dict_song_score, f, indent=4)