Spaces:

jeonchangbin49
/

De-limiter

Running

App Files Files Community

De-limiter / eval_delimit /score_features.py

jeonchangbin49

first commit

a00b67a over 1 year ago

raw

history blame contribute delete

7.42 kB

	import os
	import argparse
	import csv
	import json
	import glob
	from typing import Any, Optional, Union, Collection

	import tqdm
	import numpy as np
	import librosa
	from librosa.core.spectrum import _spectrogram
	import musdb
	import essentia
	import essentia.standard
	import pyloudnorm as pyln

	from utils import str2bool, db2linear


	def spectral_crest(
	*,
	y: Optional[np.ndarray] = None,
	S: Optional[np.ndarray] = None,
	n_fft: int = 2048,
	hop_length: int = 512,
	win_length: Optional[int] = None,
	window: str = "hann",
	center: bool = True,
	pad_mode: str = "constant",
	amin: float = 1e-10,
	power: float = 2.0,
	) -> np.ndarray:
	"""Compute spectral crest

	Spectral crest (or tonality coefficient) is a measure of
	the ratio of the maximum of the spectrum to the arithmetic mean of the spectrum

	A higher spectral crest => more tonality,
	A lower spectral crest => more noisy.


	Parameters
	----------
	y : np.ndarray [shape=(..., n)] or None
	audio time series. Multi-channel is supported.
	S : np.ndarray [shape=(..., d, t)] or None
	(optional) pre-computed spectrogram magnitude
	n_fft : int > 0 [scalar]
	FFT window size
	hop_length : int > 0 [scalar]
	hop length for STFT. See `librosa.stft` for details.
	win_length : int <= n_fft [scalar]
	Each frame of audio is windowed by `window()`.
	The window will be of length `win_length` and then padded
	with zeros to match ``n_fft``.
	If unspecified, defaults to ``win_length = n_fft``.
	window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
	- a window specification (string, tuple, or number);
	see `scipy.signal.get_window`
	- a window function, such as `scipy.signal.windows.hann`
	- a vector or array of length ``n_fft``
	.. see also:: `librosa.filters.get_window`
	center : boolean
	- If `True`, the signal ``y`` is padded so that frame
	``t`` is centered at ``y[t * hop_length]``.
	- If `False`, then frame `t` begins at ``y[t * hop_length]``
	pad_mode : string
	If ``center=True``, the padding mode to use at the edges of the signal.
	By default, STFT uses zero padding.
	amin : float > 0 [scalar]
	minimum threshold for ``S`` (=added noise floor for numerical stability)
	power : float > 0 [scalar]
	Exponent for the magnitude spectrogram.
	e.g., 1 for energy, 2 for power, etc.
	Power spectrogram is usually used for computing spectral flatness.

	Returns
	-------
	crest : np.ndarray [shape=(..., 1, t)]
	spectral crest for each frame.


	"""

	S, n_fft = _spectrogram(
	y=y,
	S=S,
	n_fft=n_fft,
	hop_length=hop_length,
	power=1.0,
	win_length=win_length,
	window=window,
	center=center,
	pad_mode=pad_mode,
	)

	S_thresh = np.maximum(amin, S**power)
	# gmean = np.exp(np.mean(np.log(S_thresh), axis=-2, keepdims=True))
	gmax = np.max(S_thresh, axis=-2, keepdims=True)
	amean = np.mean(S_thresh, axis=-2, keepdims=True)
	crest: np.ndarray = gmax / amean
	return crest


	parser = argparse.ArgumentParser(description="model test.py")

	parser.add_argument(
	"--target",
	type=str,
	default="all",
	help="target source. all, vocals, drums, bass, other",
	)
	parser.add_argument(
	"--root", type=str, default="/path/to/musdb18hq_loudnorm"
	)
	parser.add_argument("--exp_name", type=str, default="delimit_6_s")
	parser.add_argument(
	"--output_directory",
	type=str,
	default="/path/to/results",
	)
	parser.add_argument(
	"--calc_results",
	type=str2bool,
	default=True,
	help="calculate results or musdb-hq or musdb-XL test dataset",
	)


	args, _ = parser.parse_known_args()

	args.sample_rate = 44100

	args.test_output_dir = f"{args.output_directory}/test/{args.exp_name}"

	if args.calc_results:
	track_list = glob.glob(
	f"{args.output_directory}/test/{args.exp_name}/*/{args.target}.wav"
	)
	else:
	if args.target == "all":
	track_list = glob.glob(f"{args.root}/*/mixture.wav")
	else:
	track_list = glob.glob(f"{args.root}/*/{args.target}.wav")

	i = 0


	dynamic_complexity = essentia.standard.DynamicComplexity()
	loudness_range = essentia.standard.LoudnessEBUR128()
	spectral_centroid = essentia.standard.SpectralCentroidTime()
	crest = essentia.standard.Crest()
	dynamic_spread = essentia.standard.DistributionShape()
	central_moments = essentia.standard.CentralMoments()

	dict_song_score = {}
	list_rms = []
	list_crest_factor = []
	list_dc_score = []
	list_lra_score = []
	list_sc_hertz = []
	list_sf_score = []
	list_spectral_crest_score = []

	for track in tqdm.tqdm(track_list):
	audio_name = os.path.basename(os.path.dirname(track))
	gt_source_librosa = librosa.load(f"{track}", sr=args.sample_rate, mono=False)[
	0
	] # (nb_channels, nb_samples)
	gt_source_librosa_mono = librosa.to_mono(gt_source_librosa) # (nb_samples)

	gt_source_essentia = essentia.standard.AudioLoader(filename=f"{track}")()[
	0
	] # (nb_samples, nb_channels)
	gt_source_essentia_cat = np.concatenate(
	[gt_source_essentia[:, 0], gt_source_essentia[:, 1]]
	) # (nb_samples * nb_channels)
	gt_source_essentia_mono = np.mean(gt_source_essentia, axis=1) # (nb_samples)

	rms = np.sqrt(np.mean(gt_source_essentia_cat**2))
	crest_factor = np.max(np.abs(gt_source_essentia_cat)) / rms

	dc_score, _ = dynamic_complexity(gt_source_essentia_mono)
	_, _, _, lra_score = loudness_range(gt_source_essentia)
	sc_hertz = spectral_centroid(gt_source_essentia_mono)
	sf_score = np.mean(librosa.feature.spectral_flatness(gt_source_librosa_mono))
	spectral_crest_score = np.mean(spectral_crest(y=gt_source_librosa_mono))

	dict_song_score[audio_name] = {
	"rms": float(rms),
	"crest_factor": float(crest_factor),
	"dynamic_complexity_score": float(dc_score),
	"lra_score": float(lra_score),
	"spectral_centroid_hertz": float(sc_hertz),
	"spectral_flatness_score": float(sf_score),
	"spectral_crest_score": float(spectral_crest_score),
	}
	list_rms.append(rms)
	list_crest_factor.append(crest_factor)
	list_dc_score.append(dc_score)
	list_lra_score.append(lra_score)
	list_sc_hertz.append(sc_hertz)
	list_sf_score.append(sf_score)
	list_spectral_crest_score.append(spectral_crest_score)

	i += 1

	if args.calc_results:
	print(f"{args.exp_name} on {args.target}")
	else:
	print(f"{os.path.basename(args.root)} on {args.target}")
	print(f"rms: {np.mean(list_rms)}")
	print(f"crest_factor: {np.mean(list_crest_factor)}")
	print(f"dynamic_complexity_score: {np.mean(list_dc_score)}")
	print(f"lra_score: {np.mean(list_lra_score)}")
	print(f"sc_hertz: {np.mean(list_sc_hertz)}")
	print(f"sf_score: {np.mean(list_sf_score)}")
	print(f"spectral_crest_score: {np.mean(list_spectral_crest_score)}")


	# save dict_song_score to json file
	if args.target == "all":
	file_name = "score_features"
	else:
	file_name = f"score_feature_{args.target}"
	if args.calc_results:
	with open(
	f"{args.output_directory}/test/{args.exp_name}/{file_name}.json", "w"
	) as f:
	json.dump(dict_song_score, f, indent=4)
	else:
	with open(f"{args.root}/{file_name}.json", "w") as f:
	json.dump(dict_song_score, f, indent=4)