File size: 6,807 Bytes
e34aada |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
import librosa
import numpy as np
import pyloudnorm as pyln
import torch
from scipy.signal import get_window
from utils.audio.dct import dct
from utils.audio.vad import trim_long_silences
def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
    """Compute padding so ``len(x)`` covers a whole number of hop-aligned frames.

    The total pad extends the signal to the next multiple of ``fshift``
    (always at least one extra frame, even when already aligned).

    Args:
        x: 1-D waveform array; only ``x.shape[0]`` is used.
        fsize: frame/FFT size — currently unused, kept for interface compatibility.
        fshift: hop size in samples between successive frames.
        pad_sides: 1 to put all padding on the right; 2 to split it between
            both sides (the right side gets the odd extra sample).

    Returns:
        Tuple ``(left_pad, right_pad)`` in samples.
    """
    assert pad_sides in (1, 2)
    pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
    if pad_sides == 1:
        return 0, pad
    return pad // 2, pad // 2 + pad % 2
def amp_to_db(x):
    """Convert linear amplitude to decibels, flooring amplitudes at 1e-5 (-100 dB)."""
    floored = np.maximum(1e-5, x)
    return 20 * np.log10(floored)
def db_to_amp(x):
    """Convert decibels back to linear amplitude (inverse of ``amp_to_db``)."""
    exponent = x * 0.05
    return 10.0 ** exponent
def normalize(S, min_level_db):
    """Map dB values from [min_level_db, 0] onto [0, 1] (min_level_db is negative)."""
    shifted = S - min_level_db
    return shifted / (-min_level_db)
def denormalize(D, min_level_db):
    """Map [0, 1] values back to dB in [min_level_db, 0] (inverse of ``normalize``)."""
    scaled = D * -min_level_db
    return scaled + min_level_db
def librosa_wav2spec(wav_path,
                     fft_size=None,
                     hop_size=256,
                     win_length=1024,
                     window="hann",
                     num_mels=80,
                     fmin=80,
                     fmax=-1,
                     eps=1e-6,
                     sample_rate=22050,
                     loud_norm=False,
                     trim_long_sil=False,
                     center=True):
    """Compute log10 mel and linear spectrograms from a wav file or array.

    Args:
        wav_path: path to an audio file, or an already-loaded 1-D waveform.
        fft_size: FFT size; defaults to ``win_length`` when None.
        hop_size: hop between STFT frames, in samples.
        win_length: analysis window length, in samples.
        window: window name passed to librosa's STFT.
        num_mels: number of mel filterbank channels.
        fmin: lowest mel-filter frequency; -1 means 0 Hz.
        fmax: highest mel-filter frequency; -1 means Nyquist (sample_rate / 2).
        eps: floor applied before log10 to avoid log(0).
        sample_rate: target sampling rate for loading.
        loud_norm: if True, loudness-normalize to -16 LUFS (BS.1770).
        trim_long_sil: if True (path input only), load via VAD-based silence trimming.
        center: whether the STFT pads so frames are centered on their timestamps.

    Returns:
        Dict with 'wav' (waveform aligned to mel frames when ``center``),
        'mel' (T, num_mels), 'linear' (T, fft_size // 2 + 1), and 'mel_basis'.
    """
    if isinstance(wav_path, str):
        if trim_long_sil:
            wav, _, _ = trim_long_silences(wav_path, sample_rate)
        else:
            # librosa.core was deprecated and removed in librosa 0.10; use the
            # top-level alias.
            wav, _ = librosa.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    if fft_size is None:
        fft_size = win_length

    if loud_norm:
        meter = pyln.Meter(sample_rate)  # create BS.1770 loudness meter
        loudness = meter.integrated_loudness(wav)
        wav = pyln.normalize.loudness(wav, loudness, -16.0)
        # Loudness normalization may push samples past full scale; rescale.
        if np.abs(wav).max() > 1:
            wav = wav / np.abs(wav).max()

    # Amplitude spectrogram, shape (n_bins, T).
    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window, center=center)
    linear_spc = np.abs(x_stft)

    # Mel filterbank; -1 sentinels mean "full band".
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels,
                                    fmin=fmin, fmax=fmax)

    # Log10 mel spectrogram, shape (num_mels, T).
    mel = mel_basis @ linear_spc
    mel = np.log10(np.maximum(eps, mel))

    if center:
        # Pad, then truncate, so len(wav) == n_mel_frames * hop_size exactly.
        l_pad, r_pad = librosa_pad_lr(wav, fft_size, hop_size, 1)
        wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
        wav = wav[:mel.shape[1] * hop_size]

    # Log10 linear spectrogram.
    linear_spc = np.log10(np.maximum(eps, linear_spc))
    return {'wav': wav, 'mel': mel.T, 'linear': linear_spc.T, 'mel_basis': mel_basis}
def librosa_wav2mfcc(wav_path,
                     fft_size=None,
                     hop_size=256,
                     win_length=1024,
                     window="hann",
                     num_mels=80,
                     fmin=80,
                     fmax=-1,
                     sample_rate=22050,
                     center=True):
    """Compute 13 MFCCs from a wav file or an already-loaded waveform.

    Args:
        wav_path: path to an audio file, or a 1-D waveform array.
        fft_size: FFT size; defaults to ``win_length`` when None (matching
            ``librosa_wav2spec``).
        hop_size: hop between frames, in samples.
        win_length: analysis window length, in samples.
        window: window name for the underlying STFT.
        num_mels: number of mel channels used before the DCT.
        fmin: lowest mel-filter frequency; -1 means 0 Hz.
        fmax: highest mel-filter frequency; -1 means Nyquist (sample_rate / 2).
        sample_rate: target sampling rate for loading.
        center: whether frames are centered on their timestamps.

    Returns:
        MFCC array of shape (T, 13).
    """
    if isinstance(wav_path, str):
        # librosa.core was deprecated and removed in librosa 0.10.
        wav, _ = librosa.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    # Resolve sentinel defaults the same way the sibling spec functions do;
    # previously fft_size=None and fmax=-1 were passed through to librosa as-is.
    if fft_size is None:
        fft_size = win_length
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax

    mfcc = librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=13,
                                n_fft=fft_size, n_mels=num_mels, fmin=fmin, fmax=fmax,
                                hop_length=hop_size,
                                win_length=win_length, window=window, center=center)
    return mfcc.T
def torch_wav2spec(wav,
                   mel_basis,
                   fft_size=1024,
                   hop_size=256,
                   win_length=1024,
                   eps=1e-6):
    """Compute batched log10 mel spectrograms on torch tensors.

    Args:
        wav: batched waveform tensor — presumably (B, T); TODO confirm with callers.
        mel_basis: mel filterbank of shape (n_mels, fft_size // 2 + 1),
            convertible to a float tensor.
        fft_size: FFT size for the STFT.
        hop_size: hop between frames, in samples.
        win_length: Hann window length, in samples.
        eps: floor applied before log10.

    Returns:
        Tensor of shape (B, n_frames, n_mels).
    """
    hann = torch.FloatTensor(get_window('hann', win_length, fftbins=True)).to(wav.device)
    basis = torch.FloatTensor(mel_basis).to(wav.device)
    # No centering: frames start at sample 0, so n_frames = 1 + (T - fft_size) // hop.
    spec = torch.stft(wav, fft_size, hop_size, win_length, hann,
                      center=False, pad_mode='constant', normalized=False,
                      onesided=True, return_complex=True)
    magnitude = spec.abs()
    log_mel = torch.clamp_min(basis @ magnitude, eps).log10()  # (B, n_mels, T)
    return log_mel.transpose(1, 2)
def mel2mfcc_torch(mel, n_coef=13):
    """Keep the first ``n_coef`` cepstral coefficients of an orthonormal DCT.

    Expects a 3-D tensor (the last axis is truncated to ``n_coef``); the
    transform itself is delegated to the project's ``dct`` helper.
    """
    cepstrum = dct(mel, norm='ortho')
    return cepstrum[:, :, :n_coef]
def librosa_wav2linearspec(wav_path,
                           fft_size=None,
                           hop_size=256,
                           win_length=1024,
                           window="hann",
                           num_mels=80,
                           fmin=80,
                           fmax=-1,
                           eps=1e-6,
                           sample_rate=22050,
                           loud_norm=False,
                           trim_long_sil=False,
                           center=True):
    """Compute a log10 linear spectrogram from a wav file or array.

    Mirrors ``librosa_wav2spec`` without the mel projection.

    Args:
        wav_path: path to an audio file, or an already-loaded 1-D waveform.
        fft_size: FFT size; defaults to ``win_length`` when None.
        hop_size: hop between STFT frames, in samples.
        win_length: analysis window length, in samples.
        window: window name passed to librosa's STFT.
        num_mels, fmin, fmax: unused here; kept so the signature matches
            ``librosa_wav2spec``.
        eps: floor applied before log10.
        sample_rate: target sampling rate for loading.
        loud_norm: if True, loudness-normalize to -16 LUFS (BS.1770).
        trim_long_sil: if True (path input only), load via VAD-based silence trimming.
        center: whether the STFT pads so frames are centered on their timestamps.

    Returns:
        Dict with 'wav' (aligned to spectrogram frames when ``center``) and
        'linear' of shape (T, fft_size // 2 + 1).
    """
    if isinstance(wav_path, str):
        if trim_long_sil:
            wav, _, _ = trim_long_silences(wav_path, sample_rate)
        else:
            # librosa.core was deprecated and removed in librosa 0.10.
            wav, _ = librosa.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    if fft_size is None:
        fft_size = win_length

    if loud_norm:
        meter = pyln.Meter(sample_rate)  # create BS.1770 loudness meter
        loudness = meter.integrated_loudness(wav)
        wav = pyln.normalize.loudness(wav, loudness, -16.0)
        # Loudness normalization may push samples past full scale; rescale.
        if np.abs(wav).max() > 1:
            wav = wav / np.abs(wav).max()

    # Amplitude spectrogram, shape (n_bins, T).
    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window, center=center)
    linear_spc = np.abs(x_stft)

    if center:
        # Pad, then truncate, so len(wav) == n_frames * hop_size exactly.
        l_pad, r_pad = librosa_pad_lr(wav, fft_size, hop_size, 1)
        wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
        wav = wav[:linear_spc.shape[1] * hop_size]

    # Log10 linear spectrogram.
    linear_spc = np.log10(np.maximum(eps, linear_spc))
    return {'wav': wav, 'linear': linear_spc.T}
def librosa_linear2mel(linear_spec, hparams, num_mels=160, eps=1e-6):
    """Project batched log10 linear spectrograms onto a mel filterbank.

    Args:
        linear_spec: log10-magnitude tensor of shape (B, T, fft_size // 2 + 1).
        hparams: dict with 'fft_size', 'fmin', 'fmax', 'audio_sample_rate'
            (other keys are ignored here).
        num_mels: number of mel channels.
        eps: floor applied before log10.

    Returns:
        Log10 mel tensor of shape (B, T, num_mels).
    """
    fft_size = hparams['fft_size']
    fmin = hparams['fmin']
    fmax = hparams['fmax']
    sample_rate = hparams['audio_sample_rate']

    # Mel filterbank; -1 sentinels mean "full band". Keyword arguments are
    # required since librosa 0.10 (sr/n_fft/... are keyword-only) and match
    # the call style used by librosa_wav2spec.
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels,
                                    fmin=fmin, fmax=fmax)
    # Broadcast the (n_mels, n_bins) basis across the batch for bmm.
    mel_basis = torch.FloatTensor(mel_basis).to(linear_spec.device)[None, :].repeat(
        linear_spec.shape[0], 1, 1)

    # Undo the log, project to mel, re-apply the log with an eps floor.
    linear_spec = torch.pow(10, linear_spec)
    mel = torch.bmm(mel_basis, linear_spec.transpose(1, 2))  # (B, n_mels, T)
    mel = torch.log10(torch.clamp_min(mel, eps))
    return mel.transpose(1, 2)
|