|
from torchaudio import transforms as T |
|
import torch |
|
import torch.nn as nn |
|
|
|
# Mean / standard deviation handed to the Standardization step of
# MelSpectrogramProcessor (presumably measured on min-max-normalized
# spectrograms of the training data -- TODO confirm provenance).
MEAN = 0.5347
STD = 0.0772

# Default front-end settings for MelSpectrogramProcessor.
SR = 16000  # default ``sample_rate``
NFFT = 1024  # default ``n_fft``
HOPLEN = 320  # default ``hop_length``
NMELS = 128  # default ``n_mels``
FMIN = 50  # default ``f_min``
FMAX = 8000  # default ``f_max``
|
|
|
class Normalization(torch.nn.Module):
    """Min-max scale a tensor into the range [0, 1].

    The minimum and maximum are taken over the whole input tensor (all
    dimensions at once), not per channel or per batch element.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` rescaled so its minimum maps to 0 and its maximum to 1.

        A constant input (``x.max() == x.min()``) yields all zeros rather
        than the NaNs a plain 0/0 division would produce.
        """
        low = x.min()
        span = x.max() - low
        # Only an exactly-zero span is replaced, so every non-constant input
        # is divided by the same value as before.
        safe_span = torch.where(span > 0, span, torch.ones_like(span))
        return (x - low) / safe_span
|
|
|
class Standardization(torch.nn.Module):
    """Shift and scale a tensor with a fixed mean and standard deviation."""

    def __init__(self, mean, std):
        """Store the statistics used by :meth:`forward`.

        Args:
            mean: Value subtracted from every element.
            std: Value every centered element is divided by.
        """
        super().__init__()
        self.mean, self.std = mean, std

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``(x - mean) / std`` computed element-wise."""
        centered = x - self.mean
        return centered / self.std
|
|
|
class MelSpectrogramProcessor:
    """Convert a waveform into a normalized, standardized dB mel spectrogram.

    The pipeline applies, in order: ``T.MelSpectrogram``, ``T.AmplitudeToDB``,
    ``Normalization`` and ``Standardization``.
    """

    def __init__(self, sample_rate=SR, n_mels=NMELS, n_fft=NFFT, hop_length=HOPLEN,
                 f_min=FMIN, f_max=FMAX, device='cpu', mean=MEAN, std=STD):
        """Build the transform pipeline and move it to ``device``.

        Args:
            sample_rate: Forwarded to ``T.MelSpectrogram``.
            n_mels: Forwarded to ``T.MelSpectrogram``.
            n_fft: Forwarded to ``T.MelSpectrogram``.
            hop_length: Forwarded to ``T.MelSpectrogram``.
            f_min: Forwarded to ``T.MelSpectrogram``.
            f_max: Forwarded to ``T.MelSpectrogram``.
            device: Device the pipeline lives on; inputs to :meth:`process`
                are moved there as well.
            mean: Mean used by the final ``Standardization`` step.
            std: Standard deviation used by the final ``Standardization`` step.
        """
        # Remembered so process() can put the input on the pipeline's device.
        self.device = device
        self.transform = nn.Sequential(
            T.MelSpectrogram(
                sample_rate=sample_rate,
                n_mels=n_mels,
                n_fft=n_fft,
                hop_length=hop_length,
                f_min=f_min,
                f_max=f_max,
            ),
            T.AmplitudeToDB(),
            Normalization(),
            Standardization(mean=mean, std=std),
        ).to(device)

    def process(self, waveform):
        """Run ``waveform`` through the pipeline and return the result.

        The waveform is first moved to the pipeline's device (a no-op when it
        is already there), so the returned tensor lives on that device.
        """
        return self.transform(waveform.to(self.device))