""" This code is modified version of LCNN baseline from ASVSpoof2021 challenge - https://github.com/asvspoof-challenge/2021/blob/main/LA/Baseline-LFCC-LCNN/project/baseline_LA/model.py """ import sys import torch import torch.nn as torch_nn from src import frontends NUM_COEFFICIENTS = 384 # For blstm class BLSTMLayer(torch_nn.Module): """ Wrapper over dilated conv1D Input tensor: (batchsize=1, length, dim_in) Output tensor: (batchsize=1, length, dim_out) We want to keep the length the same """ def __init__(self, input_dim, output_dim): super().__init__() if output_dim % 2 != 0: print("Output_dim of BLSTMLayer is {:d}".format(output_dim)) print("BLSTMLayer expects a layer size of even number") sys.exit(1) # bi-directional LSTM self.l_blstm = torch_nn.LSTM( input_dim, output_dim // 2, bidirectional=True ) def forward(self, x): # permute to (length, batchsize=1, dim) blstm_data, _ = self.l_blstm(x.permute(1, 0, 2)) # permute it backt to (batchsize=1, length, dim) return blstm_data.permute(1, 0, 2) class MaxFeatureMap2D(torch_nn.Module): """ Max feature map (along 2D) MaxFeatureMap2D(max_dim=1) l_conv2d = MaxFeatureMap2D(1) data_in = torch.rand([1, 4, 5, 5]) data_out = l_conv2d(data_in) Input: ------ data_in: tensor of shape (batch, channel, ...) Output: ------- data_out: tensor of shape (batch, channel//2, ...) Note ---- By default, Max-feature-map is on channel dimension, and maxout is used on (channel ...) """ def __init__(self, max_dim = 1): super().__init__() self.max_dim = max_dim def forward(self, inputs): # suppose inputs (batchsize, channel, length, dim) shape = list(inputs.size()) if self.max_dim >= len(shape): print("MaxFeatureMap: maximize on %d dim" % (self.max_dim)) print("But input has %d dimensions" % (len(shape))) sys.exit(1) if shape[self.max_dim] // 2 * 2 != shape[self.max_dim]: print("MaxFeatureMap: maximize on %d dim" % (self.max_dim)) print("But this dimension has an odd number of data") sys.exit(1) shape[self.max_dim] = shape[self.max_dim]//2 shape.insert(self.max_dim, 2) # view to (batchsize, 2, channel//2, ...) 
class MaxFeatureMap2D(torch_nn.Module):
    """Max feature map (along 2D)

    MaxFeatureMap2D(max_dim=1)

    l_conv2d = MaxFeatureMap2D(1)
    data_in = torch.rand([1, 4, 5, 5])
    data_out = l_conv2d(data_in)

    Input:
    ------
    data_in: tensor of shape (batch, channel, ...)

    Output:
    -------
    data_out: tensor of shape (batch, channel//2, ...)

    Note
    ----
    By default, the max feature map is taken over the channel dimension,
    i.e. maxout over pairs of channels.
    """
    def __init__(self, max_dim=1):
        super().__init__()
        self.max_dim = max_dim

    def forward(self, inputs):
        # suppose inputs (batchsize, channel, length, dim)
        shape = list(inputs.size())
        if self.max_dim >= len(shape):
            print("MaxFeatureMap: maximize on %d dim" % (self.max_dim))
            print("But input has %d dimensions" % (len(shape)))
            sys.exit(1)
        if shape[self.max_dim] // 2 * 2 != shape[self.max_dim]:
            print("MaxFeatureMap: maximize on %d dim" % (self.max_dim))
            print("But this dimension has an odd number of data")
            sys.exit(1)
        shape[self.max_dim] = shape[self.max_dim] // 2
        shape.insert(self.max_dim, 2)

        # view to (batchsize, 2, channel//2, ...) and maximize on the 2nd dim
        m, i = inputs.view(*shape).max(self.max_dim)
        return m


##############
## FOR MODEL
##############


class LCNN(torch_nn.Module):
    """Model definition"""
    def __init__(self, **kwargs):
        super().__init__()
        input_channels = kwargs.get("input_channels", 1)
        num_coefficients = kwargs.get("num_coefficients", NUM_COEFFICIENTS)

        # number of feature coefficients (frequency bins) per frame
        self.num_coefficients = num_coefficients

        # dimension of embedding vectors
        # here, the embedding is just the activation before sigmoid()
        self.v_emd_dim = 1

        # the original baseline can handle multiple front-end configurations;
        # by default, only a single front-end is used
        self.m_transform = torch_nn.Sequential(
            torch_nn.Conv2d(input_channels, 64, (5, 5), 1, padding=(2, 2)),
            MaxFeatureMap2D(),
            torch.nn.MaxPool2d((2, 2), (2, 2)),

            torch_nn.Conv2d(32, 64, (1, 1), 1, padding=(0, 0)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(32, affine=False),
            torch_nn.Conv2d(32, 96, (3, 3), 1, padding=(1, 1)),
            MaxFeatureMap2D(),

            torch.nn.MaxPool2d((2, 2), (2, 2)),
            torch_nn.BatchNorm2d(48, affine=False),

            torch_nn.Conv2d(48, 96, (1, 1), 1, padding=(0, 0)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(48, affine=False),
            torch_nn.Conv2d(48, 128, (3, 3), 1, padding=(1, 1)),
            MaxFeatureMap2D(),

            torch.nn.MaxPool2d((2, 2), (2, 2)),

            torch_nn.Conv2d(64, 128, (1, 1), 1, padding=(0, 0)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(64, affine=False),
            torch_nn.Conv2d(64, 64, (3, 3), 1, padding=(1, 1)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(32, affine=False),

            torch_nn.Conv2d(32, 64, (1, 1), 1, padding=(0, 0)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(32, affine=False),
            torch_nn.Conv2d(32, 64, (3, 3), 1, padding=(1, 1)),
            MaxFeatureMap2D(),
            torch_nn.MaxPool2d((2, 2), (2, 2)),

            torch_nn.Dropout(0.7)
        )

        self.m_before_pooling = torch_nn.Sequential(
            BLSTMLayer(
                (self.num_coefficients // 16) * 32,
                (self.num_coefficients // 16) * 32
            ),
            BLSTMLayer(
                (self.num_coefficients // 16) * 32,
                (self.num_coefficients // 16) * 32
            )
        )

        self.m_output_act = torch_nn.Linear(
            (self.num_coefficients // 16) * 32,
            self.v_emd_dim
        )

    def _compute_embedding(self, x):
        """Compute the embedding vector.

        Assume x of shape (batchsize, channel, num_coefficients, frame_length)
        Output of shape (batchsize, emb_dim)
        """
        # resample if necessary
        # x = self.m_resampler(x.squeeze(-1)).unsqueeze(-1)

        batch_size = x.shape[0]

        # buffer to store output scores from sub-models
        output_emb = torch.zeros(
            [batch_size, self.v_emd_dim],
            device=x.device,
            dtype=x.dtype
        )

        # only a single sub-model is used here
        idx = 0

        # 1. permute to (batch, channel, frame_length, num_coefficients)
        x = x.permute(0, 1, 3, 2)

        # 2. compute hidden features
        hidden_features = self.m_transform(x)

        # 3. (batch, channel, frame//N, feat_dim//N) ->
        #    (batch, frame//N, channel * feat_dim//N)
        #    where N (= 16) is caused by the max-pooling layers
        hidden_features = hidden_features.permute(0, 2, 1, 3).contiguous()
        frame_num = hidden_features.shape[1]
        hidden_features = hidden_features.view(batch_size, frame_num, -1)

        # 4. pass through the BLSTM layers, add a residual connection,
        #    then average-pool over frames
        hidden_features_lstm = self.m_before_pooling(hidden_features)

        # 5. pass through the output layer
        tmp_emb = self.m_output_act(
            (hidden_features_lstm + hidden_features).mean(1)
        )
        output_emb[idx * batch_size:(idx + 1) * batch_size] = tmp_emb

        return output_emb

    def _compute_score(self, feature_vec):
        # feature_vec is [batch * submodel, 1]
        return torch.sigmoid(feature_vec).squeeze(1)

    def forward(self, x):
        feature_vec = self._compute_embedding(x)
        return feature_vec
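
# A minimal usage sketch (not part of the original baseline): LCNN consumes
# feature maps of shape (batch, channels, num_coefficients, frames) and
# returns one raw logit per sample (the activation before sigmoid()).
# All concrete sizes below are illustrative.
def _demo_lcnn_shapes():
    lcnn = LCNN(input_channels=1, num_coefficients=80)
    features = torch.rand(2, 1, 80, 400)  # (batch, channel, n_coeff, frames)
    logits = lcnn(features)               # (batch, emb_dim)
    assert logits.shape == (2, 1)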
class FrontendLCNN(LCNN):
    """LCNN preceded by a configurable audio front-end"""
    def __init__(self, device: str = "cuda", **kwargs):
        super().__init__(**kwargs)

        self.device = device

        frontend_name = kwargs.get("frontend_algorithm", [])
        self.frontend = frontends.get_frontend(frontend_name)
        print(f"Using {frontend_name} frontend")

    def _compute_frontend(self, x):
        frontend = self.frontend(x)
        if frontend.ndim < 4:
            return frontend.unsqueeze(1)  # (bs, 1, n_lfcc, frames)
        return frontend  # (bs, n, n_lfcc, frames)

    def forward(self, x):
        x = self._compute_frontend(x)
        feature_vec = self._compute_embedding(x)
        return feature_vec


if __name__ == "__main__":
    device = "cuda"
    print("Definition of model")
    model = FrontendLCNN(
        input_channels=2,
        num_coefficients=80,
        device=device,
        frontend_algorithm=["mel_spec"]
    )
    model = model.to(device)
    batch_size = 12
    mock_input = torch.rand((batch_size, 64_600), device=device)
    output = model(mock_input)
    print(output.shape)
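
    # Illustrative follow-up (not in the original baseline): map the raw
    # embedding to a score in (0, 1), exactly as _compute_score does.
    scores = model._compute_score(output)
    print(scores.shape)  # expected: torch.Size([12])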