"""
This code is modified version of LCNN baseline
from ASVSpoof2021 challenge - https://github.com/asvspoof-challenge/2021/blob/main/LA/Baseline-LFCC-LCNN/project/baseline_LA/model.py
"""
import torch
import torch.nn as torch_nn
from src import frontends
NUM_COEFFICIENTS = 384
class BLSTMLayer(torch_nn.Module):
    """ Wrapper over a bi-directional LSTM
    Input tensor:  (batchsize, length, dim_in)
    Output tensor: (batchsize, length, dim_out)
    The sequence length is kept unchanged.
    """
def __init__(self, input_dim, output_dim):
super().__init__()
        if output_dim % 2 != 0:
            raise ValueError(
                "BLSTMLayer expects an even output_dim, got {:d}".format(output_dim)
            )
# bi-directional LSTM
self.l_blstm = torch_nn.LSTM(
input_dim,
output_dim // 2,
bidirectional=True
)
    def forward(self, x):
        # permute to (length, batchsize, dim)
        blstm_data, _ = self.l_blstm(x.permute(1, 0, 2))
        # permute it back to (batchsize, length, dim)
        return blstm_data.permute(1, 0, 2)
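
# Shape sketch (illustrative, not part of the model): the layer keeps the
# sequence length and maps input_dim to output_dim, for example:
#   _blstm = BLSTMLayer(input_dim=64, output_dim=64)
#   _out = _blstm(torch.rand(2, 100, 64))  # -> torch.Size([2, 100, 64])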
class MaxFeatureMap2D(torch_nn.Module):
    """ Max feature map (along 2D)

    MaxFeatureMap2D(max_dim=1)
    l_conv2d = MaxFeatureMap2D(1)
    data_in = torch.rand([1, 4, 5, 5])
    data_out = l_conv2d(data_in)

    Input:
    ------
    data_in: tensor of shape (batch, channel, ...)

    Output:
    -------
    data_out: tensor of shape (batch, channel//2, ...)

    Note
    ----
    By default, max-feature-map operates on the channel dimension:
    the channels are split into two halves and the element-wise
    maximum of the two halves is returned.
    """
    def __init__(self, max_dim=1):
        super().__init__()
        self.max_dim = max_dim

    def forward(self, inputs):
        # suppose inputs (batchsize, channel, length, dim)
        shape = list(inputs.size())
        if self.max_dim >= len(shape):
            raise ValueError(
                "MaxFeatureMap: cannot maximize on dim %d, input has only %d "
                "dimensions" % (self.max_dim, len(shape))
            )
        if shape[self.max_dim] % 2 != 0:
            raise ValueError(
                "MaxFeatureMap: dim %d must have an even size" % self.max_dim
            )
        shape[self.max_dim] = shape[self.max_dim] // 2
        shape.insert(self.max_dim, 2)
        # view to (batchsize, 2, channel//2, ...),
        # then maximize over the inserted dimension
        m, _ = inputs.view(*shape).max(self.max_dim)
        return m
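
# Shape sketch (illustrative): MFM halves the channel count by taking an
# element-wise max over the two channel halves, for example:
#   _mfm = MaxFeatureMap2D()
#   _out = _mfm(torch.rand(1, 4, 5, 5))  # -> torch.Size([1, 2, 5, 5])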
##############
## FOR MODEL
##############
class LCNN(torch_nn.Module):
""" Model definition
"""
def __init__(self, **kwargs):
super().__init__()
input_channels = kwargs.get("input_channels", 1)
num_coefficients = kwargs.get("num_coefficients", NUM_COEFFICIENTS)
        # number of input coefficients (feature dimension of the front-end)
self.num_coefficients = num_coefficients
# dimension of embedding vectors
# here, the embedding is just the activation before sigmoid()
self.v_emd_dim = 1
        # the original baseline can handle multiple front-end configurations;
        # here, a single front-end feeds one CNN stack.
        # note: MaxFeatureMap2D halves the channel count, so each Conv2d
        # takes half of the preceding Conv2d's output channels as input.
self.m_transform = torch_nn.Sequential(
torch_nn.Conv2d(input_channels, 64, (5, 5), 1, padding=(2, 2)),
MaxFeatureMap2D(),
            torch_nn.MaxPool2d((2, 2), (2, 2)),
torch_nn.Conv2d(32, 64, (1, 1), 1, padding=(0, 0)),
MaxFeatureMap2D(),
torch_nn.BatchNorm2d(32, affine=False),
torch_nn.Conv2d(32, 96, (3, 3), 1, padding=(1, 1)),
MaxFeatureMap2D(),
            torch_nn.MaxPool2d((2, 2), (2, 2)),
torch_nn.BatchNorm2d(48, affine=False),
torch_nn.Conv2d(48, 96, (1, 1), 1, padding=(0, 0)),
MaxFeatureMap2D(),
torch_nn.BatchNorm2d(48, affine=False),
torch_nn.Conv2d(48, 128, (3, 3), 1, padding=(1, 1)),
MaxFeatureMap2D(),
            torch_nn.MaxPool2d((2, 2), (2, 2)),
torch_nn.Conv2d(64, 128, (1, 1), 1, padding=(0, 0)),
MaxFeatureMap2D(),
torch_nn.BatchNorm2d(64, affine=False),
torch_nn.Conv2d(64, 64, (3, 3), 1, padding=(1, 1)),
MaxFeatureMap2D(),
torch_nn.BatchNorm2d(32, affine=False),
torch_nn.Conv2d(32, 64, (1, 1), 1, padding=(0, 0)),
MaxFeatureMap2D(),
torch_nn.BatchNorm2d(32, affine=False),
torch_nn.Conv2d(32, 64, (3, 3), 1, padding=(1, 1)),
MaxFeatureMap2D(),
torch_nn.MaxPool2d((2, 2), (2, 2)),
torch_nn.Dropout(0.7)
)
self.m_before_pooling = torch_nn.Sequential(
BLSTMLayer((self.num_coefficients//16) * 32, (self.num_coefficients//16) * 32),
BLSTMLayer((self.num_coefficients//16) * 32, (self.num_coefficients//16) * 32)
)
self.m_output_act = torch_nn.Linear((self.num_coefficients // 16) * 32, self.v_emd_dim)
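        # Shape bookkeeping (assuming input (batch, C, num_coefficients, frames)):
        # the four stride-2 max-pools divide each spatial dim by 16 and the CNN
        # stack ends with 32 channels, which gives the BLSTM/linear input size
        # (num_coefficients // 16) * 32 used above.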
    def _compute_embedding(self, x):
        """ Compute the embedding vector.
        Assume x (batch, channel, num_coefficients, frames)
        Output (batch, v_emd_dim)
        """
        batch_size = x.shape[0]
        # 1. (batch, channel, num_coefficients, frames)
        #    -> (batch, channel, frames, num_coefficients)
        x = x.permute(0, 1, 3, 2)
        # 2. compute hidden features with the CNN stack
        hidden_features = self.m_transform(x)
        # 3. (batch, channel, frames//N, num_coefficients//N)
        #    -> (batch, frames//N, channel * num_coefficients//N)
        #    where N is the accumulated stride of the max-pooling layers
        hidden_features = hidden_features.permute(0, 2, 1, 3).contiguous()
        frame_num = hidden_features.shape[1]
        hidden_features = hidden_features.view(batch_size, frame_num, -1)
        # 4. pass through the BLSTM layers with a residual connection,
        #    then average over frames
        hidden_features_lstm = self.m_before_pooling(hidden_features)
        # 5. pass through the output layer
        output_emb = self.m_output_act(
            (hidden_features_lstm + hidden_features).mean(1)
        )
        return output_emb
    def _compute_score(self, feature_vec):
        # feature_vec has shape (batch, 1); return sigmoid scores of shape (batch,)
        return torch.sigmoid(feature_vec).squeeze(1)
def forward(self, x):
feature_vec = self._compute_embedding(x)
return feature_vec
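
    # Note (usage sketch): forward() returns the raw, pre-sigmoid embedding;
    # a probability-like score can be obtained separately, e.g.
    #   _scores = model._compute_score(model(batch))  # shape (batch,)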
class FrontendLCNN(LCNN):
""" Model definition
"""
def __init__(self, device: str = "cuda", **kwargs):
super().__init__(**kwargs)
self.device = device
frontend_name = kwargs.get("frontend_algorithm", [])
self.frontend = frontends.get_frontend(frontend_name)
print(f"Using {frontend_name} frontend")
def _compute_frontend(self, x):
frontend = self.frontend(x)
if frontend.ndim < 4:
return frontend.unsqueeze(1) # (bs, 1, n_lfcc, frames)
return frontend # (bs, n, n_lfcc, frames)
def forward(self, x):
x = self._compute_frontend(x)
feature_vec = self._compute_embedding(x)
return feature_vec
if __name__ == "__main__":
device = "cuda"
print("Definition of model")
model = FrontendLCNN(input_channels=2, num_coefficients=80, device=device, frontend_algorithm=["mel_spec"])
model = model.to(device)
    batch_size = 12
    # ~4 s of audio at 16 kHz (the ASVspoof convention of 64,600 samples)
    mock_input = torch.rand((batch_size, 64_600), device=device)
output = model(mock_input)
print(output.shape)
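    # With v_emd_dim = 1, the expected printout is torch.Size([12, 1]).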