ldhldh committed
Commit bb5a96d · verified · 1 Parent(s): 274c600

Upload 11 files

src/models/__init__.py ADDED
File without changes
src/models/assets/mel_filters.npz ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dd2cc75e70e36fcbdd8ffbc2499062f30094093e6bf2cbafa9859f59972b420b
size 2048
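The pointer above only records the Git LFS object; the underlying asset is an 80-band mel filterbank matrix. A minimal sketch of how such a file can be regenerated and inspected, assuming librosa is available (the parameters mirror the docstring of mel_filters() in whisper_main.py below):

# Sketch: regenerate and inspect the mel filterbank asset (assumes librosa is installed).
import librosa
import numpy as np

np.savez_compressed(
    "mel_filters.npz",
    mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
)

with np.load("mel_filters.npz") as f:
    print(f["mel_80"].shape)  # (80, 201): n_mels x (n_fft // 2 + 1)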
src/models/lcnn.py ADDED
@@ -0,0 +1,247 @@
"""
This code is a modified version of the LCNN baseline
from the ASVspoof 2021 challenge - https://github.com/asvspoof-challenge/2021/blob/main/LA/Baseline-LFCC-LCNN/project/baseline_LA/model.py
"""
import sys

import torch
import torch.nn as torch_nn

from src import frontends


NUM_COEFFICIENTS = 384


# For blstm
class BLSTMLayer(torch_nn.Module):
    """ Wrapper over a bi-directional LSTM
    Input tensor: (batchsize=1, length, dim_in)
    Output tensor: (batchsize=1, length, dim_out)
    We want to keep the length the same
    """
    def __init__(self, input_dim, output_dim):
        super().__init__()
        if output_dim % 2 != 0:
            print("Output_dim of BLSTMLayer is {:d}".format(output_dim))
            print("BLSTMLayer expects a layer size of even number")
            sys.exit(1)
        # bi-directional LSTM
        self.l_blstm = torch_nn.LSTM(
            input_dim,
            output_dim // 2,
            bidirectional=True
        )

    def forward(self, x):
        # permute to (length, batchsize=1, dim)
        blstm_data, _ = self.l_blstm(x.permute(1, 0, 2))
        # permute it back to (batchsize=1, length, dim)
        return blstm_data.permute(1, 0, 2)


class MaxFeatureMap2D(torch_nn.Module):
    """ Max feature map (along 2D)

    MaxFeatureMap2D(max_dim=1)

    l_conv2d = MaxFeatureMap2D(1)
    data_in = torch.rand([1, 4, 5, 5])
    data_out = l_conv2d(data_in)

    Input:
    ------
    data_in: tensor of shape (batch, channel, ...)

    Output:
    -------
    data_out: tensor of shape (batch, channel//2, ...)

    Note
    ----
    By default, Max-feature-map is on the channel dimension,
    and maxout is used on (channel ...)
    """
    def __init__(self, max_dim=1):
        super().__init__()
        self.max_dim = max_dim

    def forward(self, inputs):
        # suppose inputs (batchsize, channel, length, dim)

        shape = list(inputs.size())

        if self.max_dim >= len(shape):
            print("MaxFeatureMap: maximize on %d dim" % (self.max_dim))
            print("But input has %d dimensions" % (len(shape)))
            sys.exit(1)
        if shape[self.max_dim] // 2 * 2 != shape[self.max_dim]:
            print("MaxFeatureMap: maximize on %d dim" % (self.max_dim))
            print("But this dimension has an odd number of data")
            sys.exit(1)
        shape[self.max_dim] = shape[self.max_dim] // 2
        shape.insert(self.max_dim, 2)

        # view to (batchsize, 2, channel//2, ...)
        # maximize on the 2nd dim
        m, i = inputs.view(*shape).max(self.max_dim)
        return m


##############
## FOR MODEL
##############

class LCNN(torch_nn.Module):
    """ Model definition
    """
    def __init__(self, **kwargs):
        super().__init__()
        input_channels = kwargs.get("input_channels", 1)
        num_coefficients = kwargs.get("num_coefficients", NUM_COEFFICIENTS)

        # number of frequency coefficients in the input features
        self.num_coefficients = num_coefficients

        # dimension of embedding vectors
        # here, the embedding is just the activation before sigmoid()
        self.v_emd_dim = 1

        # the model can handle multiple front-end configurations;
        # by default, only a single front-end is used

        self.m_transform = torch_nn.Sequential(
            torch_nn.Conv2d(input_channels, 64, (5, 5), 1, padding=(2, 2)),
            MaxFeatureMap2D(),
            torch.nn.MaxPool2d((2, 2), (2, 2)),

            torch_nn.Conv2d(32, 64, (1, 1), 1, padding=(0, 0)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(32, affine=False),
            torch_nn.Conv2d(32, 96, (3, 3), 1, padding=(1, 1)),
            MaxFeatureMap2D(),

            torch.nn.MaxPool2d((2, 2), (2, 2)),
            torch_nn.BatchNorm2d(48, affine=False),

            torch_nn.Conv2d(48, 96, (1, 1), 1, padding=(0, 0)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(48, affine=False),
            torch_nn.Conv2d(48, 128, (3, 3), 1, padding=(1, 1)),
            MaxFeatureMap2D(),

            torch.nn.MaxPool2d((2, 2), (2, 2)),

            torch_nn.Conv2d(64, 128, (1, 1), 1, padding=(0, 0)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(64, affine=False),
            torch_nn.Conv2d(64, 64, (3, 3), 1, padding=(1, 1)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(32, affine=False),

            torch_nn.Conv2d(32, 64, (1, 1), 1, padding=(0, 0)),
            MaxFeatureMap2D(),
            torch_nn.BatchNorm2d(32, affine=False),
            torch_nn.Conv2d(32, 64, (3, 3), 1, padding=(1, 1)),
            MaxFeatureMap2D(),
            torch_nn.MaxPool2d((2, 2), (2, 2)),

            torch_nn.Dropout(0.7)
        )

        self.m_before_pooling = torch_nn.Sequential(
            BLSTMLayer((self.num_coefficients // 16) * 32, (self.num_coefficients // 16) * 32),
            BLSTMLayer((self.num_coefficients // 16) * 32, (self.num_coefficients // 16) * 32)
        )

        self.m_output_act = torch_nn.Linear((self.num_coefficients // 16) * 32, self.v_emd_dim)

    def _compute_embedding(self, x):
        """ definition of forward method
        Assume x (batchsize, length, dim)
        Output x (batchsize * number_filter, output_dim)
        """
        # resample if necessary
        # x = self.m_resampler(x.squeeze(-1)).unsqueeze(-1)

        # number of sub models
        batch_size = x.shape[0]

        # buffer to store output scores from sub-models
        output_emb = torch.zeros(
            [batch_size, self.v_emd_dim],
            device=x.device,
            dtype=x.dtype
        )

        # compute scores for each sub-model
        idx = 0

        # compute scores
        # 1. unsqueeze to (batch, 1, frame_length, fft_bin)
        # 2. compute hidden features
        x = x.permute(0, 1, 3, 2)
        hidden_features = self.m_transform(x)

        # 3. (batch, channel, frame//N, feat_dim//N) ->
        #    (batch, frame//N, channel * feat_dim//N)
        #    where N is caused by conv with stride
        hidden_features = hidden_features.permute(0, 2, 1, 3).contiguous()
        frame_num = hidden_features.shape[1]

        hidden_features = hidden_features.view(batch_size, frame_num, -1)
        # 4. pooling: pass through bi-LSTM layers, then average over frames
        hidden_features_lstm = self.m_before_pooling(hidden_features)

        # 5. pass through the output layer
        tmp_emb = self.m_output_act((hidden_features_lstm + hidden_features).mean(1))
        output_emb[idx * batch_size: (idx + 1) * batch_size] = tmp_emb

        return output_emb

    def _compute_score(self, feature_vec):
        # feature_vec is [batch * submodel, 1]
        return torch.sigmoid(feature_vec).squeeze(1)

    def forward(self, x):
        feature_vec = self._compute_embedding(x)
        return feature_vec


class FrontendLCNN(LCNN):
    """ Model definition
    """
    def __init__(self, device: str = "cuda", **kwargs):
        super().__init__(**kwargs)

        self.device = device

        frontend_name = kwargs.get("frontend_algorithm", [])
        self.frontend = frontends.get_frontend(frontend_name)
        print(f"Using {frontend_name} frontend")

    def _compute_frontend(self, x):
        frontend = self.frontend(x)
        if frontend.ndim < 4:
            return frontend.unsqueeze(1)  # (bs, 1, n_lfcc, frames)
        return frontend  # (bs, n, n_lfcc, frames)

    def forward(self, x):
        x = self._compute_frontend(x)
        feature_vec = self._compute_embedding(x)

        return feature_vec


if __name__ == "__main__":

    device = "cuda"
    print("Definition of model")
    model = FrontendLCNN(input_channels=2, num_coefficients=80, device=device, frontend_algorithm=["mel_spec"])
    model = model.to(device)
    batch_size = 12
    mock_input = torch.rand((batch_size, 64_600), device=device)
    output = model(mock_input)
    print(output.shape)
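MaxFeatureMap2D above halves the channel dimension by taking an element-wise maximum over the two channel halves. A minimal standalone sketch of the equivalent computation, useful for sanity-checking shapes (assumes only torch):

# Sketch: MaxFeatureMap2D over the channel dim equals a max of the two channel halves.
import torch

x = torch.rand(1, 4, 5, 5)                 # (batch, channel, H, W)
first, second = x.chunk(2, dim=1)          # two (1, 2, 5, 5) halves
mfm_reference = torch.maximum(first, second)

# MaxFeatureMap2D reshapes to (batch, 2, channel // 2, H, W) and maxes over dim 1:
mfm_view = x.view(1, 2, 2, 5, 5).max(dim=1).values
assert torch.equal(mfm_reference, mfm_view)  # both yield shape (1, 2, 5, 5)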
src/models/meso_net.py ADDED
@@ -0,0 +1,146 @@
"""
This code is a modified version of the MesoNet DeepFake detection solution
from the FakeAVCeleb repository - https://github.com/DASH-Lab/FakeAVCeleb/blob/main/models/MesoNet.py.
"""
import torch
import torch.nn as nn

from src import frontends


class MesoInception4(nn.Module):
    """
    Pytorch Implementation of MesoInception4
    Author: Honggu Liu
    Date: July 7, 2019
    """
    def __init__(self, num_classes=1, **kwargs):
        super().__init__()

        self.fc1_dim = kwargs.get("fc1_dim", 1024)
        input_channels = kwargs.get("input_channels", 3)
        self.num_classes = num_classes

        # InceptionLayer1
        self.Incption1_conv1 = nn.Conv2d(input_channels, 1, 1, padding=0, bias=False)
        self.Incption1_conv2_1 = nn.Conv2d(input_channels, 4, 1, padding=0, bias=False)
        self.Incption1_conv2_2 = nn.Conv2d(4, 4, 3, padding=1, bias=False)
        self.Incption1_conv3_1 = nn.Conv2d(input_channels, 4, 1, padding=0, bias=False)
        self.Incption1_conv3_2 = nn.Conv2d(4, 4, 3, padding=2, dilation=2, bias=False)
        self.Incption1_conv4_1 = nn.Conv2d(input_channels, 2, 1, padding=0, bias=False)
        self.Incption1_conv4_2 = nn.Conv2d(2, 2, 3, padding=3, dilation=3, bias=False)
        self.Incption1_bn = nn.BatchNorm2d(11)

        # InceptionLayer2
        self.Incption2_conv1 = nn.Conv2d(11, 2, 1, padding=0, bias=False)
        self.Incption2_conv2_1 = nn.Conv2d(11, 4, 1, padding=0, bias=False)
        self.Incption2_conv2_2 = nn.Conv2d(4, 4, 3, padding=1, bias=False)
        self.Incption2_conv3_1 = nn.Conv2d(11, 4, 1, padding=0, bias=False)
        self.Incption2_conv3_2 = nn.Conv2d(4, 4, 3, padding=2, dilation=2, bias=False)
        self.Incption2_conv4_1 = nn.Conv2d(11, 2, 1, padding=0, bias=False)
        self.Incption2_conv4_2 = nn.Conv2d(2, 2, 3, padding=3, dilation=3, bias=False)
        self.Incption2_bn = nn.BatchNorm2d(12)

        # Normal Layer
        self.conv1 = nn.Conv2d(12, 16, 5, padding=2, bias=False)
        self.relu = nn.ReLU(inplace=True)
        self.leakyrelu = nn.LeakyReLU(0.1)
        self.bn1 = nn.BatchNorm2d(16)
        self.maxpooling1 = nn.MaxPool2d(kernel_size=(2, 2))

        self.conv2 = nn.Conv2d(16, 16, 5, padding=2, bias=False)
        self.maxpooling2 = nn.MaxPool2d(kernel_size=(4, 4))

        self.dropout = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(self.fc1_dim, 16)
        self.fc2 = nn.Linear(16, num_classes)

    # InceptionLayer
    def InceptionLayer1(self, input):
        x1 = self.Incption1_conv1(input)
        x2 = self.Incption1_conv2_1(input)
        x2 = self.Incption1_conv2_2(x2)
        x3 = self.Incption1_conv3_1(input)
        x3 = self.Incption1_conv3_2(x3)
        x4 = self.Incption1_conv4_1(input)
        x4 = self.Incption1_conv4_2(x4)
        y = torch.cat((x1, x2, x3, x4), 1)
        y = self.Incption1_bn(y)
        y = self.maxpooling1(y)

        return y

    def InceptionLayer2(self, input):
        x1 = self.Incption2_conv1(input)
        x2 = self.Incption2_conv2_1(input)
        x2 = self.Incption2_conv2_2(x2)
        x3 = self.Incption2_conv3_1(input)
        x3 = self.Incption2_conv3_2(x3)
        x4 = self.Incption2_conv4_1(input)
        x4 = self.Incption2_conv4_2(x4)
        y = torch.cat((x1, x2, x3, x4), 1)
        y = self.Incption2_bn(y)
        y = self.maxpooling1(y)

        return y

    def forward(self, input):
        x = self._compute_embedding(input)
        return x

    def _compute_embedding(self, input):
        x = self.InceptionLayer1(input)  # (Batch, 11, 128, 128)
        x = self.InceptionLayer2(x)  # (Batch, 12, 64, 64)

        x = self.conv1(x)  # (Batch, 16, 64, 64)
        x = self.relu(x)
        x = self.bn1(x)
        x = self.maxpooling1(x)  # (Batch, 16, 32, 32)

        x = self.conv2(x)  # (Batch, 16, 32, 32)
        x = self.relu(x)
        x = self.bn1(x)
        x = self.maxpooling2(x)  # (Batch, 16, 8, 8)

        x = x.view(x.size(0), -1)  # (Batch, 16*8*8)
        x = self.dropout(x)

        x = nn.AdaptiveAvgPool1d(self.fc1_dim)(x)
        x = self.fc1(x)  # (Batch, 16)
        x = self.leakyrelu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x


class FrontendMesoInception4(MesoInception4):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.device = kwargs['device']

        frontend_name = kwargs.get("frontend_algorithm", [])
        self.frontend = frontends.get_frontend(frontend_name)
        print(f"Using {frontend_name} frontend")

    def forward(self, x):
        x = self.frontend(x)
        x = self._compute_embedding(x)
        return x


if __name__ == "__main__":
    model = FrontendMesoInception4(
        input_channels=2,
        fc1_dim=1024,
        device='cuda',
        frontend_algorithm="lfcc"
    )

    def count_parameters(model) -> int:
        pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        return pytorch_total_params
    print(count_parameters(model))
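Note that _compute_embedding above flattens whatever spatial size remains and then adaptively pools it to fc1_dim, so fc1 does not constrain the input spectrogram size. A minimal sketch of that step in isolation (the feature shape below is illustrative, not taken from a real frontend):

# Sketch: flatten + AdaptiveAvgPool1d makes fc1's input width independent of the input size.
import torch
import torch.nn as nn

fc1_dim = 1024
features = torch.rand(8, 16, 13, 37)          # (batch, channels, H, W) of arbitrary spatial size
flat = features.view(features.size(0), -1)    # (8, 16 * 13 * 37)
pooled = nn.AdaptiveAvgPool1d(fc1_dim)(flat)  # (8, 1024), regardless of H and W
print(pooled.shape)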
src/models/models.py ADDED
@@ -0,0 +1,73 @@
from typing import Dict

from src.models import (
    lcnn,
    specrnet,
    whisper_specrnet,
    rawnet3,
    whisper_lcnn,
    meso_net,
    whisper_meso_net
)


def get_model(model_name: str, config: Dict, device: str):
    if model_name == "rawnet3":
        return rawnet3.prepare_model()
    elif model_name == "lcnn":
        return lcnn.FrontendLCNN(device=device, **config)
    elif model_name == "specrnet":
        return specrnet.FrontendSpecRNet(
            device=device,
            **config,
        )
    elif model_name == "mesonet":
        return meso_net.FrontendMesoInception4(
            input_channels=config.get("input_channels", 1),
            fc1_dim=config.get("fc1_dim", 1024),
            frontend_algorithm=config.get("frontend_algorithm", "lfcc"),
            device=device,
        )
    elif model_name == "whisper_lcnn":
        return whisper_lcnn.WhisperLCNN(
            input_channels=config.get("input_channels", 1),
            freeze_encoder=config.get("freeze_encoder", False),
            device=device,
        )
    elif model_name == "whisper_specrnet":
        return whisper_specrnet.WhisperSpecRNet(
            input_channels=config.get("input_channels", 1),
            freeze_encoder=config.get("freeze_encoder", False),
            device=device,
        )
    elif model_name == "whisper_mesonet":
        return whisper_meso_net.WhisperMesoNet(
            input_channels=config.get("input_channels", 1),
            freeze_encoder=config.get("freeze_encoder", True),
            fc1_dim=config.get("fc1_dim", 1024),
            device=device,
        )
    elif model_name == "whisper_frontend_lcnn":
        return whisper_lcnn.WhisperMultiFrontLCNN(
            input_channels=config.get("input_channels", 2),
            freeze_encoder=config.get("freeze_encoder", False),
            frontend_algorithm=config.get("frontend_algorithm", "lfcc"),
            device=device,
        )
    elif model_name == "whisper_frontend_specrnet":
        return whisper_specrnet.WhisperMultiFrontSpecRNet(
            input_channels=config.get("input_channels", 2),
            freeze_encoder=config.get("freeze_encoder", False),
            frontend_algorithm=config.get("frontend_algorithm", "lfcc"),
            device=device,
        )
    elif model_name == "whisper_frontend_mesonet":
        return whisper_meso_net.WhisperMultiFrontMesoNet(
            input_channels=config.get("input_channels", 2),
            fc1_dim=config.get("fc1_dim", 1024),
            freeze_encoder=config.get("freeze_encoder", True),
            frontend_algorithm=config.get("frontend_algorithm", "lfcc"),
            device=device,
        )
    else:
        raise ValueError(f"Model '{model_name}' not supported")
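A minimal sketch of how this factory might be called; the config keys below are assumptions consistent with the defaults above, not values taken from a training config in this commit:

# Sketch: instantiating a classifier through the factory (config keys are assumed, not from this commit).
from src.models.models import get_model

config = {
    "input_channels": 1,
    "fc1_dim": 1024,
    "frontend_algorithm": "lfcc",
}
model = get_model(model_name="mesonet", config=config, device="cpu")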
src/models/rawnet3.py ADDED
@@ -0,0 +1,323 @@
"""
This file contains implementation of RawNet3 architecture.
The original implementation can be found here: https://github.com/Jungjee/RawNet/tree/master/python/RawNet3
"""
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from asteroid_filterbanks import Encoder, ParamSincFB  # pip install asteroid_filterbanks


class RawNet3(nn.Module):
    def __init__(self, block, model_scale, context, summed, C=1024, **kwargs):
        super().__init__()

        nOut = kwargs["nOut"]

        self.context = context
        self.encoder_type = kwargs["encoder_type"]
        self.log_sinc = kwargs["log_sinc"]
        self.norm_sinc = kwargs["norm_sinc"]
        self.out_bn = kwargs["out_bn"]
        self.summed = summed

        self.preprocess = nn.Sequential(
            PreEmphasis(), nn.InstanceNorm1d(1, eps=1e-4, affine=True)
        )
        self.conv1 = Encoder(
            ParamSincFB(
                C // 4,
                251,
                stride=kwargs["sinc_stride"],
            )
        )
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(C // 4)

        self.layer1 = block(
            C // 4, C, kernel_size=3, dilation=2, scale=model_scale, pool=5
        )
        self.layer2 = block(
            C, C, kernel_size=3, dilation=3, scale=model_scale, pool=3
        )
        self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=model_scale)
        self.layer4 = nn.Conv1d(3 * C, 1536, kernel_size=1)

        if self.context:
            attn_input = 1536 * 3
        else:
            attn_input = 1536
        print("self.encoder_type", self.encoder_type)
        if self.encoder_type == "ECA":
            attn_output = 1536
        elif self.encoder_type == "ASP":
            attn_output = 1
        else:
            raise ValueError("Undefined encoder")

        self.attention = nn.Sequential(
            nn.Conv1d(attn_input, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, attn_output, kernel_size=1),
            nn.Softmax(dim=2),
        )

        self.bn5 = nn.BatchNorm1d(3072)

        self.fc6 = nn.Linear(3072, nOut)
        self.bn6 = nn.BatchNorm1d(nOut)

        self.mp3 = nn.MaxPool1d(3)

    def forward(self, x):
        """
        :param x: input mini-batch (bs, samp)
        """

        with torch.cuda.amp.autocast(enabled=False):
            x = self.preprocess(x)
            x = torch.abs(self.conv1(x))
            if self.log_sinc:
                x = torch.log(x + 1e-6)
            if self.norm_sinc == "mean":
                x = x - torch.mean(x, dim=-1, keepdim=True)
            elif self.norm_sinc == "mean_std":
                m = torch.mean(x, dim=-1, keepdim=True)
                s = torch.std(x, dim=-1, keepdim=True)
                s[s < 0.001] = 0.001
                x = (x - m) / s

        if self.summed:
            x1 = self.layer1(x)
            x2 = self.layer2(x1)
            x3 = self.layer3(self.mp3(x1) + x2)
        else:
            x1 = self.layer1(x)
            x2 = self.layer2(x1)
            x3 = self.layer3(x2)

        x = self.layer4(torch.cat((self.mp3(x1), x2, x3), dim=1))
        x = self.relu(x)

        t = x.size()[-1]

        if self.context:
            global_x = torch.cat(
                (
                    x,
                    torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t),
                    torch.sqrt(
                        torch.var(x, dim=2, keepdim=True).clamp(
                            min=1e-4, max=1e4
                        )
                    ).repeat(1, 1, t),
                ),
                dim=1,
            )
        else:
            global_x = x

        w = self.attention(global_x)

        mu = torch.sum(x * w, dim=2)
        sg = torch.sqrt(
            (torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4)
        )

        x = torch.cat((mu, sg), 1)

        x = self.bn5(x)

        x = self.fc6(x)

        if self.out_bn:
            x = self.bn6(x)

        return x


class PreEmphasis(torch.nn.Module):
    def __init__(self, coef: float = 0.97) -> None:
        super().__init__()
        self.coef = coef
        # make kernel
        # In pytorch, the convolution operation uses cross-correlation. So, filter is flipped.
        self.register_buffer(
            "flipped_filter",
            torch.FloatTensor([-self.coef, 1.0]).unsqueeze(0).unsqueeze(0),
        )

    def forward(self, input: torch.tensor) -> torch.tensor:
        assert (
            len(input.size()) == 2
        ), "The number of dimensions of input tensor must be 2!"
        # reflect padding to match lengths of in/out
        input = input.unsqueeze(1)
        input = F.pad(input, (1, 0), "reflect")
        return F.conv1d(input, self.flipped_filter)


class AFMS(nn.Module):
    """
    Alpha-Feature map scaling, added to the output of each residual block[1,2].

    Reference:
    [1] RawNet2 : https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1011.pdf
    [2] AMFS : https://www.koreascience.or.kr/article/JAKO202029757857763.page
    """

    def __init__(self, nb_dim: int) -> None:
        super().__init__()
        self.alpha = nn.Parameter(torch.ones((nb_dim, 1)))
        self.fc = nn.Linear(nb_dim, nb_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        y = F.adaptive_avg_pool1d(x, 1).view(x.size(0), -1)
        y = self.sig(self.fc(y)).view(x.size(0), x.size(1), -1)

        x = x + self.alpha
        x = x * y
        return x


class Bottle2neck(nn.Module):
    def __init__(
        self,
        inplanes,
        planes,
        kernel_size=None,
        dilation=None,
        scale=4,
        pool=False,
    ):

        super().__init__()

        width = int(math.floor(planes / scale))

        self.conv1 = nn.Conv1d(inplanes, width * scale, kernel_size=1)
        self.bn1 = nn.BatchNorm1d(width * scale)

        self.nums = scale - 1

        convs = []
        bns = []

        num_pad = math.floor(kernel_size / 2) * dilation

        for i in range(self.nums):
            convs.append(
                nn.Conv1d(
                    width,
                    width,
                    kernel_size=kernel_size,
                    dilation=dilation,
                    padding=num_pad,
                )
            )
            bns.append(nn.BatchNorm1d(width))

        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)

        self.conv3 = nn.Conv1d(width * scale, planes, kernel_size=1)
        self.bn3 = nn.BatchNorm1d(planes)

        self.relu = nn.ReLU()

        self.width = width

        self.mp = nn.MaxPool1d(pool) if pool else False
        self.afms = AFMS(planes)

        if inplanes != planes:  # if change in number of filters
            self.residual = nn.Sequential(
                nn.Conv1d(inplanes, planes, kernel_size=1, stride=1, bias=False)
            )
        else:
            self.residual = nn.Identity()

    def forward(self, x):
        residual = self.residual(x)

        out = self.conv1(x)
        out = self.relu(out)
        out = self.bn1(out)

        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            sp = self.convs[i](sp)
            sp = self.relu(sp)
            sp = self.bns[i](sp)
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = torch.cat((out, spx[self.nums]), 1)

        out = self.conv3(out)
        out = self.relu(out)
        out = self.bn3(out)

        out += residual
        if self.mp:
            out = self.mp(out)
        out = self.afms(out)

        return out


def prepare_model():
    model = RawNet3(
        Bottle2neck,
        model_scale=8,
        context=True,
        summed=True,
        encoder_type="ECA",
        nOut=1,  # number of slices
        out_bn=False,
        sinc_stride=10,
        log_sinc=True,
        norm_sinc="mean",
        grad_mult=1,
    )
    return model


if __name__ == "__main__":
    model = RawNet3(
        Bottle2neck,
        model_scale=8,
        context=True,
        summed=True,
        encoder_type="ECA",
        nOut=1,  # number of slices
        out_bn=False,
        sinc_stride=10,
        log_sinc=True,
        norm_sinc="mean",
        grad_mult=1,
    )
    gpu = False

    model.eval()
    print("RawNet3 initialised & weights loaded!")

    if torch.cuda.is_available():
        print("Cuda available, conducting inference on GPU")
        model = model.to("cuda")
        gpu = True

    audios = torch.rand(32, 64_600)

    out = model(audios)
    print(out.shape)
src/models/specrnet.py ADDED
@@ -0,0 +1,226 @@
"""
This file contains implementation of SpecRNet architecture.
We base our codebase on the implementation of RawNet2 by Hemlata Tak ([email protected]).
It is available here: https://github.com/asvspoof-challenge/2021/blob/main/LA/Baseline-RawNet2/model.py
"""
from typing import Dict

import torch.nn as nn

from src import frontends


def get_config(input_channels: int) -> Dict:
    return {
        "filts": [input_channels, [input_channels, 20], [20, 64], [64, 64]],
        "nb_fc_node": 64,
        "gru_node": 64,
        "nb_gru_layer": 2,
        "nb_classes": 1,
    }


class Residual_block2D(nn.Module):
    def __init__(self, nb_filts, first=False):
        super().__init__()
        self.first = first

        if not self.first:
            self.bn1 = nn.BatchNorm2d(num_features=nb_filts[0])

        self.lrelu = nn.LeakyReLU(negative_slope=0.3)

        self.conv1 = nn.Conv2d(
            in_channels=nb_filts[0],
            out_channels=nb_filts[1],
            kernel_size=3,
            padding=1,
            stride=1,
        )

        self.bn2 = nn.BatchNorm2d(num_features=nb_filts[1])
        self.conv2 = nn.Conv2d(
            in_channels=nb_filts[1],
            out_channels=nb_filts[1],
            padding=1,
            kernel_size=3,
            stride=1,
        )

        if nb_filts[0] != nb_filts[1]:
            self.downsample = True
            self.conv_downsample = nn.Conv2d(
                in_channels=nb_filts[0],
                out_channels=nb_filts[1],
                padding=0,
                kernel_size=1,
                stride=1,
            )

        else:
            self.downsample = False
        self.mp = nn.MaxPool2d(2)

    def forward(self, x):
        identity = x
        if not self.first:
            out = self.bn1(x)
            out = self.lrelu(out)
        else:
            out = x

        out = self.conv1(x)
        out = self.bn2(out)
        out = self.lrelu(out)
        out = self.conv2(out)

        if self.downsample:
            identity = self.conv_downsample(identity)

        out += identity
        out = self.mp(out)
        return out


class SpecRNet(nn.Module):
    def __init__(self, input_channels, **kwargs):
        super().__init__()
        config = get_config(input_channels=input_channels)

        self.device = kwargs.get("device", "cuda")

        self.first_bn = nn.BatchNorm2d(num_features=config["filts"][0])
        self.selu = nn.SELU(inplace=True)
        self.block0 = nn.Sequential(
            Residual_block2D(nb_filts=config["filts"][1], first=True)
        )
        self.block2 = nn.Sequential(Residual_block2D(nb_filts=config["filts"][2]))
        config["filts"][2][0] = config["filts"][2][1]
        self.block4 = nn.Sequential(Residual_block2D(nb_filts=config["filts"][2]))
        self.avgpool = nn.AdaptiveAvgPool2d(1)

        self.fc_attention0 = self._make_attention_fc(
            in_features=config["filts"][1][-1], l_out_features=config["filts"][1][-1]
        )
        self.fc_attention2 = self._make_attention_fc(
            in_features=config["filts"][2][-1], l_out_features=config["filts"][2][-1]
        )
        self.fc_attention4 = self._make_attention_fc(
            in_features=config["filts"][2][-1], l_out_features=config["filts"][2][-1]
        )

        self.bn_before_gru = nn.BatchNorm2d(num_features=config["filts"][2][-1])
        self.gru = nn.GRU(
            input_size=config["filts"][2][-1],
            hidden_size=config["gru_node"],
            num_layers=config["nb_gru_layer"],
            batch_first=True,
            bidirectional=True,
        )

        self.fc1_gru = nn.Linear(
            in_features=config["gru_node"] * 2, out_features=config["nb_fc_node"] * 2
        )

        self.fc2_gru = nn.Linear(
            in_features=config["nb_fc_node"] * 2,
            out_features=config["nb_classes"],
            bias=True,
        )

        self.sig = nn.Sigmoid()

    def _compute_embedding(self, x):
        x = self.first_bn(x)
        x = self.selu(x)

        x0 = self.block0(x)
        y0 = self.avgpool(x0).view(x0.size(0), -1)
        y0 = self.fc_attention0(y0)
        y0 = self.sig(y0).view(y0.size(0), y0.size(1), -1)
        y0 = y0.unsqueeze(-1)
        x = x0 * y0 + y0

        x = nn.MaxPool2d(2)(x)

        x2 = self.block2(x)
        y2 = self.avgpool(x2).view(x2.size(0), -1)
        y2 = self.fc_attention2(y2)
        y2 = self.sig(y2).view(y2.size(0), y2.size(1), -1)
        y2 = y2.unsqueeze(-1)
        x = x2 * y2 + y2

        x = nn.MaxPool2d(2)(x)

        x4 = self.block4(x)
        y4 = self.avgpool(x4).view(x4.size(0), -1)
        y4 = self.fc_attention4(y4)
        y4 = self.sig(y4).view(y4.size(0), y4.size(1), -1)
        y4 = y4.unsqueeze(-1)
        x = x4 * y4 + y4

        x = nn.MaxPool2d(2)(x)

        x = self.bn_before_gru(x)
        x = self.selu(x)
        x = nn.AdaptiveAvgPool2d((1, None))(x)
        x = x.squeeze(-2)
        x = x.permute(0, 2, 1)
        self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x = x[:, -1, :]
        x = self.fc1_gru(x)
        x = self.fc2_gru(x)
        return x

    def forward(self, x):
        x = self._compute_embedding(x)
        return x

    def _make_attention_fc(self, in_features, l_out_features):
        l_fc = []
        l_fc.append(nn.Linear(in_features=in_features, out_features=l_out_features))
        return nn.Sequential(*l_fc)


class FrontendSpecRNet(SpecRNet):
    def __init__(self, input_channels, **kwargs):
        super().__init__(input_channels, **kwargs)

        self.device = kwargs['device']

        frontend_name = kwargs.get("frontend_algorithm", [])
        self.frontend = frontends.get_frontend(frontend_name)
        print(f"Using {frontend_name} frontend")

    def _compute_frontend(self, x):
        frontend = self.frontend(x)
        if frontend.ndim < 4:
            return frontend.unsqueeze(1)  # (bs, 1, n_lfcc, frames)
        return frontend  # (bs, n, n_lfcc, frames)

    def forward(self, x):
        x = self._compute_frontend(x)
        x = self._compute_embedding(x)
        return x


if __name__ == "__main__":
    print("Definition of model")
    device = "cuda"

    input_channels = 1
    config = {
        "filts": [input_channels, [input_channels, 20], [20, 64], [64, 64]],
        "nb_fc_node": 64,
        "gru_node": 64,
        "nb_gru_layer": 2,
        "nb_classes": 1,
    }

    def count_parameters(model) -> int:
        pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        return pytorch_total_params
    model = FrontendSpecRNet(input_channels=1, device=device, frontend_algorithm=["lfcc"])
    model = model.to(device)
    print(count_parameters(model))
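The __main__ block above only counts parameters. A forward pass can be sanity-checked the same way as in lcnn.py, assuming the "lfcc" frontend returns a (bs, 1, n_lfcc, frames) tensor for raw 16 kHz audio:

# Sketch: forward pass through FrontendSpecRNet on mock raw audio (assumes the "lfcc" frontend is available).
import torch

model = FrontendSpecRNet(input_channels=1, device="cpu", frontend_algorithm=["lfcc"])
model = model.to("cpu")
mock_input = torch.rand((12, 64_600))  # ~4 s of 16 kHz audio, as in lcnn.py's __main__
output = model(mock_input)
print(output.shape)  # expected: torch.Size([12, 1])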
src/models/whisper_lcnn.py ADDED
@@ -0,0 +1,89 @@
import torch

from src.models.whisper_main import ModelDimensions, Whisper, log_mel_spectrogram
from src.models.lcnn import LCNN
from src import frontends
from src.commons import WHISPER_MODEL_WEIGHTS_PATH


class WhisperLCNN(LCNN):

    def __init__(self, input_channels, freeze_encoder, **kwargs):
        super().__init__(input_channels=input_channels, **kwargs)

        self.device = kwargs['device']
        checkpoint = torch.load(WHISPER_MODEL_WEIGHTS_PATH)
        dims = ModelDimensions(**checkpoint["dims"].__dict__)
        model = Whisper(dims)
        model = model.to(self.device)
        model.load_state_dict(checkpoint["model_state_dict"])
        self.whisper_model = model
        if freeze_encoder:
            for param in self.whisper_model.parameters():
                param.requires_grad = False

    def compute_whisper_features(self, x):
        specs = []
        for sample in x:
            specs.append(log_mel_spectrogram(sample))
        x = torch.stack(specs)
        x = self.whisper_model(x)

        x = x.permute(0, 2, 1)  # (bs, frames, 3 x n_lfcc)
        x = x.unsqueeze(1)  # (bs, 1, frames, 3 x n_lfcc)
        x = x.repeat(
            (1, 1, 1, 2)
        )  # (bs, 1, frames, 3 x n_lfcc) -> (bs, 1, frames, 3000)
        return x

    def forward(self, x):
        # we assume that the data is correct (i.e. 30s)
        x = self.compute_whisper_features(x)
        out = self._compute_embedding(x)
        return out


class WhisperMultiFrontLCNN(WhisperLCNN):

    def __init__(self, input_channels, freeze_encoder, **kwargs):
        super().__init__(input_channels=input_channels, freeze_encoder=freeze_encoder, **kwargs)

        self.frontend = frontends.get_frontend(kwargs['frontend_algorithm'])
        print(f"Using {self.frontend} frontend!")

    def forward(self, x):
        # Frontend computation
        frontend_x = self.frontend(x)
        x = self.compute_whisper_features(x)

        x = torch.cat([x, frontend_x], 1)
        out = self._compute_embedding(x)
        return out


if __name__ == "__main__":
    import numpy as np

    input_channels = 1
    device = "cpu"
    classifier = WhisperLCNN(
        input_channels=input_channels,
        freeze_encoder=True,
        device=device,
    )

    input_channels = 2
    classifier_2 = WhisperMultiFrontLCNN(
        input_channels=input_channels,
        freeze_encoder=True,
        device=device,
        frontend_algorithm="lfcc"
    )
    x = np.random.rand(2, 30 * 16_000).astype(np.float32)
    x = torch.from_numpy(x)

    out = classifier(x)
    print(out.shape)

    out = classifier_2(x)
    print(out.shape)
src/models/whisper_main.py ADDED
@@ -0,0 +1,323 @@
# Based on https://github.com/openai/whisper/blob/main/whisper/model.py
from dataclasses import dataclass
from functools import lru_cache
import os
from typing import Iterable, Optional, Union

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from torch import nn


def exact_div(x, y):
    assert x % y == 0
    return x // y


# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 400
N_MELS = 80
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000: number of samples in a chunk
N_FRAMES = exact_div(
    N_SAMPLES, HOP_LENGTH
)  # 3000: number of frames in a mel spectrogram input


def pad_or_trim(
    array: Union[torch.Tensor, np.ndarray],
    length: int = N_SAMPLES,
    *,
    axis: int = -1,
) -> torch.Tensor:
    """
    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
    """
    if not torch.is_tensor(array):
        array = torch.from_numpy(array)

    if array.shape[axis] > length:
        array = array.index_select(
            dim=axis, index=torch.arange(length, device=array.device)
        )

    if array.shape[axis] < length:
        # pad by repeating the signal multiple times
        num_repeats = int(length / array.shape[axis]) + 1
        array = torch.tile(array, (1, num_repeats))[:, :length]
    return array


@lru_cache(maxsize=None)
def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
    """
    Load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling the librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    """
    assert n_mels == 80, f"Unsupported n_mels: {n_mels}"
    with np.load(
        os.path.join(os.path.dirname(__file__), "assets/mel_filters.npz")
    ) as f:
        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)


def log_mel_spectrogram(audio: torch.Tensor, n_mels: int = N_MELS):
    """
    Compute the log-Mel spectrogram of the audio.

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 is supported

    Returns
    -------
    torch.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    window = torch.hann_window(N_FFT).to(audio.device)
    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
    magnitudes = stft[:, :-1].abs() ** 2

    filters = mel_filters(audio.device, n_mels)
    mel_spec = filters @ magnitudes

    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    return log_spec


@dataclass
class ModelDimensions:
    n_mels: int
    n_audio_ctx: int
    n_audio_state: int
    n_audio_head: int
    n_audio_layer: int
    n_vocab: int
    n_text_ctx: int
    n_text_state: int
    n_text_head: int
    n_text_layer: int


class LayerNorm(nn.LayerNorm):
    def forward(self, x: Tensor) -> Tensor:
        return super().forward(x.float()).type(x.dtype)


class Linear(nn.Linear):
    def forward(self, x: Tensor) -> Tensor:
        return F.linear(
            x,
            self.weight.to(x.dtype),
            None if self.bias is None else self.bias.to(x.dtype),
        )


class Conv1d(nn.Conv1d):
    def _conv_forward(
        self, x: Tensor, weight: Tensor, bias: Optional[Tensor]
    ) -> Tensor:
        return super()._conv_forward(
            x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
        )


def sinusoids(length, channels, max_timescale=10_000):
    """Returns sinusoids for positional embedding"""
    assert channels % 2 == 0
    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
    scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)


class MultiHeadAttention(nn.Module):
    def __init__(self, n_state: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.query = Linear(n_state, n_state)
        self.key = Linear(n_state, n_state, bias=False)
        self.value = Linear(n_state, n_state)
        self.out = Linear(n_state, n_state)

    def forward(
        self,
        x: Tensor,
        xa: Optional[Tensor] = None,
        mask: Optional[Tensor] = None,
        kv_cache: Optional[dict] = None,
    ):
        q = self.query(x)

        if kv_cache is None or xa is None or self.key not in kv_cache:
            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
            # otherwise, perform key/value projections for self- or cross-attention as usual.
            k = self.key(x if xa is None else xa)
            v = self.value(x if xa is None else xa)
        else:
            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
            k = kv_cache[self.key]
            v = kv_cache[self.value]

        wv = self.qkv_attention(q, k, v, mask)
        return self.out(wv)

    def qkv_attention(
        self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None
    ):
        n_batch, n_ctx, n_state = q.shape
        scale = (n_state // self.n_head) ** -0.25
        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
        v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)

        qk = q @ k
        if mask is not None:
            qk = qk + mask[:n_ctx, :n_ctx]

        w = F.softmax(qk.float(), dim=-1).to(q.dtype)
        return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)


class ResidualAttentionBlock(nn.Module):
    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
        super().__init__()

        self.attn = MultiHeadAttention(n_state, n_head)
        self.attn_ln = LayerNorm(n_state)

        self.cross_attn = (
            MultiHeadAttention(n_state, n_head) if cross_attention else None
        )
        self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None

        n_mlp = n_state * 4
        self.mlp = nn.Sequential(
            Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)
        )
        self.mlp_ln = LayerNorm(n_state)

    def forward(
        self,
        x: Tensor,
        xa: Optional[Tensor] = None,
        mask: Optional[Tensor] = None,
        kv_cache: Optional[dict] = None,
    ):
        x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)
        if self.cross_attn:
            x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)
        x = x + self.mlp(self.mlp_ln(x))
        return x


class AudioEncoder(nn.Module):
    def __init__(
        self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
    ):
        super().__init__()
        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
        self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))

        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
        )
        self.ln_post = LayerNorm(n_state)

    def forward(self, x: Tensor):
        """
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        """
        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))
        x = x.permute(0, 2, 1)

        assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
        x = (x + self.positional_embedding).to(x.dtype)
        for block in self.blocks:
            x = block(x)

        x = self.ln_post(x)
        return x


class TextDecoder(nn.Module):
    def __init__(
        self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
    ):
        super().__init__()

        self.token_embedding = nn.Embedding(n_vocab, n_state)
        self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))

        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
            [
                ResidualAttentionBlock(n_state, n_head, cross_attention=True)
                for _ in range(n_layer)
            ]
        )
        self.ln = LayerNorm(n_state)

        mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
        self.register_buffer("mask", mask, persistent=False)

    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
        """
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx)
            the encoded audio features to be attended on
        """
        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
        x = (
            self.token_embedding(x)
            + self.positional_embedding[offset: offset + x.shape[-1]]
        )
        x = x.to(xa.dtype)

        for block in self.blocks:
            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)

        x = self.ln(x)
        logits = (
            x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
        ).float()

        return logits


class Whisper(nn.Module):
    def __init__(self, dims: ModelDimensions):
        super().__init__()
        self.dims = dims
        self.encoder = AudioEncoder(
            self.dims.n_mels,
            self.dims.n_audio_ctx,
            self.dims.n_audio_state,
            self.dims.n_audio_head,
            self.dims.n_audio_layer,
        )

    def forward(self, mel: torch.Tensor):
        return self.encoder(mel)

    @property
    def device(self):
        return next(self.parameters()).device
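A minimal sketch of how the pieces in this file fit together for a single 30 s clip, using the names defined above; the ModelDimensions values are assumed tiny-model-like numbers, not values read from the checkpoint used elsewhere in this commit:

# Sketch: mel extraction + encoder forward (dims are assumed tiny-like values, not from the real checkpoint).
import torch

audio = torch.rand(SAMPLE_RATE * CHUNK_LENGTH)  # mock 30 s waveform, shape (480000,)
audio = pad_or_trim(audio)                      # guarantees exactly N_SAMPLES samples
mel = log_mel_spectrogram(audio)                # (80, 3000) = (N_MELS, N_FRAMES)

dims = ModelDimensions(
    n_mels=80, n_audio_ctx=1500, n_audio_state=384, n_audio_head=6, n_audio_layer=4,
    n_vocab=51864, n_text_ctx=448, n_text_state=384, n_text_head=6, n_text_layer=4,
)
encoder_only = Whisper(dims)                    # randomly initialised; no checkpoint loaded here
features = encoder_only(mel.unsqueeze(0))       # (1, 1500, 384) = (bs, n_audio_ctx, n_audio_state)
print(features.shape)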
src/models/whisper_meso_net.py ADDED
@@ -0,0 +1,88 @@
import torch
from src import frontends

from src.models.whisper_main import ModelDimensions, Whisper, log_mel_spectrogram
from src.models.meso_net import MesoInception4
from src.commons import WHISPER_MODEL_WEIGHTS_PATH


class WhisperMesoNet(MesoInception4):
    def __init__(self, freeze_encoder, **kwargs):
        super().__init__(**kwargs)

        self.device = kwargs['device']
        checkpoint = torch.load(WHISPER_MODEL_WEIGHTS_PATH)
        dims = ModelDimensions(**checkpoint["dims"].__dict__)
        model = Whisper(dims)
        model = model.to(self.device)
        model.load_state_dict(checkpoint["model_state_dict"])
        self.whisper_model = model
        if freeze_encoder:
            for param in self.whisper_model.parameters():
                param.requires_grad = False

    def compute_whisper_features(self, x):
        specs = []
        for sample in x:
            specs.append(log_mel_spectrogram(sample))
        x = torch.stack(specs)
        x = self.whisper_model(x)

        x = x.permute(0, 2, 1)  # (bs, frames, 3 x n_lfcc)
        x = x.unsqueeze(1)  # (bs, 1, frames, 3 x n_lfcc)
        x = x.repeat(
            (1, 1, 1, 2)
        )  # (bs, 1, frames, 3 x n_lfcc) -> (bs, 1, frames, 3000)
        return x

    def forward(self, x):
        # we assume that the data is correct (i.e. 30s)
        x = self.compute_whisper_features(x)
        out = self._compute_embedding(x)
        return out


class WhisperMultiFrontMesoNet(WhisperMesoNet):
    def __init__(self, freeze_encoder, **kwargs):
        super().__init__(freeze_encoder=freeze_encoder, **kwargs)
        self.frontend = frontends.get_frontend(kwargs['frontend_algorithm'])
        print(f"Using {self.frontend} frontend!")

    def forward(self, x):
        # Frontend computation
        frontend_x = self.frontend(x)
        x = self.compute_whisper_features(x)

        x = torch.cat([x, frontend_x], 1)
        out = self._compute_embedding(x)
        return out


if __name__ == "__main__":
    import numpy as np

    input_channels = 1
    device = "cpu"
    classifier = WhisperMesoNet(
        input_channels=input_channels,
        freeze_encoder=True,
        fc1_dim=1024,
        device=device,
    )

    input_channels = 2
    classifier_2 = WhisperMultiFrontMesoNet(
        input_channels=input_channels,
        freeze_encoder=True,
        fc1_dim=1024,
        device=device,
        frontend_algorithm="lfcc"
    )
    x = np.random.rand(2, 30 * 16_000).astype(np.float32)
    x = torch.from_numpy(x)

    out = classifier(x)
    print(out.shape)

    out = classifier_2(x)
    print(out.shape)
src/models/whisper_specrnet.py ADDED
@@ -0,0 +1,97 @@
import numpy as np
import torch

from src import frontends
from src.models.whisper_main import ModelDimensions, Whisper, log_mel_spectrogram
from src.models.specrnet import SpecRNet
from src.commons import WHISPER_MODEL_WEIGHTS_PATH


class WhisperSpecRNet(SpecRNet):
    def __init__(self, input_channels, freeze_encoder, **kwargs):
        super().__init__(input_channels=input_channels, **kwargs)

        self.device = kwargs["device"]
        checkpoint = torch.load(WHISPER_MODEL_WEIGHTS_PATH)
        dims = ModelDimensions(**checkpoint["dims"].__dict__)
        model = Whisper(dims)
        model = model.to(self.device)
        model.load_state_dict(checkpoint["model_state_dict"])
        self.whisper_model = model
        if freeze_encoder:
            for param in self.whisper_model.parameters():
                param.requires_grad = False

    def compute_whisper_features(self, x):
        specs = []
        for sample in x:
            specs.append(log_mel_spectrogram(sample))
        x = torch.stack(specs)
        x = self.whisper_model(x)

        x = x.permute(0, 2, 1)  # (bs, frames, 3 x n_lfcc)
        x = x.unsqueeze(1)  # (bs, 1, frames, 3 x n_lfcc)
        x = x.repeat(
            (1, 1, 1, 2)
        )  # (bs, 1, frames, 3 x n_lfcc) -> (bs, 1, frames, 3000)
        return x

    def forward(self, x):
        # we assume that the data is correct (i.e. 30s)
        x = self.compute_whisper_features(x)
        out = self._compute_embedding(x)
        return out


class WhisperMultiFrontSpecRNet(WhisperSpecRNet):
    def __init__(self, input_channels, freeze_encoder, **kwargs):
        super().__init__(
            input_channels=input_channels,
            freeze_encoder=freeze_encoder,
            **kwargs,
        )
        self.frontend = frontends.get_frontend(kwargs["frontend_algorithm"])
        print(f"Using {self.frontend} frontend!")

    def forward(self, x):
        # Frontend computation
        frontend_x = self.frontend(x)
        x = self.compute_whisper_features(x)

        x = torch.cat([x, frontend_x], 1)
        out = self._compute_embedding(x)
        return out


if __name__ == "__main__":
    import numpy as np

    input_channels = 1
    config = {
        "filts": [input_channels, [input_channels, 20], [20, 64], [64, 64]],
        "nb_fc_node": 64,
        "gru_node": 64,
        "nb_gru_layer": 2,
        "nb_classes": 1,
    }
    device = "cpu"
    classifier = WhisperSpecRNet(
        input_channels,
        freeze_encoder=False,
        device=device,
    )
    input_channels = 2
    classifier_2 = WhisperMultiFrontSpecRNet(
        input_channels,
        freeze_encoder=False,
        device=device,
        frontend_algorithm="lfcc"
    )
    x = np.random.rand(2, 30 * 16_000).astype(np.float32)
    x = torch.from_numpy(x)

    out = classifier(x)
    print(out.shape)

    out = classifier_2(x)
    print(out.shape)