import torchaudio
import torch
from model import M11
import gradio as gr
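
# Gradio demo: classifies an uploaded Bengali voice call as "Threat", "Normal", or "Sarcastic".
# Every input is resampled to 8 kHz, downmixed to mono, and padded/truncated to a fixed length
# of 400,000 samples (~50 s) before being passed to the M11 classifier.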

def _cut_if_necessary(signal):
    # truncate signals longer than the fixed input length of 400,000 samples
    if signal.shape[1] > 400000:
        signal = signal[:, :400000]

    return signal

def _right_pad_if_necessary(signal):
    # zero-pad signals shorter than 400,000 samples on the right
    signal_length = signal.shape[1]
    if signal_length < 400000:
        num_missing_samples = 400000 - signal_length
        last_dim_padding = (0, num_missing_samples)   # pad 0 zeros on the left and num_missing_samples zeros on the right of the last dimension
        signal = torch.nn.functional.pad(signal, last_dim_padding)

    return signal

def preprocess(signal, sr, device):

    # add a channel dimension to 1-D samples
    if len(signal.shape) == 1:
        signal = signal.unsqueeze(0)

    # resample the audio to the 8 kHz rate used during training
    if sr != 8_000:
        resampler = torchaudio.transforms.Resample(sr, 8_000).to(device)
        signal = resampler(signal)

    # downmix stereo signals to mono
    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim=0, keepdim=True)

    signal = _cut_if_necessary(signal)         # truncate longer signals
    signal = _right_pad_if_necessary(signal)   # zero-pad shorter signals

    return signal


def pipeline(audio_file):

    audio_path = audio_file.name
    audio, sample_rate = torchaudio.load(audio_path)

    processed_audio = preprocess(audio.to(DEVICE), sample_rate, DEVICE)

    with torch.no_grad():
        # add a batch dimension; the model outputs log_softmax scores
        log_probs = classifier(processed_audio.unsqueeze(0)).squeeze()
        pred = torch.exp(log_probs)   # turn log-probabilities into probabilities

    confidences = {labels[i]: float(pred[i]) for i in range(len(labels))}
    print(confidences)   # debug: per-class probabilities
    print(log_probs)     # debug: raw log-probabilities

    return confidences


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_PATH = "./model.ckpt"

labels = ["Threat", "Normal", "Sarcastic"]

# load the trained checkpoint once at startup; eval() disables dropout and batch-norm updates for inference
classifier = M11.load_from_checkpoint(MODEL_PATH).to(DEVICE)
classifier.eval()

# Gradio UI: an audio file upload in, the top-3 class probabilities out
inputs = gr.inputs.Audio(label="Input Audio", type="file")
outputs = gr.outputs.Label(num_top_classes=3)
title = "Threat Detection From Bengali Voice Calls"
description = "Gradio demo for audio classification. Simply upload your audio, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://github.com/khalidsaifullaah' target='_blank'>Github Repo</a></p>"
examples = [
    ['sample_audio.wav']
]
gr.Interface(pipeline, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()
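
# Note: this script uses the legacy gr.inputs / gr.outputs namespaces from older Gradio releases.
# On current Gradio versions the rough equivalents are gr.Audio(type="filepath") and
# gr.Label(num_top_classes=3), with the handler receiving a file path string instead of a file object.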