import gradio as gr
import tensorflow as tf
import librosa
import numpy as np

# Class labels; the index order must match the model's output
labels = [
    'stop', 'six', 'sheila', 'marvin', 'go', 'tree', 'seven', 'five', 'two', 'off',
    'bird', 'zero', 'dog', 'house', 'one', 'right', 'nine', 'on', 'yes', 'wow', 'four',
    'three', 'up', 'no', 'left', 'cat', 'down', 'bed', 'eight', 'happy'
]
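# Assumption: these 30 keywords correspond to the classes of the Google Speech
# Commands dataset; adjust the list if the model was trained on different labels.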


def extract_features(file_name):
    try:
        # Load the audio and resample to 16 kHz
        audio, sample_rate = librosa.load(file_name, sr=16000)
        
        # Saca Mel-spectrograma
        mel_spectrogram = librosa.feature.melspectrogram(
            y=audio, 
            sr=sample_rate, 
            n_mels=257, 
            n_fft=512, 
            hop_length=256
        )
        
        # Convert to a logarithmic (dB) scale
        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
        
        # Pad/trim to the exact size expected by the model (257 mel bins x 97 frames)
        log_mel_spectrogram = librosa.util.fix_length(log_mel_spectrogram, size=257, axis=0)
        log_mel_spectrogram = librosa.util.fix_length(log_mel_spectrogram, size=97, axis=1)
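        # With sr=16000 and hop_length=256, 97 frames cover roughly 1.55 s of audio;
        # fix_length zero-pads shorter clips and truncates longer ones.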
        
        # Normalize to zero mean and unit variance
        log_mel_spectrogram = (log_mel_spectrogram - np.mean(log_mel_spectrogram)) / np.std(log_mel_spectrogram)

        # Add a channel dimension -> (257, 97, 1)
        log_mel_spectrogram = log_mel_spectrogram[..., np.newaxis]
        
    except Exception as e:
        print(f"Error encountered while parsing file: {file_name}")
        print(e)
        return None 
    
    return log_mel_spectrogram


def classify_audio(audio_file):
    print(f"Tipo de audio_file: {type(audio_file)}") 

    # With inputs=gr.Audio(type="filepath"), Gradio passes the upload as a path string, so no temp files are needed
    file_path = audio_file

    # Extract features
    features = extract_features(file_path)

    if features is None:
        return "Error al procesar el audio"

    # Add the batch dimension
    features = features[np.newaxis, ...]  # (1, 257, 97, 1)

    # Load the model (compile=False: inference only, no optimizer state) and run on the CPU
    model = tf.keras.models.load_model('my_model.h5', compile=False)
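    # Note: reloading the model on every request keeps this demo simple but is slow;
    # loading it once at module level would avoid repeated disk reads.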

    with tf.device('/CPU:0'):
        prediction = model.predict(features)
        predicted_label_index = np.argmax(prediction)

    predicted_label = labels[predicted_label_index]
    return predicted_label

iface = gr.Interface(
    fn=classify_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Clasificación de audio simple",
    description="Sube un archivo de audio para clasificarlo."
)

iface.launch()
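
# Note: 'my_model.h5' is loaded from the current working directory, so run this
# script from the folder that contains the model file (e.g. `python app.py`).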