import gradio as gr
import tensorflow as tf
import librosa
import numpy as np
import tempfile
# Class labels for the speech-command model; the index of each entry must
# match the ordering of the model's output units (argmax index -> label).
labels = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes']
def extract_features(file_name):
    """Compute a fixed-size, standardized log-magnitude spectrogram.

    Parameters
    ----------
    file_name : str
        Path to an audio file readable by ``librosa.load``.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(257, 97, 1)`` (freq bins, frames, channel),
        zero-mean/unit-variance, or ``None`` if the file could not be
        processed.
    """
    try:
        # Load the audio at its native sample rate (sr=None keeps it unchanged).
        audio, sample_rate = librosa.load(file_name, sr=None)
        # Magnitude spectrogram; n_fft=512 yields 257 frequency bins.
        spectrogram = np.abs(librosa.stft(audio, n_fft=512, hop_length=256))
        # Convert to log (dB) scale, as audio models typically expect.
        log_spectrogram = librosa.amplitude_to_db(spectrogram)
        # Pad/trim to the exact input size the model was trained on.
        log_spectrogram = librosa.util.fix_length(log_spectrogram, size=257, axis=0)
        log_spectrogram = librosa.util.fix_length(log_spectrogram, size=97, axis=1)
        # Standardize. Guard against zero variance (e.g. pure silence or a
        # constant signal): the original code divided by np.std unconditionally,
        # which produces an all-NaN array and silently corrupts the prediction.
        mean = np.mean(log_spectrogram)
        std = np.std(log_spectrogram)
        log_spectrogram = (log_spectrogram - mean) / (std if std > 0 else 1.0)
        # Add a trailing channel axis for the convolutional network.
        log_spectrogram = log_spectrogram[..., np.newaxis]
    except Exception as e:
        print(f"Error encountered while parsing file: {file_name}")
        print(e)
        return None
    return log_spectrogram
def classify_audio(audio_file):
    """Classify an uploaded audio file into one of the known command labels.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the audio file (``gr.Audio(type="filepath")``
        passes a path string, so no temp-file handling is needed).

    Returns
    -------
    str
        The predicted label, or an error message if feature extraction failed.
    """
    print(f"Tipo de audio_file: {type(audio_file)}")
    file_path = audio_file
    # Extract the model's input features.
    features = extract_features(file_path)
    if features is None:
        return "Error al procesar el audio"
    # Add the batch dimension. extract_features returns (257, 97, 1)
    # (freq bins, frames, channel), so the batched shape is (1, 257, 97, 1)
    # — the original comment claimed (1, 97, 257, 1), which was wrong.
    features = features[np.newaxis, ...]
    # Load the model once and cache it on the function object, instead of
    # re-reading 'my_model.h5' from disk on every single request.
    if getattr(classify_audio, "_model", None) is None:
        classify_audio._model = tf.keras.models.load_model('my_model.h5', compile=False)
    model = classify_audio._model
    # Force CPU inference.
    with tf.device('/CPU:0'):
        prediction = model.predict(features)
    predicted_label_index = np.argmax(prediction)
    predicted_label = labels[predicted_label_index]
    return predicted_label
# Gradio UI: a single audio-file upload mapped to a text label output.
# (type="filepath" makes Gradio hand classify_audio a path string.)
interface_config = {
    "fn": classify_audio,
    "inputs": gr.Audio(type="filepath"),
    "outputs": "text",
    "title": "Clasificación de audio simple",
    "description": "Sube un archivo de audio para clasificarlo.",
}
iface = gr.Interface(**interface_config)

# Start the local web server.
iface.launch()