File size: 2,481 Bytes
160b238 8d5fb9b 160b238 b8e2f30 511cd9e b8e2f30 c1d7af5 729b487 8d5fb9b 729b487 8d5fb9b 729b487 8d5fb9b 729b487 8d5fb9b 729b487 8d5fb9b 729b487 c1d7af5 bc7c876 8d5fb9b c1d7af5 8d5fb9b 729b487 160b238 8d5fb9b 160b238 8d5fb9b bc7c876 c1d7af5 8d5fb9b 160b238 8d5fb9b cf08fbc 21e2400 8d5fb9b 160b238 8d5fb9b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import gradio as gr
import tensorflow as tf
import librosa
import numpy as np
import tempfile
# Class names for the 30 Google Speech Commands keywords, in the exact order
# of the model's output units (index i of the softmax maps to labels[i]).
labels = (
    "stop six sheila marvin go tree seven five two off "
    "bird zero dog house one right nine on yes wow four "
    "three up no left cat down bed eight happy"
).split()
def extract_features(file_name):
    """Load an audio file and convert it to a normalized log-mel spectrogram.

    Parameters
    ----------
    file_name : str
        Path to an audio file readable by ``librosa.load``.

    Returns
    -------
    numpy.ndarray or None
        Array of shape (257, 97, 1): 257 mel bands x 97 frames x 1 channel,
        zero-mean/unit-variance normalized. Returns ``None`` if the file
        cannot be processed (the error is printed, not raised).
    """
    try:
        # Resample to 16 kHz, the rate the model expects.
        audio, sample_rate = librosa.load(file_name, sr=16000)
        # Mel spectrogram with 257 mel bands.
        mel_spectrogram = librosa.feature.melspectrogram(
            y=audio,
            sr=sample_rate,
            n_mels=257,
            n_fft=512,
            hop_length=256,
        )
        # Convert power to decibel (log) scale.
        log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
        # Pad/truncate to the exact input size the network was trained on.
        log_mel_spectrogram = librosa.util.fix_length(log_mel_spectrogram, size=257, axis=0)
        log_mel_spectrogram = librosa.util.fix_length(log_mel_spectrogram, size=97, axis=1)
        # Zero-mean / unit-variance normalization. Guard against a zero
        # standard deviation (e.g. pure silence yields a constant
        # spectrogram), which would otherwise divide by zero and fill the
        # array with NaNs.
        mean = np.mean(log_mel_spectrogram)
        std = np.std(log_mel_spectrogram)
        log_mel_spectrogram = (log_mel_spectrogram - mean) / (std if std > 0 else 1.0)
        # Add a trailing channel axis for the CNN: (257, 97) -> (257, 97, 1).
        log_mel_spectrogram = log_mel_spectrogram[..., np.newaxis]
    except Exception as e:
        print(f"Error encountered while parsing file: {file_name}")
        print(e)
        return None
    return log_mel_spectrogram
def classify_audio(audio_file):
    """Classify an uploaded audio file into one of the 30 keyword labels.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the audio clip (Gradio passes a path because the
        input component uses ``type="filepath"``).

    Returns
    -------
    str
        The predicted keyword, or an error message if feature extraction
        failed.
    """
    features = extract_features(audio_file)
    if features is None:
        return "Error al procesar el audio"
    # Add the batch dimension: (257, 97, 1) -> (1, 257, 97, 1).
    features = features[np.newaxis, ...]
    # Lazy-load the Keras model once and cache it on the function object —
    # reloading the .h5 from disk on every request is very slow.
    model = getattr(classify_audio, "_model", None)
    if model is None:
        model = tf.keras.models.load_model('my_model.h5', compile=False)
        classify_audio._model = model
    # Run inference on CPU.
    with tf.device('/CPU:0'):
        prediction = model.predict(features)
    predicted_label_index = int(np.argmax(prediction))
    return labels[predicted_label_index]
# Gradio UI: a single audio-file input wired to classify_audio, with the
# predicted label shown as plain text.
_ui_config = {
    "fn": classify_audio,
    "inputs": gr.Audio(type="filepath"),
    "outputs": "text",
    "title": "Clasificación de audio simple",
    "description": "Sube un archivo de audio para clasificarlo.",
}
iface = gr.Interface(**_ui_config)
iface.launch()
|