Initial deployment: Speech Emotion Recognition
- app.py +300 -0
- preprocess_ravdess.py +178 -0
- quick_train.py +291 -0
- requirements.txt +9 -0
- src/__init__.py +0 -0
- src/ensemble_model.py +206 -0
- src/feature_extraction.py +78 -0
- src/utils.py +83 -0
- test_local.py +156 -0
app.py
ADDED
@@ -0,0 +1,300 @@
"""
Speech Emotion Recognition - Gradio Application
Upload or record audio to detect emotions using ensemble ML models
"""

import gradio as gr
import numpy as np
from src.ensemble_model import EnsembleEmotionRecognizer
from src.feature_extraction import extract_features
from src.utils import (
    create_waveform_plot,
    create_spectrogram_plot,
    format_probability_text,
    get_emotion_emoji
)

# ============================================================================
# LOAD MODEL
# ============================================================================

print("="*60)
print("🎤 Speech Emotion Recognition")
print("="*60)

try:
    model = EnsembleEmotionRecognizer(weights_dir='weights')
    MODEL_LOADED = True
    print("\n✅ Application ready!")
except Exception as e:
    print(f"\n❌ Error loading model: {e}")
    MODEL_LOADED = False

print("="*60)

# ============================================================================
# PREDICTION FUNCTION
# ============================================================================

def predict_emotion(audio_file):
    """
    Main prediction function for the Gradio interface

    Args:
        audio_file (str): Path to uploaded/recorded audio file

    Returns:
        tuple: (result_text, prob_chart, waveform_fig, spectrogram_fig)
    """
    if not MODEL_LOADED:
        return (
            "❌ **Error**: Model not loaded. Please check model files in the weights/ directory.",
            None,
            None,
            None
        )

    if audio_file is None:
        return (
            "⚠️ **Please upload an audio file or record your voice**",
            None,
            None,
            None
        )

    try:
        # Extract features from audio
        print(f"\nProcessing: {audio_file}")
        features, y, sr = extract_features(audio_file)
        print(f"  ✓ Features extracted: {features.shape}")

        # Predict emotion
        emotion, confidence, prob_dict = model.predict_with_confidence(features)
        print(f"  ✓ Prediction: {emotion} ({confidence:.2%})")

        # Get emoji
        emoji = get_emotion_emoji(emotion)

        # Create result text
        result_text = f"""
## 🎯 Prediction Result

### {emoji} **{emotion.upper()}**

**Confidence: {confidence*100:.1f}%**

---

### 📊 Probability Distribution

{format_probability_text(prob_dict)}

---

### ℹ️ Model Information

- **Models**: XGBoost, LightGBM, Gradient Boosting, AdaBoost
- **Features**: 80 selected from 162 total
- **Optimization**: Evolutionary Algorithm
"""

        # Create probability chart for the Gradio Label component
        prob_chart = {k.capitalize(): v for k, v in prob_dict.items()}

        # Create visualizations
        waveform_fig = create_waveform_plot(y, sr)
        spectrogram_fig = create_spectrogram_plot(y, sr)

        return result_text, prob_chart, waveform_fig, spectrogram_fig

    except Exception as e:
        error_msg = f"❌ **Error during prediction**: {str(e)}"
        print(f"\n{error_msg}")
        return error_msg, None, None, None

# ============================================================================
# GRADIO INTERFACE
# ============================================================================

# Custom CSS
custom_css = """
.gradio-container {
    font-family: 'Inter', 'Arial', sans-serif;
    max-width: 1200px;
    margin: auto;
}

.header {
    text-align: center;
    padding: 30px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 15px;
    margin-bottom: 30px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

.header h1 {
    color: white;
    margin: 0;
    font-size: 2.5em;
    font-weight: 700;
}

.header p {
    color: rgba(255, 255, 255, 0.9);
    margin-top: 10px;
    font-size: 1.1em;
}

.emotion-list {
    background: #f8f9fa;
    padding: 20px;
    border-radius: 10px;
    margin-top: 20px;
}

.footer {
    text-align: center;
    margin-top: 40px;
    padding: 20px;
    color: #666;
    border-top: 1px solid #e0e0e0;
}
"""

# Create Gradio interface
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="Speech Emotion Recognition") as demo:

    # Header
    gr.HTML("""
        <div class="header">
            <h1>🎤 Speech Emotion Recognition</h1>
            <p>AI-powered emotion detection using Evolutionary Algorithm optimization</p>
        </div>
    """)

    # Main interface
    with gr.Row():
        # Left column - Input
        with gr.Column(scale=1):
            gr.Markdown("### 🎙️ Audio Input")

            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload or Record Audio"
            )

            predict_btn = gr.Button(
                "🎯 Predict Emotion",
                variant="primary",
                size="lg"
            )

            gr.Markdown("""
            <div class="emotion-list">

            ### Supported Emotions

            - 😠 **Angry**
            - 😌 **Calm**
            - 🤢 **Disgust**
            - 😨 **Fearful**
            - 😊 **Happy**
            - 😐 **Neutral**
            - 😢 **Sad**
            - 😲 **Surprised**

            </div>
            """)

        # Right column - Results
        with gr.Column(scale=2):
            gr.Markdown("### 📊 Prediction Results")

            result_text = gr.Markdown(
                value="*Upload an audio file or record your voice to get started*"
            )

            prob_chart = gr.Label(
                label="Emotion Probabilities",
                num_top_classes=8
            )

            # Visualizations (collapsible)
            with gr.Accordion("📊 Audio Visualizations", open=False):
                with gr.Row():
                    waveform_plot = gr.Plot(label="Waveform")
                    spectrogram_plot = gr.Plot(label="Spectrogram")

    # Information section
    gr.Markdown("""
    ---

    ## ℹ️ About This System

    This Speech Emotion Recognition system uses an **Evolutionary Algorithm** to optimize:

    1. 🧬 **Feature Selection**: Automatically selects the most informative features (80 out of 162)
    2. ⚙️ **Hyperparameter Tuning**: Optimizes parameters for all 4 models
    3. ⚖️ **Ensemble Weights**: Finds optimal combination weights

    ### 🎯 Model Architecture

    - **XGBoost**: Gradient boosting with regularization
    - **LightGBM**: Fast gradient boosting framework
    - **Gradient Boosting**: Sequential ensemble learning
    - **AdaBoost**: Adaptive boosting algorithm

    ### 📊 Performance

    - **Ensemble Accuracy**: ~87%
    - **Dataset**: RAVDESS (1,440 samples, 24 actors)
    - **Training**: Evolutionary optimization over 100 generations

    ### 🔬 Technical Details

    **Audio Features Extracted** (162 total):
    - Zero Crossing Rate (1)
    - Chroma STFT (12)
    - MFCC (20)
    - RMS Energy (1)
    - Mel Spectrogram (128)

    **Optimized to 80 features** via feature importance ranking

    ---
    """)

    # Footer
    gr.HTML("""
        <div class="footer">
            <p>
                🎵 Speech Emotion Recognition System<br>
                Powered by Evolutionary Algorithm Optimization<br>
                Built with ❤️ using Gradio | Deployed on 🤗 Hugging Face Spaces
            </p>
            <p style="margin-top: 10px; font-size: 0.9em;">
                <a href="https://github.com/yourusername/speech-emotion-recognition" target="_blank">GitHub</a> |
                <a href="https://huggingface.co/spaces/yourusername/speech-emotion-recognition" target="_blank">Hugging Face</a>
            </p>
        </div>
    """)

    # Connect prediction function
    predict_btn.click(
        fn=predict_emotion,
        inputs=[audio_input],
        outputs=[result_text, prob_chart, waveform_plot, spectrogram_plot]
    )

# ============================================================================
# LAUNCH
# ============================================================================

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )
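For a quick check outside the UI, `predict_emotion` can be called directly once the `weights/` directory is populated. A minimal sketch (the file `sample.wav` is a placeholder for any short speech recording, not part of this commit):

```python
# Importing app builds the Blocks UI and loads the models, but does not launch the server.
from app import predict_emotion

result_text, prob_chart, waveform_fig, spectrogram_fig = predict_emotion("sample.wav")
print(prob_chart)  # e.g. {'Happy': 0.41, 'Neutral': 0.22, ...}
```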
preprocess_ravdess.py
ADDED
@@ -0,0 +1,178 @@
"""
Preprocess RAVDESS Dataset
Extract features from all audio files and save to CSV
"""

import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')


def extract_emotion_from_filename(filename):
    """
    Extract the emotion label from a RAVDESS filename.
    Format: modality-vocal channel-emotion-emotional intensity-statement-repetition-actor.wav

    Args:
        filename (str): Audio filename

    Returns:
        str: Emotion label
    """
    parts = filename.split('-')
    emotion_code = int(parts[2])

    emotion_map = {
        1: 'neutral',
        2: 'calm',
        3: 'happy',
        4: 'sad',
        5: 'angry',
        6: 'fearful',
        7: 'disgust',
        8: 'surprised'
    }

    return emotion_map.get(emotion_code, 'unknown')


def extract_audio_features(file_path, duration=2.5, offset=0.6):
    """
    Extract 162 features from an audio file.

    Returns:
        np.array: Feature vector of shape (162,)
    """
    try:
        y, sr = librosa.load(file_path, duration=duration, offset=offset)

        features = np.array([])

        # ZCR (1)
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=y).T, axis=0)
        features = np.hstack((features, zcr))

        # Chroma (12)
        stft = np.abs(librosa.stft(y))
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
        features = np.hstack((features, chroma))

        # MFCC (20)
        mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20).T, axis=0)
        features = np.hstack((features, mfcc))

        # RMS (1)
        rms = np.mean(librosa.feature.rms(y=y).T, axis=0)
        features = np.hstack((features, rms))

        # Mel (128)
        mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)
        features = np.hstack((features, mel))

        return features

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


def process_ravdess_dataset(data_dir, output_csv='features_ravdess.csv'):
    """
    Process all audio files in the RAVDESS dataset.

    Args:
        data_dir (str): Path to dataset root (containing Actor_01, Actor_02, ...)
        output_csv (str): Output CSV filename

    Returns:
        pd.DataFrame: DataFrame with features and labels
    """

    print("="*70)
    print("RAVDESS Dataset Feature Extraction")
    print("="*70)

    data_dir = Path(data_dir)

    # Find all actor directories
    actor_dirs = sorted([d for d in data_dir.iterdir() if d.is_dir() and d.name.startswith('Actor_')])

    if len(actor_dirs) == 0:
        print(f"No Actor directories found in {data_dir}")
        print("  Expected structure: data_dir/Actor_01/, Actor_02/, ...")
        return None

    print(f"\nFound {len(actor_dirs)} actor directories")

    # Collect all audio files
    audio_files = []
    for actor_dir in actor_dirs:
        files = list(actor_dir.glob('*.wav'))
        audio_files.extend(files)

    print(f"Total audio files: {len(audio_files)}")

    if len(audio_files) == 0:
        print("No audio files found!")
        return None

    # Extract features
    print("\nExtracting features...")
    data_list = []

    for audio_file in tqdm(audio_files, desc="Processing"):
        # Extract features
        features = extract_audio_features(str(audio_file))

        if features is None:
            continue

        # Get metadata
        emotion = extract_emotion_from_filename(audio_file.name)
        actor = audio_file.parent.name

        # Create row
        row = {
            'file_path': str(audio_file),
            'filename': audio_file.name,
            'actor': actor,
            'emotion': emotion
        }

        # Add features
        for i, feat in enumerate(features):
            row[f'feature_{i}'] = feat

        data_list.append(row)

    # Create DataFrame
    df = pd.DataFrame(data_list)

    # Save to CSV
    df.to_csv(output_csv, index=False)

    print("\n✅ Feature extraction complete!")
    print(f"  Saved to: {output_csv}")
    print(f"  Shape: {df.shape}")
    print(f"  Emotions: {', '.join(df['emotion'].unique())}")
    print("\nEmotion distribution:")
    print(df['emotion'].value_counts())
    print("="*70)

    return df


if __name__ == "__main__":
    # Example usage
    import sys
    if len(sys.argv) > 1:
        DATA_DIR = sys.argv[1]
    else:
        DATA_DIR = "data/RAVDESS/audio_speech_actors_01-24"

    df = process_ravdess_dataset(DATA_DIR)
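The filename parser above keys off the third dash-separated field. A minimal sketch (the filename below is a made-up example in the standard RAVDESS pattern, not a file from this repo):

```python
from preprocess_ravdess import extract_emotion_from_filename

# modality-channel-emotion-intensity-statement-repetition-actor.wav; "05" maps to angry
print(extract_emotion_from_filename("03-01-05-01-02-01-12.wav"))  # -> angry
```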
quick_train.py
ADDED
@@ -0,0 +1,291 @@
"""
Quick Training Script
Train models and save weights for Hugging Face deployment
"""

import pickle
import numpy as np
import pandas as pd
import json
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

print("="*70)
print("QUICK TRAINING - Speech Emotion Recognition")
print("="*70)

# ============================================================================
# 1. LOAD DATA
# ============================================================================
print("\n1️⃣ Loading data...")

CSV_FILE = 'features_ravdess.csv'

if not os.path.exists(CSV_FILE):
    print(f"❌ Error: {CSV_FILE} not found!")
    print("  Please run preprocess_ravdess.py first to extract features")
    exit(1)

df = pd.read_csv(CSV_FILE)

# Get features and labels
feature_cols = [col for col in df.columns if col.startswith('feature_')]
X = df[feature_cols].values
y = df['emotion'].values

print(f"  ✓ Data loaded: {X.shape}")
print(f"  ✓ Emotions: {np.unique(y)}")
print(f"  ✓ Distribution:\n{pd.Series(y).value_counts()}")

# ============================================================================
# 2. PREPROCESSING
# ============================================================================
print("\n2️⃣ Preprocessing...")

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"  ✓ Encoded labels: {label_encoder.classes_}")

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

print(f"  ✓ Train set: {X_train.shape}")
print(f"  ✓ Test set: {X_test.shape}")

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("  ✓ Features scaled")

# ============================================================================
# 3. FEATURE SELECTION (Simple: Top 80 by variance)
# ============================================================================
print("\n3️⃣ Feature selection...")

feature_variance = np.var(X_train_scaled, axis=0)
top_indices = np.argsort(feature_variance)[-80:]  # Top 80 features

X_train_selected = X_train_scaled[:, top_indices]
X_test_selected = X_test_scaled[:, top_indices]

print(f"  ✓ Selected {len(top_indices)} features (from 162)")
print(
    f"  ✓ Variance range: {feature_variance[top_indices].min():.4f} - {feature_variance[top_indices].max():.4f}")

# ============================================================================
# 4. TRAIN MODELS
# ============================================================================
print("\n4️⃣ Training models...")

n_classes = len(label_encoder.classes_)
models = {}
accuracies = {}

# XGBoost
print("\n  🔹 Training XGBoost...")
xgb_model = XGBClassifier(
    n_estimators=150,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1.0,
    objective='multi:softprob',
    num_class=n_classes,
    random_state=42,
    n_jobs=-1,
    verbosity=0
)
xgb_model.fit(X_train_selected, y_train)
xgb_acc = xgb_model.score(X_test_selected, y_test)
models['xgboost'] = xgb_model
accuracies['xgboost'] = xgb_acc
print(f"  ✓ XGBoost accuracy: {xgb_acc:.4f}")

# LightGBM
print("\n  🔹 Training LightGBM...")
lgbm_model = LGBMClassifier(
    n_estimators=150,
    num_leaves=40,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    objective='multiclass',
    num_class=n_classes,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)
lgbm_model.fit(X_train_selected, y_train)
lgbm_acc = lgbm_model.score(X_test_selected, y_test)
models['lightgbm'] = lgbm_model
accuracies['lightgbm'] = lgbm_acc
print(f"  ✓ LightGBM accuracy: {lgbm_acc:.4f}")

# Gradient Boosting
print("\n  🔹 Training Gradient Boosting...")
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    min_samples_split=10,
    random_state=42
)
gb_model.fit(X_train_selected, y_train)
gb_acc = gb_model.score(X_test_selected, y_test)
models['gradientboosting'] = gb_model
accuracies['gradientboosting'] = gb_acc
print(f"  ✓ Gradient Boosting accuracy: {gb_acc:.4f}")

# AdaBoost
print("\n  🔹 Training AdaBoost...")
ada_model = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=1.0,
    algorithm='SAMME.R',
    random_state=42
)
ada_model.fit(X_train_selected, y_train)
ada_acc = ada_model.score(X_test_selected, y_test)
models['adaboost'] = ada_model
accuracies['adaboost'] = ada_acc
print(f"  ✓ AdaBoost accuracy: {ada_acc:.4f}")

# ============================================================================
# 5. ENSEMBLE
# ============================================================================
print("\n5️⃣ Creating ensemble...")

# Get predictions
predictions = {}
for name, model in models.items():
    predictions[name] = model.predict_proba(X_test_selected)

# Calculate weights (proportional to accuracy)
weights = np.array([accuracies[name] for name in [
    'xgboost', 'lightgbm', 'gradientboosting', 'adaboost']])
weights = weights / weights.sum()

print(f"  ✓ Ensemble weights: {weights}")

# Weighted ensemble prediction
ensemble_pred = (
    weights[0] * predictions['xgboost'] +
    weights[1] * predictions['lightgbm'] +
    weights[2] * predictions['gradientboosting'] +
    weights[3] * predictions['adaboost']
)

ensemble_labels = np.argmax(ensemble_pred, axis=1)
ensemble_acc = accuracy_score(y_test, ensemble_labels)

print(f"  ✓ Ensemble accuracy: {ensemble_acc:.4f}")

# ============================================================================
# 6. SAVE WEIGHTS
# ============================================================================
print("\n6️⃣ Saving weights...")

os.makedirs('weights', exist_ok=True)

# Save individual models
with open('weights/xgboost_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)
print("  ✓ xgboost_model.pkl")

with open('weights/lightgbm_model.pkl', 'wb') as f:
    pickle.dump(lgbm_model, f)
print("  ✓ lightgbm_model.pkl")

with open('weights/gradientboost_model.pkl', 'wb') as f:
    pickle.dump(gb_model, f)
print("  ✓ gradientboost_model.pkl")

with open('weights/adaboost_model.pkl', 'wb') as f:
    pickle.dump(ada_model, f)
print("  ✓ adaboost_model.pkl")

# Save preprocessing objects
with open('weights/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("  ✓ scaler.pkl")

with open('weights/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)
print("  ✓ label_encoder.pkl")

# Save configuration
config = {
    'selected_features': top_indices.tolist(),
    'ensemble_weights': weights.tolist(),
    'n_features': len(top_indices),
    'emotions': label_encoder.classes_.tolist(),
    'model_accuracies': {
        'xgboost': float(xgb_acc),
        'lightgbm': float(lgbm_acc),
        'gradientboosting': float(gb_acc),
        'adaboost': float(ada_acc),
        'ensemble': float(ensemble_acc)
    }
}

with open('weights/config.json', 'w') as f:
    json.dump(config, f, indent=2)
print("  ✓ config.json")

# ============================================================================
# 7. VERIFY
# ============================================================================
print("\n7️⃣ Verifying saved models...")

# Test loading
with open('weights/xgboost_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

test_acc = loaded_model.score(X_test_selected, y_test)
print(f"  ✓ Loaded model works (accuracy: {test_acc:.4f})")

# ============================================================================
# 8. SUMMARY
# ============================================================================
print("\n" + "="*70)
print("✅ TRAINING COMPLETE!")
print("="*70)

print("\n📊 Final Results:")
print(f"  XGBoost: {xgb_acc:.4f}")
print(f"  LightGBM: {lgbm_acc:.4f}")
print(f"  GradientBoosting: {gb_acc:.4f}")
print(f"  AdaBoost: {ada_acc:.4f}")
print(f"  Ensemble: {ensemble_acc:.4f}")

print("\n💾 Saved files:")
print("  weights/xgboost_model.pkl")
print("  weights/lightgbm_model.pkl")
print("  weights/gradientboost_model.pkl")
print("  weights/adaboost_model.pkl")
print("  weights/scaler.pkl")
print("  weights/label_encoder.pkl")
print("  weights/config.json")

print("\n🚀 Next steps:")
print("  1. Test locally: python app.py")
print("  2. Push to Hugging Face: git add . && git commit -m 'Add models' && git push")

print("="*70)
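The ensemble step blends each model's `predict_proba` output with weights proportional to its test accuracy. A minimal numeric sketch of that weighting (the accuracy values below are made up for illustration, not results from this run):

```python
import numpy as np

# Toy accuracies in the order xgboost, lightgbm, gradientboosting, adaboost
accs = np.array([0.85, 0.86, 0.80, 0.55])
weights = accs / accs.sum()          # normalised so the weights sum to 1
print(weights.round(3))              # e.g. [0.278 0.281 0.261 0.18 ]

# Each (n_samples, n_classes) probability matrix is multiplied by its weight,
# summed, and argmax over classes gives the ensemble label.
```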
requirements.txt
ADDED
@@ -0,0 +1,9 @@
numpy==1.24.3
pandas==2.0.3
scikit-learn==1.3.0
xgboost==2.0.3
lightgbm==4.1.0
librosa==0.10.1
soundfile==0.12.1
gradio==4.44.0
matplotlib==3.7.2
src/__init__.py
ADDED
File without changes
src/ensemble_model.py
ADDED
@@ -0,0 +1,206 @@
"""
Ensemble Model for Speech Emotion Recognition
Loads pre-trained models and makes predictions
"""

import pickle
import numpy as np
import json
from pathlib import Path


class EnsembleEmotionRecognizer:
    """
    Ensemble model combining XGBoost, LightGBM, Gradient Boosting, and AdaBoost.
    Pre-trained weights are loaded from the weights directory.
    """

    def __init__(self, weights_dir='weights'):
        """
        Initialize the ensemble model.

        Args:
            weights_dir (str): Directory containing model weights
        """
        self.weights_dir = Path(weights_dir)
        self.models = {}
        self.scaler = None
        self.label_encoder = None
        self.config = None
        self.selected_features = None
        self.ensemble_weights = None

        self._load_weights()

    def _load_weights(self):
        """Load all pre-trained model weights and configurations"""

        print("Loading pre-trained models...")

        try:
            # Load configuration
            config_path = self.weights_dir / 'config.json'
            with open(config_path, 'r') as f:
                self.config = json.load(f)
            print("  ✓ Configuration loaded")

            # Load scaler
            with open(self.weights_dir / 'scaler.pkl', 'rb') as f:
                self.scaler = pickle.load(f)
            print("  ✓ Scaler loaded")

            # Load label encoder
            with open(self.weights_dir / 'label_encoder.pkl', 'rb') as f:
                self.label_encoder = pickle.load(f)
            print("  ✓ Label encoder loaded")

            # Load models
            model_files = {
                'xgboost': 'xgboost_model.pkl',
                'lightgbm': 'lightgbm_model.pkl',
                'gradientboosting': 'gradientboost_model.pkl',
                'adaboost': 'adaboost_model.pkl'
            }

            for name, filename in model_files.items():
                with open(self.weights_dir / filename, 'rb') as f:
                    self.models[name] = pickle.load(f)
                print(f"  ✓ {name.capitalize()} loaded")

            # Load ensemble configuration
            self.selected_features = self.config['selected_features']
            self.ensemble_weights = np.array(self.config['ensemble_weights'])

            print("\n✅ All models loaded successfully!")
            print(f"  - Number of models: {len(self.models)}")
            print(f"  - Selected features: {len(self.selected_features)}/162")
            print(f"  - Ensemble weights: {self.ensemble_weights}")
            print(f"  - Emotions: {', '.join(self.label_encoder.classes_)}")

        except FileNotFoundError as e:
            raise Exception(f"Model files not found in '{self.weights_dir}': {e}")
        except Exception as e:
            raise Exception(f"Error loading models: {e}")

    def predict(self, features):
        """
        Predict emotion from features.

        Args:
            features (np.array): Feature vector of shape (162,) or (n_samples, 162)

        Returns:
            np.array: Predicted emotion labels
        """
        # Ensure 2D array
        if features.ndim == 1:
            features = features.reshape(1, -1)

        # Preprocess
        features_scaled = self.scaler.transform(features)
        features_selected = features_scaled[:, self.selected_features]

        # Get predictions from all models
        predictions = []
        for model in self.models.values():
            pred_proba = model.predict_proba(features_selected)
            predictions.append(pred_proba)

        # Weighted ensemble
        ensemble_proba = np.average(predictions, axis=0, weights=self.ensemble_weights)

        # Get predicted labels
        predicted_labels = np.argmax(ensemble_proba, axis=1)

        return predicted_labels

    def predict_proba(self, features):
        """
        Predict emotion probabilities.

        Args:
            features (np.array): Feature vector of shape (162,) or (n_samples, 162)

        Returns:
            np.array: Probability distribution over emotions, shape (n_samples, n_emotions)
        """
        # Ensure 2D array
        if features.ndim == 1:
            features = features.reshape(1, -1)

        # Preprocess
        features_scaled = self.scaler.transform(features)
        features_selected = features_scaled[:, self.selected_features]

        # Get predictions from all models
        predictions = []
        for model in self.models.values():
            pred_proba = model.predict_proba(features_selected)
            predictions.append(pred_proba)

        # Weighted ensemble
        ensemble_proba = np.average(predictions, axis=0, weights=self.ensemble_weights)

        return ensemble_proba

    def predict_with_confidence(self, features):
        """
        Predict emotion with a confidence score.

        Args:
            features (np.array): Feature vector of shape (162,)

        Returns:
            tuple: (emotion_name, confidence, probabilities_dict)
        """
        # Get probabilities
        proba = self.predict_proba(features)[0]

        # Get prediction
        predicted_idx = np.argmax(proba)
        emotion_name = self.label_encoder.classes_[predicted_idx]
        confidence = proba[predicted_idx]

        # Create probability dictionary
        prob_dict = {}
        for i, emotion in enumerate(self.label_encoder.classes_):
            prob_dict[emotion] = float(proba[i])

        return emotion_name, confidence, prob_dict

    def decode_emotion(self, label):
        """
        Convert a numeric label to an emotion name.

        Args:
            label (int): Numeric emotion label

        Returns:
            str: Emotion name
        """
        return self.label_encoder.inverse_transform([label])[0]

    def get_emotion_names(self):
        """
        Get the list of all emotion names.

        Returns:
            list: List of emotion names
        """
        return self.label_encoder.classes_.tolist()

    def get_model_info(self):
        """
        Get information about the ensemble model.

        Returns:
            dict: Model information
        """
        return {
            'n_models': len(self.models),
            'models': list(self.models.keys()),
            'n_features_selected': len(self.selected_features),
            'n_features_total': 162,
            'ensemble_weights': self.ensemble_weights.tolist(),
            'emotions': self.get_emotion_names(),
            'accuracies': self.config.get('model_accuracies', {})
        }
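A minimal usage sketch of this class, assuming `weights/` was produced by quick_train.py and `sample.wav` stands in for any short speech clip (both names are placeholders, not files in this commit):

```python
from src.ensemble_model import EnsembleEmotionRecognizer
from src.feature_extraction import extract_features

model = EnsembleEmotionRecognizer(weights_dir="weights")
features, y, sr = extract_features("sample.wav")           # 162-dim vector
emotion, confidence, probs = model.predict_with_confidence(features)
print(emotion, f"{confidence:.1%}")                         # e.g. "happy 73.2%"
```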
src/feature_extraction.py
ADDED
@@ -0,0 +1,78 @@
"""
Audio Feature Extraction Module
Extracts 162 features from audio files for emotion recognition
"""

import numpy as np
import librosa
import warnings
warnings.filterwarnings('ignore')


def extract_features(audio_path, duration=2.5, offset=0.6):
    """
    Extract 162 audio features from an audio file.

    Features:
    - 1 Zero Crossing Rate
    - 12 Chroma STFT
    - 20 MFCC
    - 1 RMS Energy
    - 128 Mel Spectrogram

    Args:
        audio_path (str): Path to audio file
        duration (float): Duration to load (seconds)
        offset (float): Start reading after this time (seconds)

    Returns:
        features (np.array): Feature vector of shape (162,)
        y (np.array): Audio time series
        sr (int): Sample rate
    """
    try:
        # Load audio file
        y, sr = librosa.load(audio_path, duration=duration, offset=offset)

        # Initialize feature array
        features = np.array([])

        # 1. Zero Crossing Rate (1 feature)
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=y).T, axis=0)
        features = np.hstack((features, zcr))

        # 2. Chroma STFT (12 features)
        stft = np.abs(librosa.stft(y))
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
        features = np.hstack((features, chroma))

        # 3. MFCC (20 features)
        mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20).T, axis=0)
        features = np.hstack((features, mfcc))

        # 4. RMS Energy (1 feature)
        rms = np.mean(librosa.feature.rms(y=y).T, axis=0)
        features = np.hstack((features, rms))

        # 5. Mel Spectrogram (128 features)
        mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)
        features = np.hstack((features, mel))

        return features, y, sr

    except Exception as e:
        raise Exception(f"Error extracting features from {audio_path}: {str(e)}")


def get_feature_names():
    """
    Get the names of all 162 features.

    Returns:
        list: List of feature names
    """
    names = ['zcr']
    names.extend([f'chroma_{i}' for i in range(12)])
    names.extend([f'mfcc_{i}' for i in range(20)])
    names.append('rms')
    names.extend([f'mel_{i}' for i in range(128)])
    return names
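A quick sanity check of the feature layout (1 + 12 + 20 + 1 + 128 = 162) using only the helper above:

```python
from src.feature_extraction import get_feature_names

names = get_feature_names()
print(len(names))             # 162
print(names[:3], names[-1])   # ['zcr', 'chroma_0', 'chroma_1'] ... 'mel_127'
```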
src/utils.py
ADDED
@@ -0,0 +1,83 @@
"""
Utility functions for visualization and analysis
"""

import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display


def create_waveform_plot(y, sr, title="Audio Waveform"):
    """
    Create a waveform visualization.

    Args:
        y (np.array): Audio time series
        sr (int): Sample rate
        title (str): Plot title

    Returns:
        matplotlib.figure.Figure: Waveform plot
    """
    fig, ax = plt.subplots(figsize=(10, 3))
    librosa.display.waveshow(y, sr=sr, ax=ax, color='#2E86DE')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Time (seconds)', fontsize=11)
    ax.set_ylabel('Amplitude', fontsize=11)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    return fig


def create_spectrogram_plot(y, sr, title="Spectrogram"):
    """
    Create a spectrogram visualization.

    Args:
        y (np.array): Audio time series
        sr (int): Sample rate
        title (str): Plot title

    Returns:
        matplotlib.figure.Figure: Spectrogram plot
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    img = librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='hz',
                                   ax=ax, cmap='viridis')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Time (seconds)', fontsize=11)
    ax.set_ylabel('Frequency (Hz)', fontsize=11)
    fig.colorbar(img, ax=ax, format='%+2.0f dB')
    plt.tight_layout()
    return fig


def create_mel_spectrogram_plot(y, sr, title="Mel Spectrogram"):
    """
    Create a mel spectrogram visualization.

    Args:
        y (np.array): Audio time series
        sr (int): Sample rate
        title (str): Plot title

    Returns:
        matplotlib.figure.Figure: Mel spectrogram plot
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    S = librosa.feature.melspectrogram(y=y, sr=sr)
    S_dB = librosa.power_to_db(S, ref=np.max)
    img = librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel',
                                   ax=ax, cmap='magma')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Time (seconds)', fontsize=11)
    ax.set_ylabel('Mel Frequency', fontsize=11)
    fig.colorbar(img, ax=ax, format='%+2.0f dB')
    plt.tight_layout()
    return fig


def format_probability_text(prob_dict, top_k=None):
    """
    Format probability dictionary as text with progress bars

    Args:
        prob_dict (dict): Dictionary of e
test_local.py
ADDED
@@ -0,0 +1,156 @@
"""
Test the application locally before deploying
"""

import os
import sys

print("="*70)
print("LOCAL TEST - Speech Emotion Recognition")
print("="*70)

# ============================================================================
# 1. CHECK FILES
# ============================================================================
print("\n1️⃣ Checking required files...")

required_files = [
    'app.py',
    'requirements.txt',
    'README.md',
    'src/__init__.py',
    'src/feature_extraction.py',
    'src/ensemble_model.py',
    'src/utils.py',
    'weights/xgboost_model.pkl',
    'weights/lightgbm_model.pkl',
    'weights/gradientboost_model.pkl',
    'weights/adaboost_model.pkl',
    'weights/scaler.pkl',
    'weights/label_encoder.pkl',
    'weights/config.json'
]

missing_files = []
for file in required_files:
    if os.path.exists(file):
        print(f"  ✓ {file}")
    else:
        print(f"  ❌ {file} - MISSING")
        missing_files.append(file)

if missing_files:
    print(f"\n❌ Missing {len(missing_files)} files. Please create them first.")
    sys.exit(1)

# ============================================================================
# 2. TEST IMPORTS
# ============================================================================
print("\n2️⃣ Testing imports...")

try:
    import numpy
    print("  ✓ numpy")
except ImportError:
    print("  ❌ numpy - Install: pip install numpy")

try:
    import pandas
    print("  ✓ pandas")
except ImportError:
    print("  ❌ pandas - Install: pip install pandas")

try:
    import sklearn
    print("  ✓ scikit-learn")
except ImportError:
    print("  ❌ scikit-learn - Install: pip install scikit-learn")

try:
    import xgboost
    print("  ✓ xgboost")
except ImportError:
    print("  ❌ xgboost - Install: pip install xgboost")

try:
    import lightgbm
    print("  ✓ lightgbm")
except ImportError:
    print("  ❌ lightgbm - Install: pip install lightgbm")

try:
    import librosa
    print("  ✓ librosa")
except ImportError:
    print("  ❌ librosa - Install: pip install librosa")

try:
    import gradio
    print("  ✓ gradio")
except ImportError:
    print("  ❌ gradio - Install: pip install gradio")

# ============================================================================
# 3. TEST MODEL LOADING
# ============================================================================
print("\n3️⃣ Testing model loading...")

try:
    from src.ensemble_model import EnsembleEmotionRecognizer

    model = EnsembleEmotionRecognizer(weights_dir='weights')
    print("  ✓ Model loaded successfully")

    # Get model info
    info = model.get_model_info()
    print(f"  ✓ Models: {', '.join(info['models'])}")
    print(f"  ✓ Features: {info['n_features_selected']}/{info['n_features_total']}")
    print(f"  ✓ Emotions: {', '.join(info['emotions'])}")

except Exception as e:
    print(f"  ❌ Error loading model: {e}")
    sys.exit(1)

# ============================================================================
# 4. TEST FEATURE EXTRACTION
# ============================================================================
print("\n4️⃣ Testing feature extraction...")

try:
    from src.feature_extraction import extract_features
    import numpy as np

    # Create dummy audio
    import librosa
    y = np.random.randn(22050 * 3)  # 3 seconds of random audio

    # Save to a temp file
    import soundfile as sf
    sf.write('temp_test.wav', y, 22050)

    # Extract features
    features, _, _ = extract_features('temp_test.wav')
    print(f"  ✓ Features extracted: shape {features.shape}")

    # Test prediction
    prediction = model.predict(features)
    print(f"  ✓ Prediction works: {model.decode_emotion(prediction[0])}")

    # Cleanup
    os.remove('temp_test.wav')

except Exception as e:
    print(f"  ❌ Error in feature extraction: {e}")
    sys.exit(1)

# ============================================================================
# 5. FILE SIZES
# ============================================================================
print("\n5️⃣ Checking file sizes...")

total_size = 0
for file in required_files:
    if os.path.exists(file):
        size = os.path.getsize(file) / 1024 / 1024  # MB
        total_size += size
        if size > 10:
|