# speech-emotion-recognition / quick_train.py
# nguyennp86's picture
# Initial deployment: Speech Emotion Recognition
# a344700 verified
"""
Quick Training Script
Train models and save weights for Hugging Face deployment
"""
import pickle
import numpy as np
import pandas as pd
import json
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
# Start-up banner: a title framed by two 70-character rules.
rule = "=" * 70
print(rule, "QUICK TRAINING - Speech Emotion Recognition", rule, sep="\n")
# ============================================================================
# 1. LOAD DATA
# ============================================================================
print("\n1️⃣ Loading data...")

# Feature table expected in the current working directory.
CSV_FILE = 'features_ravdess.csv'

if not os.path.exists(CSV_FILE):
    print(f"❌ Error: {CSV_FILE} not found!")
    print(" Please run preprocess_ravdess.py first to extract features")
    # SystemExit instead of exit(): the latter is injected by the `site`
    # module and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(1)

df = pd.read_csv(CSV_FILE)

# Feature matrix: every column whose name starts with "feature_".
# Labels: the "emotion" column.
feature_cols = [col for col in df.columns if col.startswith('feature_')]
X = df[feature_cols].values
y = df['emotion'].values

print(f" ✓ Data loaded: {X.shape}")
print(f" ✓ Emotions: {np.unique(y)}")
print(f" ✓ Distribution:\n{pd.Series(y).value_counts()}")
# ============================================================================
# 2. PREPROCESSING
# ============================================================================
print("\n2️⃣ Preprocessing...")

# Map the emotion labels to integer class ids. The fitted encoder is kept
# because it is pickled later together with the models.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(f" ✓ Encoded labels: {label_encoder.classes_}")

# Split data (80% train, 20% test), stratified on the encoded labels and
# seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
print(f" ✓ Train set: {X_train.shape}")
print(f" ✓ Test set: {X_test.shape}")

# Scale features: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(" ✓ Features scaled")
# ============================================================================
# 3. FEATURE SELECTION (Simple: Top 80 by variance)
# ============================================================================
print("\n3️⃣ Feature selection...")

# Number of highest-variance features to keep.
N_SELECTED_FEATURES = 80

# Rank on the UNSCALED training features. After StandardScaler every
# non-constant column has variance ~1.0, so ranking the scaled matrix would
# be decided by floating-point noise rather than by the data.
feature_variance = np.var(X_train, axis=0)
top_indices = np.argsort(feature_variance)[-N_SELECTED_FEATURES:]

# The selected column indices are applied to the scaled matrices, which are
# what the models are trained and evaluated on.
X_train_selected = X_train_scaled[:, top_indices]
X_test_selected = X_test_scaled[:, top_indices]

print(
    f" ✓ Selected {len(top_indices)} features (from {X_train_scaled.shape[1]})")
print(
    f" ✓ Variance range: {feature_variance[top_indices].min():.4f} - {feature_variance[top_indices].max():.4f}")
# ============================================================================
# 4. TRAIN MODELS
# ============================================================================
print("\n4️⃣ Training models...")

# Registries filled in by each training step below, keyed by model name:
# fitted estimator and its test-set accuracy.
models, accuracies = {}, {}

# Number of distinct emotion classes seen by the label encoder.
n_classes = len(label_encoder.classes_)
# XGBoost: fit on the selected training features, score on the held-out
# split, and register the model and its accuracy under the key 'xgboost'.
print("\n 🔹 Training XGBoost...")
xgb_model = XGBClassifier(
    n_estimators=150,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1.0,
    objective='multi:softprob',
    num_class=n_classes,
    random_state=42,
    n_jobs=-1,
    verbosity=0,
)
xgb_model.fit(X_train_selected, y_train)
xgb_acc = xgb_model.score(X_test_selected, y_test)
models['xgboost'] = xgb_model
accuracies['xgboost'] = xgb_acc
print(f" ✓ XGBoost accuracy: {xgb_acc:.4f}")
# LightGBM: fit on the selected training features, score on the held-out
# split, and register the model and its accuracy under the key 'lightgbm'.
print("\n 🔹 Training LightGBM...")
lgbm_model = LGBMClassifier(
    n_estimators=150,
    num_leaves=40,
    learning_rate=0.1,
    subsample=0.8,
    # LightGBM ignores `subsample` (bagging_fraction) unless bagging is
    # switched on with a non-zero frequency; resample at every iteration.
    subsample_freq=1,
    colsample_bytree=0.8,
    min_child_samples=20,
    objective='multiclass',
    num_class=n_classes,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
lgbm_model.fit(X_train_selected, y_train)
lgbm_acc = lgbm_model.score(X_test_selected, y_test)
models['lightgbm'] = lgbm_model
accuracies['lightgbm'] = lgbm_acc
print(f" ✓ LightGBM accuracy: {lgbm_acc:.4f}")
# Gradient Boosting (scikit-learn): fit on the selected training features,
# score on the held-out split, and register the model and its accuracy under
# the key 'gradientboosting'.
print("\n 🔹 Training Gradient Boosting...")
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    min_samples_split=10,
    random_state=42,
)
gb_model.fit(X_train_selected, y_train)
gb_acc = gb_model.score(X_test_selected, y_test)
models['gradientboosting'] = gb_model
accuracies['gradientboosting'] = gb_acc
print(f" ✓ Gradient Boosting accuracy: {gb_acc:.4f}")
# AdaBoost: fit on the selected training features, score on the held-out
# split, and register the model and its accuracy under the key 'adaboost'.
print("\n 🔹 Training AdaBoost...")
# `algorithm` is intentionally left at the library default. Naming 'SAMME.R'
# explicitly is deprecated in scikit-learn 1.4 and rejected from 1.6 on;
# older releases still default to 'SAMME.R', so their behaviour is unchanged.
ada_model = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=1.0,
    random_state=42,
)
ada_model.fit(X_train_selected, y_train)
ada_acc = ada_model.score(X_test_selected, y_test)
models['adaboost'] = ada_model
accuracies['adaboost'] = ada_acc
print(f" ✓ AdaBoost accuracy: {ada_acc:.4f}")
# ============================================================================
# 5. ENSEMBLE
# ============================================================================
print("\n5️⃣ Creating ensemble...")

# Fixed model order: position i of `weights` (and of the 'ensemble_weights'
# list written to config.json) belongs to ENSEMBLE_ORDER[i].
ENSEMBLE_ORDER = ['xgboost', 'lightgbm', 'gradientboosting', 'adaboost']

# Class-probability predictions of every trained model on the test split.
predictions = {
    name: model.predict_proba(X_test_selected)
    for name, model in models.items()
}

# Calculate weights (proportional to accuracy), normalised to sum to 1.
# NOTE(review): the weights come from test-set accuracy and the ensemble is
# scored on that same test set, so the reported ensemble accuracy is likely
# optimistic; a separate validation split would avoid this.
raw_weights = np.array([accuracies[name] for name in ENSEMBLE_ORDER])
weight_total = raw_weights.sum()
if weight_total > 0:
    weights = raw_weights / weight_total
else:
    # Every model scored 0: fall back to uniform weights instead of NaNs.
    weights = np.full(len(ENSEMBLE_ORDER), 1.0 / len(ENSEMBLE_ORDER))
print(f" ✓ Ensemble weights: {weights}")

# Weighted ensemble prediction: weighted sum of the per-model probability
# matrices, then the most probable class per sample.
ensemble_pred = sum(
    weight * predictions[name]
    for weight, name in zip(weights, ENSEMBLE_ORDER)
)
ensemble_labels = np.argmax(ensemble_pred, axis=1)
ensemble_acc = accuracy_score(y_test, ensemble_labels)
print(f" ✓ Ensemble accuracy: {ensemble_acc:.4f}")
# ============================================================================
# 6. SAVE WEIGHTS
# ============================================================================
print("\n6️⃣ Saving weights...")

WEIGHTS_DIR = 'weights'
os.makedirs(WEIGHTS_DIR, exist_ok=True)

# Pickled artifacts, in the order they are written: the four individual
# models first, then the preprocessing objects.
pickled_artifacts = {
    'xgboost_model.pkl': xgb_model,
    'lightgbm_model.pkl': lgbm_model,
    'gradientboost_model.pkl': gb_model,
    'adaboost_model.pkl': ada_model,
    'scaler.pkl': scaler,
    'label_encoder.pkl': label_encoder,
}
for filename, artifact in pickled_artifacts.items():
    with open(os.path.join(WEIGHTS_DIR, filename), 'wb') as f:
        pickle.dump(artifact, f)
    print(f" ✓ {filename}")

# Save configuration: selected feature indices, ensemble weights and the
# per-model accuracies. NumPy values are converted to plain Python types
# because json cannot serialise them directly.
config = {
    'selected_features': top_indices.tolist(),
    'ensemble_weights': weights.tolist(),
    'n_features': len(top_indices),
    'emotions': label_encoder.classes_.tolist(),
    'model_accuracies': {
        'xgboost': float(xgb_acc),
        'lightgbm': float(lgbm_acc),
        'gradientboosting': float(gb_acc),
        'adaboost': float(ada_acc),
        'ensemble': float(ensemble_acc),
    },
}
with open(os.path.join(WEIGHTS_DIR, 'config.json'), 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2)
print(" ✓ config.json")
# ============================================================================
# 7. VERIFY
# ============================================================================
print("\n7️⃣ Verifying saved models...")

# Test loading: read one pickled model back and score it on the test split.
# The file was written by this script moments ago, so unpickling it is safe;
# never unpickle files from untrusted sources.
with open('weights/xgboost_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
test_acc = loaded_model.score(X_test_selected, y_test)
print(f" ✓ Loaded model works (accuracy: {test_acc:.4f})")
# ============================================================================
# 8. SUMMARY
# ============================================================================
# Final report: per-model and ensemble accuracy, the files written to the
# weights directory, and suggested follow-up commands.
print("\n" + "="*70)
print("✅ TRAINING COMPLETE!")
print("="*70)

print("\n📊 Final Results:")
print(f" XGBoost: {xgb_acc:.4f}")
print(f" LightGBM: {lgbm_acc:.4f}")
print(f" GradientBoosting: {gb_acc:.4f}")
print(f" AdaBoost: {ada_acc:.4f}")
print(f" Ensemble: {ensemble_acc:.4f} ⭐")

print("\n💾 Saved files:")
print(" weights/xgboost_model.pkl")
print(" weights/lightgbm_model.pkl")
print(" weights/gradientboost_model.pkl")
print(" weights/adaboost_model.pkl")
print(" weights/scaler.pkl")
print(" weights/label_encoder.pkl")
print(" weights/config.json")

print("\n🚀 Next steps:")
print(" 1. Test locally: python app.py")
print(" 2. Push to Hugging Face: git add . && git commit -m 'Add models' && git push")
print("="*70)