Spaces:

nguyennp86
/

speech-emotion-recognition

Sleeping

App Files Files Community

speech-emotion-recognition / quick_train.py

nguyennp86

Initial deployment: Speech Emotion Recognition

a344700 verified 2 months ago

raw

history blame contribute delete

9.2 kB

	"""
	Quick Training Script
	Train models and save weights for Hugging Face deployment
	"""

	import pickle
	import numpy as np
	import pandas as pd
	import json
	import os
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import LabelEncoder, StandardScaler
	from sklearn.metrics import accuracy_score, classification_report
	from xgboost import XGBClassifier
	from lightgbm import LGBMClassifier
	from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

	# Console banner for the training run.
	print("="*70)
	print("QUICK TRAINING - Speech Emotion Recognition")
	print("="*70)

	# ============================================================================
	# 1. LOAD DATA
	# ============================================================================
	print("\n1️⃣ Loading data...")

	# Feature table, looked up relative to the current working directory.
	CSV_FILE = 'features_ravdess.csv'

	if not os.path.exists(CSV_FILE):
	    print(f"❌ Error: {CSV_FILE} not found!")
	    print(" Please run preprocess_ravdess.py first to extract features")
	    # Raise SystemExit directly: the bare `exit` helper is injected by the
	    # `site` module and is absent under `python -S` or in frozen builds.
	    raise SystemExit(1)

	df = pd.read_csv(CSV_FILE)

	# Get features and labels: every 'feature_*' column is a model input and
	# the 'emotion' column is the target.
	feature_cols = [col for col in df.columns if col.startswith('feature_')]

	# Fail early with a readable message instead of an obscure error later on.
	if not feature_cols or 'emotion' not in df.columns:
	    print(f"❌ Error: {CSV_FILE} needs 'feature_*' columns and an 'emotion' column")
	    raise SystemExit(1)

	X = df[feature_cols].values
	y = df['emotion'].values

	print(f" ✓ Data loaded: {X.shape}")
	print(f" ✓ Emotions: {np.unique(y)}")
	print(f" ✓ Distribution:\n{pd.Series(y).value_counts()}")

	# ============================================================================
	# 2. PREPROCESSING
	# ============================================================================
	print("\n2️⃣ Preprocessing...")

	# Turn the emotion labels into integer class ids.
	label_encoder = LabelEncoder()
	y_encoded = label_encoder.fit_transform(y)

	print(f" ✓ Encoded labels: {label_encoder.classes_}")

	# Stratified 80/20 split with a fixed seed so reruns give the same partition.
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
	)

	for split_name, split_data in (("Train", X_train), ("Test", X_test)):
	    print(f" ✓ {split_name} set: {split_data.shape}")

	# Fit the scaler on the training split only, then apply it to both splits.
	scaler = StandardScaler().fit(X_train)
	X_train_scaled = scaler.transform(X_train)
	X_test_scaled = scaler.transform(X_test)

	print(" ✓ Features scaled")

	# ============================================================================
	# 3. FEATURE SELECTION (Simple: Top 80 by variance)
	# ============================================================================
	print("\n3️⃣ Feature selection...")

	# How many of the highest-variance features to keep.
	N_SELECTED_FEATURES = 80

	# Rank on the *unscaled* training features. StandardScaler rescales every
	# non-constant column to unit variance, so the variances of X_train_scaled
	# differ only by floating-point noise and cannot rank features meaningfully.
	feature_variance = np.var(X_train, axis=0)
	n_total_features = feature_variance.shape[0]

	# argsort is ascending, so the tail holds the largest variances; the slice
	# simply keeps every feature when fewer than N_SELECTED_FEATURES exist.
	top_indices = np.argsort(feature_variance)[-N_SELECTED_FEATURES:]

	# The models are still trained on the standardized values of those columns.
	X_train_selected = X_train_scaled[:, top_indices]
	X_test_selected = X_test_scaled[:, top_indices]

	# Report the real column count rather than a hard-coded number.
	print(f" ✓ Selected {len(top_indices)} features (from {n_total_features})")
	print(
	    f" ✓ Variance range: {feature_variance[top_indices].min():.4f} - {feature_variance[top_indices].max():.4f}")

	# ============================================================================
	# 4. TRAIN MODELS
	# ============================================================================
	print("\n4️⃣ Training models...")

	# Registries filled in as each model is trained; both are keyed by model name.
	n_classes = len(label_encoder.classes_)
	models = {}
	accuracies = {}

	# XGBoost
	print("\n 🔹 Training XGBoost...")
	xgb_params = dict(
	    n_estimators=150,
	    max_depth=5,
	    learning_rate=0.1,
	    subsample=0.8,
	    colsample_bytree=0.8,
	    gamma=1.0,
	    objective='multi:softprob',
	    num_class=n_classes,
	    random_state=42,
	    n_jobs=-1,
	    verbosity=0,
	)
	xgb_model = XGBClassifier(**xgb_params)
	xgb_model.fit(X_train_selected, y_train)

	# Accuracy on the held-out split.
	xgb_acc = xgb_model.score(X_test_selected, y_test)
	models['xgboost'], accuracies['xgboost'] = xgb_model, xgb_acc
	print(f" ✓ XGBoost accuracy: {xgb_acc:.4f}")

	# LightGBM
	print("\n 🔹 Training LightGBM...")
	lgbm_model = LGBMClassifier(
	    n_estimators=150,
	    num_leaves=40,
	    learning_rate=0.1,
	    subsample=0.8,
	    # LightGBM only applies `subsample` (bagging_fraction) when the bagging
	    # frequency is non-zero; with the default of 0 the 0.8 above is ignored.
	    subsample_freq=1,
	    colsample_bytree=0.8,
	    min_child_samples=20,
	    objective='multiclass',
	    num_class=n_classes,
	    random_state=42,
	    n_jobs=-1,
	    verbose=-1
	)
	lgbm_model.fit(X_train_selected, y_train)

	# Accuracy on the held-out split, recorded under the 'lightgbm' key.
	lgbm_acc = lgbm_model.score(X_test_selected, y_test)
	models['lightgbm'] = lgbm_model
	accuracies['lightgbm'] = lgbm_acc
	print(f" ✓ LightGBM accuracy: {lgbm_acc:.4f}")

	# Gradient Boosting
	print("\n 🔹 Training Gradient Boosting...")
	gb_params = dict(
	    n_estimators=100,
	    max_depth=4,
	    learning_rate=0.1,
	    subsample=0.8,
	    min_samples_split=10,
	    random_state=42,
	)
	gb_model = GradientBoostingClassifier(**gb_params)
	gb_model.fit(X_train_selected, y_train)

	# Accuracy on the held-out split, recorded under the 'gradientboosting' key.
	gb_acc = gb_model.score(X_test_selected, y_test)
	models['gradientboosting'], accuracies['gradientboosting'] = gb_model, gb_acc
	print(f" ✓ Gradient Boosting accuracy: {gb_acc:.4f}")

	# AdaBoost
	print("\n 🔹 Training AdaBoost...")
	# `algorithm` is deliberately not passed: 'SAMME.R' is deprecated since
	# scikit-learn 1.4 and rejected from 1.6 on, so the installed version's
	# default is used (still SAMME.R on releases that support it).
	ada_model = AdaBoostClassifier(
	    n_estimators=100,
	    learning_rate=1.0,
	    random_state=42
	)
	ada_model.fit(X_train_selected, y_train)

	# Accuracy on the held-out split, recorded under the 'adaboost' key.
	ada_acc = ada_model.score(X_test_selected, y_test)
	models['adaboost'] = ada_model
	accuracies['adaboost'] = ada_acc
	print(f" ✓ AdaBoost accuracy: {ada_acc:.4f}")

	# ============================================================================
	# 5. ENSEMBLE
	# ============================================================================
	print("\n5️⃣ Creating ensemble...")

	# Class-probability matrix of every trained model on the held-out split.
	predictions = {
	    name: model.predict_proba(X_test_selected)
	    for name, model in models.items()
	}

	# Fixed member order; the weights below are stored in this same order.
	member_names = ('xgboost', 'lightgbm', 'gradientboosting', 'adaboost')

	# Each weight is the model's accuracy, normalized so the weights sum to 1.
	# NOTE(review): these accuracies come from the same test split the ensemble
	# is scored on below, so the reported ensemble accuracy is optimistic.
	raw_scores = np.array([accuracies[member] for member in member_names])
	weights = raw_scores / raw_scores.sum()

	print(f" ✓ Ensemble weights: {weights}")

	# Weighted sum of the probability matrices, accumulated in member order.
	ensemble_pred = weights[0] * predictions[member_names[0]]
	for weight, member in zip(weights[1:], member_names[1:]):
	    ensemble_pred = ensemble_pred + weight * predictions[member]

	# The predicted class is the column with the highest blended probability.
	ensemble_labels = np.argmax(ensemble_pred, axis=1)
	ensemble_acc = accuracy_score(y_test, ensemble_labels)

	print(f" ✓ Ensemble accuracy: {ensemble_acc:.4f}")

	# ============================================================================
	# 6. SAVE WEIGHTS
	# ============================================================================
	print("\n6️⃣ Saving weights...")

	os.makedirs('weights', exist_ok=True)

	# Objects to pickle into weights/: the four models first, then the
	# preprocessing objects. Each entry is (file name, object).
	artifacts = [
	    ('xgboost_model.pkl', xgb_model),
	    ('lightgbm_model.pkl', lgbm_model),
	    ('gradientboost_model.pkl', gb_model),
	    ('adaboost_model.pkl', ada_model),
	    ('scaler.pkl', scaler),
	    ('label_encoder.pkl', label_encoder),
	]
	for filename, artifact in artifacts:
	    with open(f'weights/{filename}', 'wb') as f:
	        pickle.dump(artifact, f)
	    print(f" ✓ {filename}")

	# Save configuration: selected feature indices, ensemble weights, label
	# names and the accuracies measured above, all as plain JSON types.
	model_accuracies = {
	    'xgboost': float(xgb_acc),
	    'lightgbm': float(lgbm_acc),
	    'gradientboosting': float(gb_acc),
	    'adaboost': float(ada_acc),
	    'ensemble': float(ensemble_acc),
	}
	config = {
	    'selected_features': top_indices.tolist(),
	    'ensemble_weights': weights.tolist(),
	    'n_features': len(top_indices),
	    'emotions': label_encoder.classes_.tolist(),
	    'model_accuracies': model_accuracies,
	}

	with open('weights/config.json', 'w') as f:
	    json.dump(config, f, indent=2)
	print(" ✓ config.json")

	# ============================================================================
	# 7. VERIFY
	# ============================================================================
	print("\n7️⃣ Verifying saved models...")

	# Round-trip check: reload the XGBoost pickle written above and score it
	# on the held-out split. Only this one model file is checked.
	with open('weights/xgboost_model.pkl', 'rb') as f:
	    loaded_model = pickle.load(f)

	test_acc = loaded_model.score(X_test_selected, y_test)
	print(f" ✓ Loaded model works (accuracy: {test_acc:.4f})")

	# ============================================================================
	# 8. SUMMARY
	# ============================================================================
	banner = "=" * 70

	print("\n" + banner)
	print("✅ TRAINING COMPLETE!")
	print(banner)

	# Per-model accuracies, followed by the ensemble line with its star marker.
	print("\n📊 Final Results:")
	for label, score in (
	    ("XGBoost", xgb_acc),
	    ("LightGBM", lgbm_acc),
	    ("GradientBoosting", gb_acc),
	    ("AdaBoost", ada_acc),
	):
	    print(f" {label}: {score:.4f}")
	print(f" Ensemble: {ensemble_acc:.4f} ⭐")

	# Files written by the save step.
	print("\n💾 Saved files:")
	for saved_path in (
	    "weights/xgboost_model.pkl",
	    "weights/lightgbm_model.pkl",
	    "weights/gradientboost_model.pkl",
	    "weights/adaboost_model.pkl",
	    "weights/scaler.pkl",
	    "weights/label_encoder.pkl",
	    "weights/config.json",
	):
	    print(f" {saved_path}")

	print("\n🚀 Next steps:")
	print(" 1. Test locally: python app.py")
	print(" 2. Push to Hugging Face: git add . && git commit -m 'Add models' && git push")

	print(banner)