""" Quick Training Script Train models and save weights for Hugging Face deployment """ import pickle import numpy as np import pandas as pd import json import os from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, StandardScaler from sklearn.metrics import accuracy_score, classification_report from xgboost import XGBClassifier from lightgbm import LGBMClassifier from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier print("="*70) print("QUICK TRAINING - Speech Emotion Recognition") print("="*70) # ============================================================================ # 1. LOAD DATA # ============================================================================ print("\n1️⃣ Loading data...") CSV_FILE = 'features_ravdess.csv' if not os.path.exists(CSV_FILE): print(f"❌ Error: {CSV_FILE} not found!") print(" Please run preprocess_ravdess.py first to extract features") exit(1) df = pd.read_csv(CSV_FILE) # Get features and labels feature_cols = [col for col in df.columns if col.startswith('feature_')] X = df[feature_cols].values y = df['emotion'].values print(f" ✓ Data loaded: {X.shape}") print(f" ✓ Emotions: {np.unique(y)}") print(f" ✓ Distribution:\n{pd.Series(y).value_counts()}") # ============================================================================ # 2. PREPROCESSING # ============================================================================ print("\n2️⃣ Preprocessing...") # Encode labels label_encoder = LabelEncoder() y_encoded = label_encoder.fit_transform(y) print(f" ✓ Encoded labels: {label_encoder.classes_}") # Split data (80% train, 20% test) X_train, X_test, y_train, y_test = train_test_split( X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded ) print(f" ✓ Train set: {X_train.shape}") print(f" ✓ Test set: {X_test.shape}") # Scale features scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) print(f" ✓ Features scaled") # ============================================================================ # 3. FEATURE SELECTION (Simple: Top 80 by variance) # ============================================================================ print("\n3️⃣ Feature selection...") feature_variance = np.var(X_train_scaled, axis=0) top_indices = np.argsort(feature_variance)[-80:] # Top 80 features X_train_selected = X_train_scaled[:, top_indices] X_test_selected = X_test_scaled[:, top_indices] print(f" ✓ Selected {len(top_indices)} features (from 162)") print( f" ✓ Variance range: {feature_variance[top_indices].min():.4f} - {feature_variance[top_indices].max():.4f}") # ============================================================================ # 4. TRAIN MODELS # ============================================================================ print("\n4️⃣ Training models...") n_classes = len(label_encoder.classes_) models = {} accuracies = {} # XGBoost print("\n 🔹 Training XGBoost...") xgb_model = XGBClassifier( n_estimators=150, max_depth=5, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, gamma=1.0, objective='multi:softprob', num_class=n_classes, random_state=42, n_jobs=-1, verbosity=0 ) xgb_model.fit(X_train_selected, y_train) xgb_acc = xgb_model.score(X_test_selected, y_test) models['xgboost'] = xgb_model accuracies['xgboost'] = xgb_acc print(f" ✓ XGBoost accuracy: {xgb_acc:.4f}") # LightGBM print("\n 🔹 Training LightGBM...") lgbm_model = LGBMClassifier( n_estimators=150, num_leaves=40, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, min_child_samples=20, objective='multiclass', num_class=n_classes, random_state=42, n_jobs=-1, verbose=-1 ) lgbm_model.fit(X_train_selected, y_train) lgbm_acc = lgbm_model.score(X_test_selected, y_test) models['lightgbm'] = lgbm_model accuracies['lightgbm'] = lgbm_acc print(f" ✓ LightGBM accuracy: {lgbm_acc:.4f}") # Gradient Boosting print("\n 🔹 Training Gradient Boosting...") gb_model = GradientBoostingClassifier( n_estimators=100, max_depth=4, learning_rate=0.1, subsample=0.8, min_samples_split=10, random_state=42 ) gb_model.fit(X_train_selected, y_train) gb_acc = gb_model.score(X_test_selected, y_test) models['gradientboosting'] = gb_model accuracies['gradientboosting'] = gb_acc print(f" ✓ Gradient Boosting accuracy: {gb_acc:.4f}") # AdaBoost print("\n 🔹 Training AdaBoost...") ada_model = AdaBoostClassifier( n_estimators=100, learning_rate=1.0, algorithm='SAMME.R', random_state=42 ) ada_model.fit(X_train_selected, y_train) ada_acc = ada_model.score(X_test_selected, y_test) models['adaboost'] = ada_model accuracies['adaboost'] = ada_acc print(f" ✓ AdaBoost accuracy: {ada_acc:.4f}") # ============================================================================ # 5. ENSEMBLE # ============================================================================ print("\n5️⃣ Creating ensemble...") # Get predictions predictions = {} for name, model in models.items(): predictions[name] = model.predict_proba(X_test_selected) # Calculate weights (proportional to accuracy) weights = np.array([accuracies[name] for name in [ 'xgboost', 'lightgbm', 'gradientboosting', 'adaboost']]) weights = weights / weights.sum() print(f" ✓ Ensemble weights: {weights}") # Weighted ensemble prediction ensemble_pred = ( weights[0] * predictions['xgboost'] + weights[1] * predictions['lightgbm'] + weights[2] * predictions['gradientboosting'] + weights[3] * predictions['adaboost'] ) ensemble_labels = np.argmax(ensemble_pred, axis=1) ensemble_acc = accuracy_score(y_test, ensemble_labels) print(f" ✓ Ensemble accuracy: {ensemble_acc:.4f}") # ============================================================================ # 6. SAVE WEIGHTS # ============================================================================ print("\n6️⃣ Saving weights...") os.makedirs('weights', exist_ok=True) # Save individual models with open('weights/xgboost_model.pkl', 'wb') as f: pickle.dump(xgb_model, f) print(" ✓ xgboost_model.pkl") with open('weights/lightgbm_model.pkl', 'wb') as f: pickle.dump(lgbm_model, f) print(" ✓ lightgbm_model.pkl") with open('weights/gradientboost_model.pkl', 'wb') as f: pickle.dump(gb_model, f) print(" ✓ gradientboost_model.pkl") with open('weights/adaboost_model.pkl', 'wb') as f: pickle.dump(ada_model, f) print(" ✓ adaboost_model.pkl") # Save preprocessing objects with open('weights/scaler.pkl', 'wb') as f: pickle.dump(scaler, f) print(" ✓ scaler.pkl") with open('weights/label_encoder.pkl', 'wb') as f: pickle.dump(label_encoder, f) print(" ✓ label_encoder.pkl") # Save configuration config = { 'selected_features': top_indices.tolist(), 'ensemble_weights': weights.tolist(), 'n_features': len(top_indices), 'emotions': label_encoder.classes_.tolist(), 'model_accuracies': { 'xgboost': float(xgb_acc), 'lightgbm': float(lgbm_acc), 'gradientboosting': float(gb_acc), 'adaboost': float(ada_acc), 'ensemble': float(ensemble_acc) } } with open('weights/config.json', 'w') as f: json.dump(config, f, indent=2) print(" ✓ config.json") # ============================================================================ # 7. VERIFY # ============================================================================ print("\n7️⃣ Verifying saved models...") # Test loading with open('weights/xgboost_model.pkl', 'rb') as f: loaded_model = pickle.load(f) test_acc = loaded_model.score(X_test_selected, y_test) print(f" ✓ Loaded model works (accuracy: {test_acc:.4f})") # ============================================================================ # 8. SUMMARY # ============================================================================ print("\n" + "="*70) print("✅ TRAINING COMPLETE!") print("="*70) print("\n📊 Final Results:") print(f" XGBoost: {xgb_acc:.4f}") print(f" LightGBM: {lgbm_acc:.4f}") print(f" GradientBoosting: {gb_acc:.4f}") print(f" AdaBoost: {ada_acc:.4f}") print(f" Ensemble: {ensemble_acc:.4f} ⭐") print(f"\n💾 Saved files:") print(f" weights/xgboost_model.pkl") print(f" weights/lightgbm_model.pkl") print(f" weights/gradientboost_model.pkl") print(f" weights/adaboost_model.pkl") print(f" weights/scaler.pkl") print(f" weights/label_encoder.pkl") print(f" weights/config.json") print(f"\n🚀 Next steps:") print(f" 1. Test locally: python app.py") print(f" 2. Push to Hugging Face: git add . && git commit -m 'Add models' && git push") print("="*70)