"""
Quick Training Script
Train models and save weights for Hugging Face deployment
"""

import pickle
import numpy as np
import pandas as pd
import json
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

print("="*70)
print("QUICK TRAINING - Speech Emotion Recognition")
print("="*70)

# ============================================================================
# 1. LOAD DATA
# ============================================================================
print("\n1️⃣ Loading data...")

# Feature table expected in the working directory; the hint printed below
# names preprocess_ravdess.py as the script that produces it.
CSV_FILE = 'features_ravdess.csv'

if not os.path.exists(CSV_FILE):
    print(f"❌ Error: {CSV_FILE} not found!")
    print("   Please run preprocess_ravdess.py first to extract features")
    # `exit()` is an interactive helper injected by the `site` module and is
    # not guaranteed to exist (e.g. `python -S`, frozen apps); raising
    # SystemExit terminates with the same status code everywhere.
    raise SystemExit(1)

df = pd.read_csv(CSV_FILE)

# Feature columns are identified by their 'feature_' prefix; labels live in
# the 'emotion' column.
feature_cols = [col for col in df.columns if col.startswith('feature_')]

# Fail early with a readable message instead of a KeyError or an empty matrix
# surfacing later in the pipeline.
if not feature_cols or 'emotion' not in df.columns:
    print(f"❌ Error: {CSV_FILE} must contain 'feature_*' columns and an 'emotion' column")
    raise SystemExit(1)

X = df[feature_cols].values
y = df['emotion'].values

print(f"   ✓ Data loaded: {X.shape}")
print(f"   ✓ Emotions: {np.unique(y)}")
print(f"   ✓ Distribution:\n{pd.Series(y).value_counts()}")

# ============================================================================
# 2. PREPROCESSING
# ============================================================================
print("\n2️⃣ Preprocessing...")

# Map the emotion labels to integer class ids; the fitted encoder is kept so
# the mapping can be saved and inverted later.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

print(f"   ✓ Encoded labels: {label_encoder.classes_}")

# Hold out 20% of the rows for testing, stratified on the encoded label and
# seeded for reproducibility.
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': y_encoded,
}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

print(f"   ✓ Train set: {X_train.shape}")
print(f"   ✓ Test set: {X_test.shape}")

# Standardise using statistics learned from the training rows only, then
# apply the same transform to the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("   ✓ Features scaled")

# ============================================================================
# 3. FEATURE SELECTION (Simple: Top 80 by variance)
# ============================================================================
print("\n3️⃣ Feature selection...")

# Number of highest-variance features to keep.
N_TOP_FEATURES = 80

# Rank on the RAW training features. After StandardScaler every non-constant
# column has variance ~1, so ranking the scaled matrix would order features by
# floating-point noise rather than by how much they actually vary.
feature_variance = np.var(X_train, axis=0)
top_indices = np.argsort(feature_variance)[-N_TOP_FEATURES:]

# The models still consume the standardised values; only the ranking above
# uses the raw matrix.
X_train_selected = X_train_scaled[:, top_indices]
X_test_selected = X_test_scaled[:, top_indices]

# Report the real input width instead of a hard-coded feature count.
print(f"   ✓ Selected {len(top_indices)} features (from {X_train_scaled.shape[1]})")
print(
    f"   ✓ Variance range: {feature_variance[top_indices].min():.4f} - {feature_variance[top_indices].max():.4f}")

# ============================================================================
# 4. TRAIN MODELS
# ============================================================================
print("\n4️⃣ Training models...")

n_classes = len(label_encoder.classes_)
models = {}      # model key -> fitted estimator
accuracies = {}  # model key -> accuracy on the held-out test split

xgb_model = XGBClassifier(
    n_estimators=150,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1.0,
    objective='multi:softprob',
    num_class=n_classes,
    random_state=42,
    n_jobs=-1,
    verbosity=0
)

lgbm_model = LGBMClassifier(
    n_estimators=150,
    num_leaves=40,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    objective='multiclass',
    num_class=n_classes,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

gb_model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    min_samples_split=10,
    random_state=42
)

# `algorithm` is intentionally left at the library default: 'SAMME.R' is
# deprecated in scikit-learn 1.4 and rejected from 1.6 onwards, while older
# releases already default to it, so omitting the argument preserves the old
# behaviour there and keeps the script runnable on current releases.
ada_model = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=1.0,
    random_state=42
)

# Fit and score every model the same way; the keys are the names the ensemble
# and the saved config use to look the models up.
for key, title, estimator in (
    ('xgboost', 'XGBoost', xgb_model),
    ('lightgbm', 'LightGBM', lgbm_model),
    ('gradientboosting', 'Gradient Boosting', gb_model),
    ('adaboost', 'AdaBoost', ada_model),
):
    print(f"\n   🔹 Training {title}...")
    estimator.fit(X_train_selected, y_train)
    accuracies[key] = estimator.score(X_test_selected, y_test)
    models[key] = estimator
    print(f"      ✓ {title} accuracy: {accuracies[key]:.4f}")

# Per-model accuracy names consumed by the later sections of the script.
xgb_acc = accuracies['xgboost']
lgbm_acc = accuracies['lightgbm']
gb_acc = accuracies['gradientboosting']
ada_acc = accuracies['adaboost']

# ============================================================================
# 5. ENSEMBLE
# ============================================================================
print("\n5️⃣ Creating ensemble...")

# Class-probability matrix on the test split from every trained model.
predictions = {
    model_name: estimator.predict_proba(X_test_selected)
    for model_name, estimator in models.items()
}

# Each member's weight is its test accuracy, normalised so the weights sum to 1.
member_order = ('xgboost', 'lightgbm', 'gradientboosting', 'adaboost')
raw_weights = np.array([accuracies[member] for member in member_order])
weights = raw_weights / raw_weights.sum()

print(f"   ✓ Ensemble weights: {weights}")

# Accumulate the weighted probabilities member by member, in the fixed order
# above, so the floating-point summation order stays left-to-right.
ensemble_pred = weights[0] * predictions[member_order[0]]
for share, member in zip(weights[1:], member_order[1:]):
    ensemble_pred = ensemble_pred + share * predictions[member]

# The predicted class is the one with the highest blended probability.
ensemble_labels = ensemble_pred.argmax(axis=1)
ensemble_acc = accuracy_score(y_test, ensemble_labels)

print(f"   ✓ Ensemble accuracy: {ensemble_acc:.4f}")

# ============================================================================
# 6. SAVE WEIGHTS
# ============================================================================
print("\n6️⃣ Saving weights...")

os.makedirs('weights', exist_ok=True)

# Everything that is pickled, in the order it is written: the four fitted
# models first, then the preprocessing objects.
pickled_artifacts = (
    ('xgboost_model.pkl', xgb_model),
    ('lightgbm_model.pkl', lgbm_model),
    ('gradientboost_model.pkl', gb_model),
    ('adaboost_model.pkl', ada_model),
    ('scaler.pkl', scaler),
    ('label_encoder.pkl', label_encoder),
)
for filename, artifact in pickled_artifacts:
    with open(f'weights/{filename}', 'wb') as sink:
        pickle.dump(artifact, sink)
    print(f"   ✓ {filename}")

# JSON-serialisable run description: selected feature indices, ensemble
# weights, class names and the measured accuracies (cast to plain floats).
config = {}
config['selected_features'] = top_indices.tolist()
config['ensemble_weights'] = weights.tolist()
config['n_features'] = len(top_indices)
config['emotions'] = label_encoder.classes_.tolist()
config['model_accuracies'] = {
    model_name: float(score)
    for model_name, score in (
        ('xgboost', xgb_acc),
        ('lightgbm', lgbm_acc),
        ('gradientboosting', gb_acc),
        ('adaboost', ada_acc),
        ('ensemble', ensemble_acc),
    )
}

with open('weights/config.json', 'w') as sink:
    sink.write(json.dumps(config, indent=2))
print("   ✓ config.json")

# ============================================================================
# 7. VERIFY
# ============================================================================
print("\n7️⃣ Verifying saved models...")

# Round-trip check: read the XGBoost pickle written by this script back from
# disk and score it on the held-out split.
with open('weights/xgboost_model.pkl', 'rb') as source:
    loaded_model = pickle.load(source)

reloaded_predictions = loaded_model.predict(X_test_selected)
test_acc = accuracy_score(y_test, reloaded_predictions)
print(f"   ✓ Loaded model works (accuracy: {test_acc:.4f})")

# ============================================================================
# 8. SUMMARY
# ============================================================================
banner = "=" * 70

print("\n" + banner)
print("✅ TRAINING COMPLETE!")
print(banner)

# Accuracy table: labels are left-padded to a common width so the numbers
# line up in one column; only the ensemble row carries a trailing marker.
print("\n📊 Final Results:")
score_rows = (
    ("XGBoost:", xgb_acc, ""),
    ("LightGBM:", lgbm_acc, ""),
    ("GradientBoosting:", gb_acc, ""),
    ("AdaBoost:", ada_acc, ""),
    ("Ensemble:", ensemble_acc, " ⭐"),
)
for label, score, suffix in score_rows:
    print(f"   {label:<18}{score:.4f}{suffix}")

# Files written by the save step, listed in the order they were created.
print("\n💾 Saved files:")
saved_files = (
    'xgboost_model.pkl',
    'lightgbm_model.pkl',
    'gradientboost_model.pkl',
    'adaboost_model.pkl',
    'scaler.pkl',
    'label_encoder.pkl',
    'config.json',
)
for filename in saved_files:
    print(f"   weights/{filename}")

print("\n🚀 Next steps:")
print("   1. Test locally: python app.py")
print("   2. Push to Hugging Face: git add . && git commit -m 'Add models' && git push")

print(banner)