"""
Model training functions with K-Fold Cross-Validation
"""

import os
import json
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Callable, Tuple, Dict, Optional
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
import warnings

import config
from src.genetic_algorithm import GeneticAlgorithm

# Suppress LightGBM feature name warnings
warnings.filterwarnings(
    'ignore', message='X does not have valid feature names')
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

# Canonical model keys, in the order they are trained and weighted.
_ALL_MODELS = ('xgboost', 'lightgbm', 'gradientboosting', 'adaboost')

# Human-readable names used in result tables and summaries.
_MODEL_DISPLAY_NAMES = {
    'xgboost': 'XGBoost',
    'lightgbm': 'LightGBM',
    'gradientboosting': 'Gradient Boosting',
    'adaboost': 'AdaBoost'
}


def train_models_with_ga(use_ga: bool = True,
                         use_cv: bool = False,
                         n_folds: int = 5,
                         ga_generations: int = 20,
                         ga_population: int = 15,
                         n_jobs: int = 2,
                         optimize_features: bool = True,
                         n_features_select: int = 100,
                         use_xgb: bool = True,
                         use_lgbm: bool = True,
                         use_gb: bool = True,
                         use_ada: bool = True,
                         progress_callback: Optional[Callable] = None
                         ) -> Tuple[str, Optional[pd.DataFrame], Optional[pd.DataFrame], str]:
    """
    Train models with or without GA optimization and optional K-Fold CV

    Loads the feature CSV at ``config.FEATURES_CSV``, keeps the columns named
    ``feature_<int>`` (sorted numerically), label-encodes the ``emotion``
    column and dispatches to the cross-validation or single-split trainer.
    Any exception raised while loading or training is caught and reported in
    the returned summary text rather than propagated.

    Args:
        use_ga: Whether to use GA optimization
        use_cv: Whether to use K-Fold Cross-Validation
        n_folds: Number of folds for CV
        ga_generations: Number of GA generations
        ga_population: GA population size
        n_jobs: Number of parallel jobs
        optimize_features: Whether GA should optimize feature selection
        n_features_select: Number of features to select
        use_xgb: Whether to use XGBoost
        use_lgbm: Whether to use LightGBM
        use_gb: Whether to use Gradient Boosting
        use_ada: Whether to use AdaBoost
        progress_callback: Optional progress callback function, called as
            ``progress_callback(fraction, desc=...)``

    Returns:
        tuple: (summary_text, results_df, ga_history_df, training_log).
        ``results_df`` and ``ga_history_df`` are ``None`` on failure.
    """
    # ========================================================================
    # VALIDATE MODEL SELECTION
    # ========================================================================
    selected_models = []
    if use_xgb:
        selected_models.append('xgboost')
    if use_lgbm:
        selected_models.append('lightgbm')
    if use_gb:
        selected_models.append('gradientboosting')
    if use_ada:
        selected_models.append('adaboost')

    if not selected_models:
        return """
## āŒ Error: No Models Selected

Please select at least one model to train!

Go back and check at least one model:
- šŸš€ XGBoost
- ⚔ LightGBM
- 🌳 Gradient Boosting
- šŸ“ˆ AdaBoost

**Recommendation:**
- For best accuracy: Select all 4 models
- For speed: Select 1-2 models (e.g., XGBoost + LightGBM)
- For ensemble: Select at least 2 models
""", None, None, ""

    print(f"āœ… Selected models: {', '.join(selected_models)}")

    # ========================================================================
    # CHECK DATASET
    # ========================================================================
    if not os.path.exists(config.FEATURES_CSV):
        return """
## āŒ Error: Dataset Not Found

Please go to **Tab 1: Feature Extraction** first!

Click "šŸ”Š Extract Features" to process the dataset.
""", None, None, ""

    try:
        if progress_callback:
            progress_callback(0, desc="Loading dataset...")

        # ====================================================================
        # LOAD DATA
        # ====================================================================
        df = pd.read_csv(config.FEATURES_CSV)

        # Extract only numeric feature columns
        feature_cols = [col for col in df.columns
                        if col.startswith('feature_')
                        and col.replace('feature_', '').isdigit()]
        feature_cols = sorted(
            feature_cols, key=lambda x: int(x.replace('feature_', '')))

        if not feature_cols:
            return """
## āŒ Error: No numeric feature columns found!

Please re-run feature extraction in Tab 1.
""", None, None, ""

        X = df[feature_cols].values
        y = df['emotion'].values

        # ====================================================================
        # ADJUST FEATURE SELECTION
        # ====================================================================
        n_features_available = X.shape[1]

        if not optimize_features:
            n_features_select = n_features_available
            print("āœ… Feature Selection: DISABLED")
            print(f" Using all {n_features_available} features")
        else:
            if n_features_select > n_features_available:
                print(
                    f"āš ļø Requested {n_features_select} features, but only {n_features_available} available")
                print(f" Auto-adjusting to {n_features_available}")
                n_features_select = n_features_available
            else:
                print("āœ… Feature Selection: ENABLED")
                print(
                    f" Selecting {n_features_select}/{n_features_available} features ({n_features_select/n_features_available*100:.1f}%)")

        print("āœ… Dataset loaded:")
        print(f" - Total features: {n_features_available}")
        print(f" - Features for GA: {n_features_select}")
        print(f" - Shape: {X.shape}")
        print(f" - Samples: {len(y)}")
        print(
            f" - Models: {len(selected_models)} ({', '.join(selected_models)})")

        # ====================================================================
        # ENCODE LABELS
        # ====================================================================
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)
        n_classes = len(label_encoder.classes_)

        # ====================================================================
        # CHOOSE TRAINING MODE: CV or Single Split
        # ====================================================================
        if use_cv:
            return _train_with_cross_validation(
                X, y_encoded, label_encoder, n_classes,
                use_ga, n_folds, ga_generations, ga_population,
                n_jobs, optimize_features, n_features_select,
                selected_models, progress_callback
            )
        else:
            return _train_single_split(
                X, y_encoded, label_encoder, n_classes,
                use_ga, ga_generations, ga_population,
                n_jobs, optimize_features, n_features_select,
                selected_models, progress_callback
            )

    except Exception as e:
        # Top-level boundary: report the failure to the caller as text
        # (with the traceback) instead of letting it propagate.
        import traceback
        error_trace = traceback.format_exc()
        return f"āŒ Training failed: {str(e)}\n\n```\n{error_trace}\n```", None, None, ""


def _select_features_by_variance(X_train, optimize_features, n_features_select):
    """Return the column indices to keep when no GA is used.

    With feature selection disabled every column is kept. Otherwise the
    ``n_features_select`` columns with the highest variance are returned.

    ``X_train`` must be the *unscaled* training matrix: after
    ``StandardScaler`` every non-constant column has variance ~1, so ranking
    the scaled data would pick features essentially at random.
    """
    if not optimize_features:
        return np.arange(X_train.shape[1])
    feature_variance = np.var(X_train, axis=0)
    return np.argsort(feature_variance)[-n_features_select:]


def _accuracy_weights(accuracies, selected_models):
    """Return ensemble weights proportional to each selected model's accuracy.

    Falls back to uniform weights when every accuracy is 0, which would
    otherwise divide by zero and produce NaN weights.
    """
    acc_values = np.array([accuracies[m] for m in selected_models], dtype=float)
    total = acc_values.sum()
    if total <= 0:
        return np.full(len(acc_values), 1.0 / len(acc_values))
    return acc_values / total


def _evaluate_ensemble(models, selected_models, X_test, y_test, weights):
    """Return the accuracy of the weighted soft-voting ensemble on the test set.

    Class probabilities of the selected models are averaged with ``weights``
    (one weight per entry of ``selected_models``, same order) and the arg-max
    class is compared with ``y_test``.
    """
    predictions = [models[m].predict_proba(X_test) for m in selected_models]
    ensemble_pred = np.average(predictions, axis=0, weights=weights)
    ensemble_labels = np.argmax(ensemble_pred, axis=1)
    return accuracy_score(y_test, ensemble_labels)


def _format_best_config(best_config, selected_models):
    """Render the GA's best hyper-parameters as markdown bullets.

    Only the selected models are listed, and missing keys render as ``n/a``,
    so a GA configuration that omits unselected models does not raise.
    """
    layout = {
        'xgboost': ('XGBoost', (('n_est', 'xgb_n_estimators'),
                                ('depth', 'xgb_max_depth'),
                                ('lr', 'xgb_learning_rate'))),
        'lightgbm': ('LightGBM', (('n_est', 'lgbm_n_estimators'),
                                  ('leaves', 'lgbm_num_leaves'),
                                  ('lr', 'lgbm_learning_rate'))),
        'gradientboosting': ('Gradient Boosting', (('n_est', 'gb_n_estimators'),
                                                   ('depth', 'gb_max_depth'),
                                                   ('lr', 'gb_learning_rate'))),
        'adaboost': ('AdaBoost', (('n_est', 'ada_n_estimators'),
                                  ('lr', 'ada_learning_rate'))),
    }
    lines = []
    for model_name in selected_models:
        label, params = layout[model_name]
        rendered = ", ".join(
            f"{short}={best_config.get(key, 'n/a')}" for short, key in params)
        lines.append(f"- **{label}**: {rendered}")
    return "\n".join(lines)


def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
                                 use_ga, n_folds, ga_generations, ga_population,
                                 n_jobs, optimize_features, n_features_select,
                                 selected_models, progress_callback):
    """
    Train with K-Fold Cross-Validation

    For every stratified fold the features are standardised on the training
    part, features/hyper-parameters are chosen (GA or variance-based
    defaults), the selected models are trained and a weighted ensemble is
    scored on the held-out part. The fold with the best ensemble accuracy is
    saved via ``_save_models``.

    Returns:
        tuple: (summary_text, summary_stats_df, ga_history_df, training_log).
        On total failure the two data frames are ``None``.
    """
    print("="*80)
    print(f"{'K-FOLD CROSS-VALIDATION TRAINING':^80}")
    print("="*80)
    print(f"Number of folds: {n_folds}")
    print(f"Use GA: {use_ga}")
    print(f"Optimize Features: {optimize_features}")
    print(f"Features to select: {n_features_select}")
    print(f"Selected models: {', '.join(selected_models)}")
    print(f"Total samples: {len(X)}")
    print("="*80)

    n_features_available = X.shape[1]

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True,
                          random_state=config.RANDOM_STATE)

    fold_results = []
    fold_models = []
    all_ga_history = []
    training_log = ""
    total_steps = n_folds

    for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y_encoded), 1):
        # Derived from the fold number so it is always defined (it is passed
        # to the default trainer even without a callback) and keeps advancing
        # after a failed fold.
        base_progress = (fold_idx - 1) / total_steps

        fold_log = f"\n{'='*80}\n"
        fold_log += f"FOLD {fold_idx}/{n_folds}\n"
        fold_log += f"{'='*80}\n"
        print(fold_log)
        training_log += fold_log

        if progress_callback:
            progress_callback(
                base_progress, desc=f"Fold {fold_idx}/{n_folds}: Preparing data...")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

        fold_log = f"Train samples: {len(X_train)}, Test samples: {len(X_test)}\n"
        print(fold_log)
        training_log += fold_log

        # Fit the scaler on the training part only to avoid leaking test
        # statistics into the fold.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        if use_ga:
            if progress_callback:
                progress_callback(base_progress + 0.05/total_steps,
                                  desc=f"Fold {fold_idx}/{n_folds}: Splitting for GA...")

            X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
                X_train_scaled, y_train,
                test_size=0.2,
                random_state=config.RANDOM_STATE,
                stratify=y_train
            )

            if progress_callback:
                progress_callback(base_progress + 0.1/total_steps,
                                  desc=f"Fold {fold_idx}/{n_folds}: Running GA optimization...")

            ga = GeneticAlgorithm(
                X_train_ga, y_train_ga,
                n_features_to_select=n_features_select,
                selected_models=selected_models,
                skip_feature_selection=(not optimize_features) or (
                    n_features_select == n_features_available)
            )
            ga.population_size = ga_population
            ga.n_generations = ga_generations

            def ga_progress(p, desc):
                # Map the GA's own 0..1 progress into the 10%-70% band of
                # the current fold.
                if progress_callback:
                    ga_progress_in_fold = 0.1 + 0.6 * p
                    progress_callback(base_progress + ga_progress_in_fold/total_steps,
                                      desc=f"Fold {fold_idx}/{n_folds}: {desc}")

            best_config = ga.evolve(
                X_train_ga, y_train_ga, X_val_ga, y_val_ga,
                progress_callback=ga_progress,
                n_jobs=n_jobs
            )

            training_log += "\n".join(ga.log_messages) + "\n"
            all_ga_history.extend(ga.history)

            if best_config is None:
                fold_log = f"āŒ GA optimization failed for Fold {fold_idx}\n"
                print(fold_log)
                training_log += fold_log
                continue

            selected_indices = best_config['feature_indices']
            X_train_selected = X_train_scaled[:, selected_indices]
            X_test_selected = X_test_scaled[:, selected_indices]

            if progress_callback:
                progress_callback(base_progress + 0.7/total_steps,
                                  desc=f"Fold {fold_idx}/{n_folds}: Training models with GA config...")

            models, accuracies = _train_all_models(
                X_train_selected, y_train, X_test_selected, y_test,
                n_classes, best_config, selected_models
            )

            # NOTE(review): assumes the GA returns one weight per selected
            # model, in selected_models order - confirm in GeneticAlgorithm.
            weights = best_config['weights']

            fold_log = f"\nāœ… GA optimization completed for Fold {fold_idx}\n"
            fold_log += f"Best fitness: {ga.best_fitness:.4f}\n"
            fold_log += f"Generations: {len(ga.history)}/{ga_generations}\n"
            print(fold_log)
            training_log += fold_log

        else:
            if progress_callback:
                progress_callback(base_progress + 0.2/total_steps,
                                  desc=f"Fold {fold_idx}/{n_folds}: Selecting features...")

            selected_indices = _select_features_by_variance(
                X_train, optimize_features, n_features_select)
            X_train_selected = X_train_scaled[:, selected_indices]
            X_test_selected = X_test_scaled[:, selected_indices]

            if progress_callback:
                progress_callback(base_progress + 0.3/total_steps,
                                  desc=f"Fold {fold_idx}/{n_folds}: Training models...")

            models, accuracies = _train_all_models_default(
                X_train_selected, y_train, X_test_selected, y_test,
                n_classes,
                progress_callback=progress_callback,
                fold_idx=fold_idx,
                n_folds=n_folds,
                base_progress=base_progress,
                total_steps=total_steps,
                selected_models=selected_models
            )

            weights = _accuracy_weights(accuracies, selected_models)

        if progress_callback:
            progress_callback(base_progress + 0.9/total_steps,
                              desc=f"Fold {fold_idx}/{n_folds}: Evaluating ensemble...")

        ensemble_acc = _evaluate_ensemble(
            models, selected_models, X_test_selected, y_test, weights)

        fold_result = {
            'fold': fold_idx,
            'n_train': len(X_train),
            'n_test': len(X_test)
        }
        for model_name in selected_models:
            fold_result[model_name] = accuracies[model_name]
        fold_result['ensemble'] = ensemble_acc

        # Keep fold_results and fold_models aligned: both are appended only
        # for folds that completed.
        fold_results.append(fold_result)
        fold_models.append({
            'models': models,
            'scaler': scaler,
            'selected_indices': selected_indices,
            'weights': weights
        })

        fold_log = f"\nšŸ“Š Fold {fold_idx} Results:\n"
        for model_name in selected_models:
            fold_log += f" {model_name.capitalize()}: {accuracies[model_name]:.4f}\n"
        fold_log += f" Ensemble: {ensemble_acc:.4f}\n"
        print(fold_log)
        training_log += fold_log

    if not fold_results:
        return "āŒ All folds failed", None, None, training_log

    results_df = pd.DataFrame(fold_results)

    stats_log = f"\n{'='*80}\n"
    stats_log += f"{'CROSS-VALIDATION SUMMARY':^80}\n"
    stats_log += f"{'='*80}\n\n"
    stats_log += "Per-Fold Results:\n"
    stats_log += results_df.to_string(index=False) + "\n\n"
    stats_log += "="*80 + "\n"
    stats_log += "SUMMARY STATISTICS\n"
    stats_log += "="*80 + "\n"

    stats_summary = []
    # Only the models that were actually trained have a column in results_df.
    for model_name in list(selected_models) + ['ensemble']:
        scores = results_df[model_name].values
        mean_score = scores.mean()
        std_score = scores.std()

        model_stats = f"\n{model_name.upper()}:\n"
        model_stats += f" Mean Accuracy: {mean_score:.4f}\n"
        model_stats += f" Std Deviation: {std_score:.4f}\n"
        model_stats += f" 95% CI: [{mean_score - 1.96*std_score:.4f}, {mean_score + 1.96*std_score:.4f}]\n"
        model_stats += f" Min: {scores.min():.4f}\n"
        model_stats += f" Max: {scores.max():.4f}\n"
        stats_log += model_stats

        stats_summary.append({
            'Model': model_name.upper(),
            'Mean': mean_score,
            'Std': std_score,
            'Min': scores.min(),
            'Max': scores.max()
        })

    print(stats_log)
    training_log += stats_log

    # results_df has a default RangeIndex, so the label returned by idxmax()
    # is also the position in fold_results / fold_models.
    best_fold_idx = int(results_df['ensemble'].idxmax())
    best_fold = fold_results[best_fold_idx]
    best_models = fold_models[best_fold_idx]

    save_log = f"\n{'='*80}\n"
    save_log += f"Best performing fold: Fold {best_fold['fold']} (Ensemble: {best_fold['ensemble']:.4f})\n"
    save_log += "Saving this model...\n"
    save_log += "="*80 + "\n"
    print(save_log)
    training_log += save_log

    if progress_callback:
        progress_callback(0.95, desc="Saving best model...")

    _save_models(
        best_models['models'],
        best_models['scaler'],
        label_encoder,
        best_models['selected_indices'],
        best_models['weights'],
        {model_name: best_fold[model_name] for model_name in selected_models},
        best_fold['ensemble'],
        cv_results=results_df.to_dict('records'),
        selected_models=selected_models
    )

    if progress_callback:
        progress_callback(1.0, desc="Complete!")

    ensemble_mean = results_df['ensemble'].mean()
    ensemble_std = results_df['ensemble'].std()
    if ensemble_mean > 0:
        consistency = (1 - ensemble_std / ensemble_mean) * 100
    else:
        consistency = 0.0

    if consistency > 95:
        consistency_label = 'āœ… Excellent'
    elif consistency > 90:
        consistency_label = 'āš ļø Moderate'
    else:
        consistency_label = 'āŒ Poor'

    individual_lines = "\n".join(
        f"- **{_MODEL_DISPLAY_NAMES[m]}**: {results_df[m].mean():.4f} ± {results_df[m].std():.4f}"
        for m in selected_models
    )

    weight_labels = {
        'xgboost': 'XGBoost',
        'lightgbm': 'LightGBM',
        'gradientboosting': 'GradientBoosting',
        'adaboost': 'AdaBoost'
    }
    weight_dict = dict(zip(
        [weight_labels[m] for m in selected_models],
        [f'{w:.3f}' for w in best_models['weights']]
    ))

    summary = f"""
## āœ… Cross-Validation Training Complete!

### šŸ”„ {n_folds}-Fold Cross-Validation Results

#### šŸŽÆ Ensemble Performance:
- **Mean Accuracy**: {ensemble_mean:.4f} ± {ensemble_std:.4f}
- **Best Fold**: {results_df['ensemble'].max():.4f}
- **Worst Fold**: {results_df['ensemble'].min():.4f}
- **95% CI**: [{ensemble_mean - 1.96*ensemble_std:.4f}, {ensemble_mean + 1.96*ensemble_std:.4f}]
- **Consistency**: {consistency:.1f}% {consistency_label}

#### šŸ“Š Individual Models (Mean ± Std):
{individual_lines}

### āš–ļø Ensemble Weights (Best Fold):
{weight_dict}

### šŸ’¾ Saved Model:
Best performing fold (Fold {best_fold['fold']}) saved to `weights/`

### āœ… Interpretation:
- **Low Std (<0.02)**: Model is stable and generalizes well āœ“
- **Mean ± Std**: Reliable performance estimate
- **All folds used**: Every sample was tested exactly once

---
šŸ“ **Note**: This is a more reliable estimate than single train/test split!
"""

    ga_history_df = None
    if use_ga and len(all_ga_history) > 0:
        ga_history_df = pd.DataFrame(all_ga_history)

    summary_stats_df = pd.DataFrame(stats_summary)

    return summary, summary_stats_df, ga_history_df, training_log


def _train_single_split(X, y_encoded, label_encoder, n_classes,
                        use_ga, ga_generations, ga_population,
                        n_jobs, optimize_features, n_features_select,
                        selected_models, progress_callback):
    """
    Train with single train/test split (Original method)

    Splits the data once (stratified, ``config.TRAIN_TEST_SPLIT`` held out),
    standardises it, chooses features/hyper-parameters (GA or variance-based
    defaults), trains the selected models, scores the weighted ensemble on
    the test part and saves everything via ``_save_models``.

    Returns:
        tuple: (summary_text, results_df, ga_history_df, training_log).
        If the GA yields no configuration, the data frames are ``None``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded,
        test_size=config.TRAIN_TEST_SPLIT,
        random_state=config.RANDOM_STATE,
        stratify=y_encoded
    )

    if progress_callback:
        progress_callback(0.1, desc="Scaling features...")

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    training_log = ""

    if use_ga:
        if progress_callback:
            progress_callback(0.2, desc="Initializing GA...")

        X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
            X_train_scaled, y_train,
            test_size=0.2,
            random_state=config.RANDOM_STATE,
            stratify=y_train
        )

        ga = GeneticAlgorithm(
            X_train_ga, y_train_ga,
            n_features_to_select=n_features_select,
            skip_feature_selection=(not optimize_features),
            selected_models=selected_models
        )
        ga.population_size = ga_population
        ga.n_generations = ga_generations

        def ga_progress(p, desc):
            # Map the GA's own 0..1 progress into the 20%-80% band.
            if progress_callback:
                progress_callback(0.2 + 0.6*p, desc=desc)

        best_config = ga.evolve(
            X_train_ga, y_train_ga, X_val_ga, y_val_ga,
            progress_callback=ga_progress,
            n_jobs=n_jobs
        )

        training_log = "\n".join(ga.log_messages)

        if best_config is None:
            error_msg = """
## āŒ GA Optimization Failed

The genetic algorithm did not produce a valid configuration.

**Possible causes:**
- All fitness evaluations returned 0
- Training data issues

**Solutions:**
1. Check training log below
2. Try "Simple Training (No GA)"
3. Reduce population/generations

**Training Log:**
"""
            return error_msg + training_log, None, None, training_log

        if progress_callback:
            progress_callback(
                0.8, desc="Training final models with GA config...")

        selected_indices = best_config['feature_indices']
        X_train_selected = X_train_scaled[:, selected_indices]
        X_test_selected = X_test_scaled[:, selected_indices]

        models, accuracies = _train_all_models(
            X_train_selected, y_train, X_test_selected, y_test,
            n_classes, best_config, selected_models
        )

        # NOTE(review): assumes the GA returns one weight per selected model,
        # in selected_models order - confirm in GeneticAlgorithm.
        weights = best_config['weights']

        best_config_lines = _format_best_config(best_config, selected_models)

        ga_summary = f"""
### 🧬 GA Optimization Results:
- **Generations Completed**: {len(ga.history)}/{ga_generations}
- **Population Size**: {ga_population}
- **Best Fitness**: {ga.best_fitness:.4f}
- **Parallel Jobs**: {n_jobs}
- **Feature Selection**: {'Enabled' if optimize_features else 'Disabled'}
- **Features Used**: {len(selected_indices)}

### šŸŽÆ Best Configuration:
{best_config_lines}
"""
        ga_history_df = pd.DataFrame(ga.history)

    else:
        if progress_callback:
            progress_callback(0.3, desc="Selecting features...")

        selected_indices = _select_features_by_variance(
            X_train, optimize_features, n_features_select)
        X_train_selected = X_train_scaled[:, selected_indices]
        X_test_selected = X_test_scaled[:, selected_indices]

        # selected_models must be passed by keyword: the next positional
        # slot after progress_callback is fold_idx.
        models, accuracies = _train_all_models_default(
            X_train_selected, y_train, X_test_selected, y_test,
            n_classes, progress_callback,
            selected_models=selected_models
        )

        weights = _accuracy_weights(accuracies, selected_models)

        ga_summary = f"\n### ⚔ Simple Training (No GA)\n- **Feature Selection**: {'Enabled' if optimize_features else 'Disabled'}\n- **Features Used**: {len(selected_indices)}\n"
        ga_history_df = None
        training_log = "Simple training mode - no GA logs"

    if progress_callback:
        progress_callback(0.9, desc="Creating ensemble...")

    ensemble_acc = _evaluate_ensemble(
        models, selected_models, X_test_selected, y_test, weights)

    if progress_callback:
        progress_callback(0.95, desc="Saving models...")

    _save_models(models, scaler, label_encoder, selected_indices,
                 weights, accuracies, ensemble_acc,
                 selected_models=selected_models)

    if progress_callback:
        progress_callback(1.0, desc="Complete!")

    results_data = [
        {
            'Model': _MODEL_DISPLAY_NAMES[model_name],
            'Test Accuracy': accuracies[model_name]
        }
        for model_name in selected_models
    ]
    results_data.append({
        'Model': 'Ensemble',
        'Test Accuracy': ensemble_acc
    })
    results_df = pd.DataFrame(results_data)

    # Build summary with selected models
    model_results = ""
    for model_name in selected_models:
        model_results += f"- **{model_name.upper()}**: {accuracies[model_name]:.4f}\n"

    weight_dict = dict(zip(
        [m.upper() for m in selected_models],
        [f'{w:.3f}' for w in weights]
    ))

    summary = f"""
## āœ… Training Complete!
{ga_summary}
### šŸ“Š Model Performance:
{model_results}
- **Ensemble**: {ensemble_acc:.4f} ⭐

### āš–ļø Ensemble Weights:
{weight_dict}

### šŸ’¾ Saved Files:
- Model files in `weights/`
- Configuration in `weights/config.json`

āœ… **Models ready for prediction!**
"""

    return summary, results_df, ga_history_df, training_log


def _train_all_models(X_train, y_train, X_test, y_test, n_classes,
                      config_dict, selected_models):
    """Train all models with given configuration

    Args:
        X_train, y_train: Training features and encoded labels.
        X_test, y_test: Held-out features and labels used for scoring.
        n_classes: Number of target classes.
        config_dict: Hyper-parameters. The ``*_n_estimators``,
            ``*_learning_rate`` and depth/leaves keys are required for each
            selected model; the remaining keys fall back to defaults.
        selected_models: Model keys to train.

    Returns:
        tuple: (models, accuracies), both dicts keyed by model name.
    """
    models = {}
    accuracies = {}

    if 'xgboost' in selected_models:
        xgb = XGBClassifier(
            n_estimators=config_dict['xgb_n_estimators'],
            max_depth=config_dict['xgb_max_depth'],
            learning_rate=config_dict['xgb_learning_rate'],
            subsample=config_dict.get('xgb_subsample', 0.8),
            colsample_bytree=config_dict.get('xgb_colsample_bytree', 0.8),
            min_child_weight=config_dict.get('xgb_min_child_weight', 1),
            gamma=config_dict.get('xgb_gamma', 0),
            objective='multi:softprob',
            num_class=n_classes,
            random_state=config.RANDOM_STATE,
            n_jobs=-1,
            verbosity=0
        )
        xgb.fit(X_train, y_train)
        models['xgboost'] = xgb
        accuracies['xgboost'] = xgb.score(X_test, y_test)

    if 'lightgbm' in selected_models:
        lgbm = LGBMClassifier(
            n_estimators=config_dict['lgbm_n_estimators'],
            num_leaves=config_dict['lgbm_num_leaves'],
            learning_rate=config_dict['lgbm_learning_rate'],
            min_child_samples=config_dict.get('lgbm_min_child_samples', 20),
            subsample=config_dict.get('lgbm_subsample', 0.8),
            colsample_bytree=config_dict.get('lgbm_colsample_bytree', 0.8),
            reg_alpha=config_dict.get('lgbm_reg_alpha', 0),
            reg_lambda=config_dict.get('lgbm_reg_lambda', 0),
            objective='multiclass',
            num_class=n_classes,
            random_state=config.RANDOM_STATE,
            n_jobs=-1,
            verbose=-1,
            force_col_wise=True
        )
        lgbm.fit(X_train, y_train)
        models['lightgbm'] = lgbm
        accuracies['lightgbm'] = lgbm.score(X_test, y_test)

    if 'gradientboosting' in selected_models:
        gb = GradientBoostingClassifier(
            n_estimators=config_dict['gb_n_estimators'],
            max_depth=config_dict['gb_max_depth'],
            learning_rate=config_dict['gb_learning_rate'],
            subsample=config_dict.get('gb_subsample', 0.8),
            min_samples_split=config_dict.get('gb_min_samples_split', 2),
            min_samples_leaf=config_dict.get('gb_min_samples_leaf', 1),
            random_state=config.RANDOM_STATE
        )
        gb.fit(X_train, y_train)
        models['gradientboosting'] = gb
        accuracies['gradientboosting'] = gb.score(X_test, y_test)

    if 'adaboost' in selected_models:
        ada = AdaBoostClassifier(
            n_estimators=config_dict['ada_n_estimators'],
            learning_rate=config_dict['ada_learning_rate'],
            # algorithm=config.ADABOOST_ALGORITHM,
            random_state=config.RANDOM_STATE
        )
        ada.fit(X_train, y_train)
        models['adaboost'] = ada
        accuracies['adaboost'] = ada.score(X_test, y_test)

    return models, accuracies


def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
                              progress_callback=None, fold_idx=None,
                              n_folds=None, base_progress=0, total_steps=1,
                              selected_models=None):
    """Train all models with default hyperparameters

    Args:
        X_train, y_train: Training features and encoded labels.
        X_test, y_test: Held-out features and labels used for scoring.
        n_classes: Number of target classes.
        progress_callback: Optional ``callback(fraction, desc=...)``.
        fold_idx, n_folds: When ``fold_idx`` is truthy, progress is reported
            relative to that fold; otherwise as an absolute fraction.
        base_progress, total_steps: Progress offset and scale for fold mode.
        selected_models: Model keys to train; ``None`` trains all four.

    Returns:
        tuple: (models, accuracies), both dicts keyed by model name.
    """
    if selected_models is None:
        selected_models = list(_ALL_MODELS)

    models = {}
    accuracies = {}

    def _report(fraction, label):
        # Inside a CV fold the fraction is scaled into that fold's slice of
        # the overall progress bar; otherwise it is reported as-is.
        if not progress_callback:
            return
        if fold_idx:
            progress_callback(base_progress + fraction/total_steps,
                              desc=f"Fold {fold_idx}/{n_folds}: Training {label}...")
        else:
            progress_callback(fraction, desc=f"Training {label}...")

    if 'xgboost' in selected_models:
        _report(0.4, "XGBoost")
        xgb = XGBClassifier(
            n_estimators=150,
            max_depth=5,
            learning_rate=0.1,
            objective='multi:softprob',
            num_class=n_classes,
            random_state=config.RANDOM_STATE,
            n_jobs=-1,
            verbosity=0
        )
        xgb.fit(X_train, y_train)
        models['xgboost'] = xgb
        accuracies['xgboost'] = xgb.score(X_test, y_test)

    if 'lightgbm' in selected_models:
        _report(0.5, "LightGBM")
        lgbm = LGBMClassifier(
            n_estimators=150,
            num_leaves=40,
            learning_rate=0.1,
            objective='multiclass',
            num_class=n_classes,
            random_state=config.RANDOM_STATE,
            n_jobs=-1,
            verbose=-1,
            force_col_wise=True
        )
        lgbm.fit(X_train, y_train)
        models['lightgbm'] = lgbm
        accuracies['lightgbm'] = lgbm.score(X_test, y_test)

    if 'gradientboosting' in selected_models:
        _report(0.65, "Gradient Boosting")
        gb = GradientBoostingClassifier(
            n_estimators=100,
            max_depth=4,
            learning_rate=0.1,
            random_state=config.RANDOM_STATE
        )
        gb.fit(X_train, y_train)
        models['gradientboosting'] = gb
        accuracies['gradientboosting'] = gb.score(X_test, y_test)

    if 'adaboost' in selected_models:
        _report(0.8, "AdaBoost")
        ada = AdaBoostClassifier(
            n_estimators=100,
            learning_rate=1.0,
            # algorithm=config.ADABOOST_ALGORITHM,
            random_state=config.RANDOM_STATE
        )
        ada.fit(X_train, y_train)
        models['adaboost'] = ada
        accuracies['adaboost'] = ada.score(X_test, y_test)

    return models, accuracies


def _save_models(models, scaler, label_encoder, selected_indices,
                 weights, accuracies, ensemble_acc, cv_results=None,
                 selected_models=None):
    """Save all models and configuration

    Pickles each selected model, the scaler and the label encoder into
    ``config.WEIGHTS_DIR`` and writes ``config.json`` describing the selected
    features, ensemble weights, class names and accuracies.

    Args:
        models: Dict of fitted estimators keyed by model name.
        scaler: Fitted scaler to persist.
        label_encoder: Fitted label encoder to persist.
        selected_indices: Feature indices used for training (array or list).
        weights: Ensemble weights (array or list).
        accuracies: Dict of per-model accuracies keyed by model name.
        ensemble_acc: Accuracy of the weighted ensemble.
        cv_results: Optional per-fold records; their presence marks the
            training mode as ``cross_validation``.
        selected_models: Model keys to save; ``None`` saves every key in
            ``models``.
    """
    config.WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)

    # Save only selected models
    if selected_models is None:
        selected_models = list(models.keys())
    selected_models = list(selected_models)

    model_files = {
        'xgboost': 'xgboost_model.pkl',
        'lightgbm': 'lightgbm_model.pkl',
        'gradientboosting': 'gradientboost_model.pkl',
        'adaboost': 'adaboost_model.pkl'
    }

    for model_name in selected_models:
        if model_name in models:
            with open(config.WEIGHTS_DIR / model_files[model_name], 'wb') as f:
                pickle.dump(models[model_name], f)

    with open(config.WEIGHTS_DIR / 'scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)

    with open(config.WEIGHTS_DIR / 'label_encoder.pkl', 'wb') as f:
        pickle.dump(label_encoder, f)

    # np.asarray(...).tolist() accepts both arrays and plain lists (the GA
    # configuration may supply either) and yields JSON-serialisable values.
    model_config = {
        'selected_features': np.asarray(selected_indices).tolist(),
        'ensemble_weights': np.asarray(weights).tolist(),
        'n_features': len(selected_indices),
        'emotions': label_encoder.classes_.tolist(),
        'selected_models': selected_models,  # NEW
        'model_accuracies': {
            model_name: float(accuracies[model_name])
            for model_name in selected_models
        }
    }
    model_config['model_accuracies']['ensemble'] = float(ensemble_acc)

    if cv_results is not None:
        model_config['cv_results'] = cv_results
        model_config['training_mode'] = 'cross_validation'
    else:
        model_config['training_mode'] = 'single_split'

    with open(config.WEIGHTS_DIR / 'config.json', 'w') as f:
        json.dump(model_config, f, indent=2)