nguyennp86 committed
Commit 3075c38 · Parent: cafbe14

Update: fix GA feature-selection code

features_ravdess.json CHANGED
@@ -12,5 +12,5 @@
     "mel": 0
   },
   "n_samples": 1440,
-  "extraction_date": "2025-10-04T21:13:14.967210"
+  "extraction_date": "2025-10-05T12:24:04.258750"
 }
src/__pycache__/genetic_algorithm.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/genetic_algorithm.cpython-311.pyc and b/src/__pycache__/genetic_algorithm.cpython-311.pyc differ
 
src/__pycache__/training.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/training.cpython-311.pyc and b/src/__pycache__/training.cpython-311.pyc differ
 
src/genetic_algorithm.py CHANGED
@@ -1,6 +1,5 @@
 """
 Genetic Algorithm for feature selection and hyperparameter optimization
-Supports AdaBoost algorithm selection and variable MFCC counts
 """
 
 import numpy as np
@@ -17,7 +16,6 @@ from sklearn.metrics import accuracy_score
 
 import config
 
-# Suppress LightGBM warnings
 warnings.filterwarnings(
     'ignore', message='X does not have valid feature names')
 warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
@@ -26,18 +24,38 @@ warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
 class GeneticAlgorithm:
     """GA for optimizing features + hyperparameters + ensemble weights"""
 
-    def __init__(self, X: np.ndarray, y: np.ndarray, n_features_to_select: int = 80):
+    def __init__(self, X: np.ndarray, y: np.ndarray,
+                 n_features_to_select: int = 80,
+                 skip_feature_selection: bool = False):
+        """
+        Initialize GA
+
+        Args:
+            X: Training data
+            y: Training labels
+            n_features_to_select: Number of features to select
+            skip_feature_selection: If True, use all features (only optimize hyperparams)
+        """
         self.X = X
         self.y = y
         self.n_features = X.shape[1]
+        self.skip_feature_selection = skip_feature_selection
 
-        # Auto-adjust if requested features exceed available
-        if n_features_to_select > self.n_features:
-            print(
-                f"⚠️ Adjusted: {n_features_to_select} → {self.n_features} features")
+        if skip_feature_selection:
+            # Use ALL features, no selection
             self.n_select = self.n_features
+            print(
+                f"✅ GA will optimize: HYPERPARAMETERS ONLY (using all {self.n_features} features)")
         else:
-            self.n_select = n_features_to_select
+            # GA selects features
+            if n_features_to_select > self.n_features:
+                print(
+                    f"⚠️ Adjusted: {n_features_to_select} → {self.n_features} features")
+                self.n_select = self.n_features
+            else:
+                self.n_select = n_features_to_select
+            print(
+                f"✅ GA will optimize: FEATURES ({self.n_select}/{self.n_features}) + HYPERPARAMETERS")
 
         self.n_classes = len(np.unique(y))
 
@@ -63,14 +81,22 @@ class GeneticAlgorithm:
         print(log_entry)
 
     def create_chromosome(self) -> Dict:
-        """Create random chromosome with ALL hyperparameters including AdaBoost algorithm"""
-        chromosome = {
-            'feature_indices': np.sort(np.random.choice(
-                self.n_features, self.n_select, replace=False
+        """Create random chromosome"""
+
+        chromosome = {}
+
+        # Feature selection (skip if not optimizing features)
+        if self.skip_feature_selection:
+            # Use ALL features
+            chromosome['feature_indices'] = np.arange(self.n_features)
+        else:
+            # Select random subset
+            n_to_select = min(self.n_select, self.n_features)
+            chromosome['feature_indices'] = np.sort(np.random.choice(
+                self.n_features, n_to_select, replace=False
             ))
-        }
 
-        # Add ALL hyperparameters for each model
+        # Add hyperparameters for each model
        for model_prefix, params in config.MODEL_HYPERPARAMS.items():
             for param_name, param_values in params.items():
                 key = f"{model_prefix}_{param_name}"
@@ -87,23 +113,16 @@
 
     def fitness(self, chromosome: Dict, X_train: np.ndarray, y_train: np.ndarray,
                 X_val: np.ndarray, y_val: np.ndarray) -> float:
-        """
-        Calculate fitness using validation accuracy
-
-        Now optimizes AdaBoost algorithm ('SAMME' vs 'SAMME.R')
-        """
+        """Calculate fitness using validation accuracy"""
         try:
             feature_indices = chromosome['feature_indices']
 
-            # Keep as NumPy arrays - FAST and efficient
             X_train_selected = X_train[:, feature_indices]
             X_val_selected = X_val[:, feature_indices]
 
             models = []
 
-            # ================================================================
             # XGBoost
-            # ================================================================
             xgb = XGBClassifier(
                 n_estimators=chromosome.get('xgb_n_estimators', 100),
                 max_depth=chromosome.get('xgb_max_depth', 6),
@@ -121,9 +140,7 @@ class GeneticAlgorithm:
             xgb.fit(X_train_selected, y_train)
             models.append(xgb)
 
-            # ================================================================
             # LightGBM
-            # ================================================================
             lgbm = LGBMClassifier(
                 n_estimators=chromosome.get('lgbm_n_estimators', 100),
                 num_leaves=chromosome.get('lgbm_num_leaves', 31),
@@ -143,9 +160,7 @@
             lgbm.fit(X_train_selected, y_train)
             models.append(lgbm)
 
-            # ================================================================
             # Gradient Boosting
-            # ================================================================
             gb = GradientBoostingClassifier(
                 n_estimators=chromosome.get('gb_n_estimators', 100),
                 max_depth=chromosome.get('gb_max_depth', 5),
@@ -158,23 +173,17 @@
             gb.fit(X_train_selected, y_train)
             models.append(gb)
 
-            # ================================================================
-            # AdaBoost - NOW WITH ALGORITHM OPTIMIZATION
-            # ================================================================
-            ada_algorithm = chromosome.get(
-                'ada_algorithm', 'SAMME')  # ← GA optimizes this!
-
+            # AdaBoost
             ada = AdaBoostClassifier(
                 n_estimators=chromosome.get('ada_n_estimators', 100),
                 learning_rate=chromosome.get('ada_learning_rate', 1.0),
+                # algorithm=config.ADABOOST_ALGORITHM,
                 random_state=config.RANDOM_STATE
             )
             ada.fit(X_train_selected, y_train)
             models.append(ada)
 
-            # ================================================================
-            # Ensemble Prediction
-            # ================================================================
+            # Ensemble prediction
             predictions = [model.predict_proba(
                 X_val_selected) for model in models]
             weights = chromosome['weights']
@@ -198,31 +207,39 @@
         child1 = {}
         child2 = {}
 
-        # Feature crossover
-        mask = np.random.rand(self.n_select) < 0.5
-        child1_features = np.where(
-            mask, parent1['feature_indices'], parent2['feature_indices'])
-        child2_features = np.where(
-            mask, parent2['feature_indices'], parent1['feature_indices'])
-
-        child1_features = np.unique(child1_features)
-        child2_features = np.unique(child2_features)
-
-        # Fill to required size
-        while len(child1_features) < self.n_select:
-            new_feat = random.randint(0, self.n_features - 1)
-            if new_feat not in child1_features:
-                child1_features = np.append(child1_features, new_feat)
+        # Feature crossover (only if not skipping feature selection)
+        if self.skip_feature_selection:
+            # Keep all features
+            child1['feature_indices'] = parent1['feature_indices'].copy()
+            child2['feature_indices'] = parent2['feature_indices'].copy()
+        else:
+            # Crossover features
+            mask = np.random.rand(self.n_select) < 0.5
+            child1_features = np.where(
+                mask, parent1['feature_indices'], parent2['feature_indices'])
+            child2_features = np.where(
+                mask, parent2['feature_indices'], parent1['feature_indices'])
+
+            child1_features = np.unique(child1_features)
+            child2_features = np.unique(child2_features)
+
+            # Fill to required size
+            while len(child1_features) < self.n_select:
+                new_feat = random.randint(0, self.n_features - 1)
+                if new_feat not in child1_features:
+                    child1_features = np.append(child1_features, new_feat)
 
-        while len(child2_features) < self.n_select:
-            new_feat = random.randint(0, self.n_features - 1)
-            if new_feat not in child2_features:
-                child2_features = np.append(child2_features, new_feat)
+            while len(child2_features) < self.n_select:
+                new_feat = random.randint(0, self.n_features - 1)
+                if new_feat not in child2_features:
+                    child2_features = np.append(child2_features, new_feat)
 
-        child1['feature_indices'] = np.sort(child1_features[:self.n_select])
-        child2['feature_indices'] = np.sort(child2_features[:self.n_select])
+            child1['feature_indices'] = np.sort(
+                child1_features[:self.n_select])
+            child2['feature_indices'] = np.sort(
+                child2_features[:self.n_select])
 
-        # Hyperparameter crossover (including AdaBoost algorithm)
+        # Hyperparameter crossover
         for key in parent1.keys():
             if key != 'feature_indices':
                 if random.random() < 0.5:
@@ -238,21 +255,23 @@
         """Mutation operation"""
         mutated = chromosome.copy()
 
-        # Feature mutation
-        if random.random() < self.mutation_rate:
-            n_replace = random.randint(1, 5)
-            indices_to_replace = np.random.choice(
-                self.n_select, n_replace, replace=False)
+        # Feature mutation (only if not skipping feature selection)
+        if not self.skip_feature_selection:
+            if random.random() < self.mutation_rate:
+                n_replace = random.randint(1, min(5, self.n_select))
+                indices_to_replace = np.random.choice(
+                    self.n_select, n_replace, replace=False)
 
-            for idx in indices_to_replace:
-                new_feat = random.randint(0, self.n_features - 1)
-                while new_feat in mutated['feature_indices']:
-                    new_feat = random.randint(0, self.n_features - 1)
-                mutated['feature_indices'][idx] = new_feat
+                for idx in indices_to_replace:
+                    new_feat = random.randint(0, self.n_features - 1)
+                    while new_feat in mutated['feature_indices']:
+                        new_feat = random.randint(0, self.n_features - 1)
+                    mutated['feature_indices'][idx] = new_feat
 
-        mutated['feature_indices'] = np.sort(mutated['feature_indices'])
+                mutated['feature_indices'] = np.sort(
+                    mutated['feature_indices'])
 
-        # Hyperparameter mutation (including AdaBoost algorithm)
+        # Hyperparameter mutation
         if random.random() < self.mutation_rate:
             param_keys = [k for k in chromosome.keys() if k not in [
                 'feature_indices', 'weights']]
@@ -286,28 +305,17 @@
                X_val: np.ndarray, y_val: np.ndarray,
                progress_callback: Optional[Callable] = None,
                n_jobs: int = 2) -> Dict:
-        """
-        Main GA evolution loop with parallel evaluation, early stopping, and logging
-
-        Args:
-            X_train, y_train: Training data (NumPy arrays)
-            X_val, y_val: Validation data (NumPy arrays)
-            progress_callback: Optional callback for progress updates
-            n_jobs: Number of parallel jobs
-
-        Returns:
-            Best chromosome found
-        """
+        """Main GA evolution loop"""
 
         self.log("="*70)
         self.log("🧬 GENETIC ALGORITHM OPTIMIZATION")
         self.log("="*70)
         self.log(f"Population size: {self.population_size}")
         self.log(f"Generations: {self.n_generations}")
-        self.log(f"Features to select: {self.n_select}/{self.n_features}")
+        self.log(
+            f"Feature selection: {'DISABLED (hyperparams only)' if self.skip_feature_selection else f'ENABLED ({self.n_select}/{self.n_features})'}")
         self.log(f"Early stopping patience: {self.early_stopping_patience}")
         self.log(f"Parallel jobs: {n_jobs}")
-        self.log(f"Optimizing AdaBoost algorithm: SAMME vs SAMME.R")
         self.log("="*70)
 
         population = [self.create_chromosome()
@@ -339,13 +347,8 @@
                 self.best_chromosome = population[max_idx].copy()
                 no_improve_count = 0
                 improved = True
-
-                # Log best configuration
-                best_ada_algo = self.best_chromosome.get(
-                    'ada_algorithm', 'SAMME')
                 self.log(
                     f" ✨ NEW BEST: {max_fitness:.4f} (+{max_fitness - prev_best:.4f})")
-                self.log(f" AdaBoost algorithm: {best_ada_algo}")
             else:
                 no_improve_count += 1
                 self.log(
@@ -421,11 +424,6 @@
         self.log(f"Total time: {total_time/60:.1f} minutes")
         self.log(
             f"Average time per generation: {total_time/len(self.history):.1f}s")
-
-        if self.best_chromosome:
-            self.log(
-                f"\n🎯 Best AdaBoost Algorithm: {self.best_chromosome.get('ada_algorithm', 'SAMME')}")
-
         self.log("="*70)
 
         if self.best_chromosome is None:
 
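The net effect of the genetic_algorithm.py change is a new hyperparameters-only mode: with `skip_feature_selection=True`, every chromosome keeps all feature indices, and crossover and mutation touch only the model hyperparameters. A minimal usage sketch of the updated interface follows; the array shapes, class count, split, and GA settings are illustrative assumptions, not values from this repo:

```python
import numpy as np
from sklearn.model_selection import train_test_split
from src.genetic_algorithm import GeneticAlgorithm

# Stand-in data; the shapes and the 8-class label space are assumptions
X = np.random.rand(1440, 120).astype(np.float32)
y = np.random.randint(0, 8, size=1440)
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Hyperparameters-only mode: chromosome['feature_indices'] is np.arange(120),
# so crossover/mutation only vary the model hyperparameters
ga = GeneticAlgorithm(X_tr, y_tr, skip_feature_selection=True)

# Feature-selection mode: each chromosome carries 80 of the 120 indices
ga = GeneticAlgorithm(X_tr, y_tr, n_features_to_select=80)
ga.population_size = 20   # set externally, as training.py does
ga.n_generations = 10

best = ga.evolve(X_tr, y_tr, X_val, y_val, n_jobs=2)
selected = best['feature_indices']  # sorted indices of the winning subset
```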
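Inside `fitness()`, the four fitted boosters are combined by weighted soft voting over their `predict_proba` outputs, and the score is the validation accuracy of that vote. A self-contained sketch of the combination step, with random probability rows standing in for the fitted models' outputs:

```python
import numpy as np
from sklearn.metrics import accuracy_score

rng = np.random.default_rng(0)
n_val, n_classes = 288, 8                  # illustrative sizes
y_val = rng.integers(0, n_classes, size=n_val)

# Stand-ins for [model.predict_proba(X_val_selected) for model in models]
predictions = [rng.dirichlet(np.ones(n_classes), size=n_val) for _ in range(4)]
weights = np.array([0.4, 0.3, 0.2, 0.1])   # plays the role of chromosome['weights']

# Weighted average of class probabilities, then argmax per sample
ensemble_proba = sum(w * p for w, p in zip(weights, predictions))
y_pred = ensemble_proba.argmax(axis=1)
print(accuracy_score(y_val, y_pred))       # fitness = validation accuracy
```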
src/training.py CHANGED
@@ -16,9 +16,13 @@ from sklearn.metrics import accuracy_score, classification_report, confusion_mat
 from xgboost import XGBClassifier
 from lightgbm import LGBMClassifier
 from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
-
+import warnings
 import config
 from src.genetic_algorithm import GeneticAlgorithm
+# Suppress LightGBM feature name warnings
+warnings.filterwarnings(
+    'ignore', message='X does not have valid feature names')
+warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
 
 
 def train_models_with_ga(use_ga: bool = True,
@@ -203,7 +207,8 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
             desc=f"Fold {fold_idx}/{n_folds}: Running GA optimization...")
 
         ga = GeneticAlgorithm(X_train_ga, y_train_ga,
-                              n_features_to_select=n_features_select)
+                              n_features_to_select=n_features_select,
+                              skip_feature_selection=(not optimize_features) or (n_features_select == n_features_available))
         ga.population_size = ga_population
         ga.n_generations = ga_generations
 
@@ -701,7 +706,7 @@ def _train_all_models(X_train, y_train, X_test, y_test, n_classes, config_dict):
         ada = AdaBoostClassifier(
             n_estimators=config_dict['ada_n_estimators'],
             learning_rate=config_dict['ada_learning_rate'],
-            algorithm=config.ADABOOST_ALGORITHM,
+            # algorithm=config.ADABOOST_ALGORITHM,
             random_state=config.RANDOM_STATE
         )
         ada.fit(X_train, y_train)
@@ -772,7 +777,7 @@ def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
         ada = AdaBoostClassifier(
             n_estimators=100,
             learning_rate=1.0,
-            algorithm=config.ADABOOST_ALGORITHM,
+            # algorithm=config.ADABOOST_ALGORITHM,
             random_state=config.RANDOM_STATE
         )
         ada.fit(X_train, y_train)
 
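Both `_train_all_models` and `_train_all_models_default` now comment out `algorithm=config.ADABOOST_ALGORITHM`, consistent with scikit-learn's deprecation of AdaBoost's `algorithm` parameter (as I read the timeline: `SAMME.R` was deprecated in 1.4, and from 1.6 `SAMME` is the only implementation and the parameter itself is deprecated). If the option must still be passed on older environments, a version guard is one alternative to deleting it; this is a sketch under that assumption, not part of the commit:

```python
import sklearn
from sklearn.ensemble import AdaBoostClassifier
from packaging.version import Version  # assumes `packaging` is installed

kwargs = dict(n_estimators=100, learning_rate=1.0, random_state=42)
if Version(sklearn.__version__) < Version("1.6"):
    # Only releases before 1.6 accept `algorithm` without a deprecation warning
    kwargs["algorithm"] = "SAMME"

ada = AdaBoostClassifier(**kwargs)
```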