nguyennp86 committed
Commit 3075c38 · Parent: cafbe14

Update: fix GA feature-selection code

features_ravdess.json CHANGED
@@ -12,5 +12,5 @@
     "mel": 0
   },
   "n_samples": 1440,
-  "extraction_date": "2025-10-04T21:13:14.967210"
+  "extraction_date": "2025-10-05T12:24:04.258750"
 }
src/__pycache__/genetic_algorithm.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/genetic_algorithm.cpython-311.pyc and b/src/__pycache__/genetic_algorithm.cpython-311.pyc differ
 
src/__pycache__/training.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/training.cpython-311.pyc and b/src/__pycache__/training.cpython-311.pyc differ
 
src/genetic_algorithm.py CHANGED
@@ -1,6 +1,5 @@
 """
 Genetic Algorithm for feature selection and hyperparameter optimization
-Supports AdaBoost algorithm selection and variable MFCC counts
 """
 
 import numpy as np
@@ -17,7 +16,6 @@ from sklearn.metrics import accuracy_score
 
 import config
 
-# Suppress LightGBM warnings
 warnings.filterwarnings(
     'ignore', message='X does not have valid feature names')
 warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
@@ -26,18 +24,38 @@ warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
 class GeneticAlgorithm:
     """GA for optimizing features + hyperparameters + ensemble weights"""
 
-    def __init__(self, X: np.ndarray, y: np.ndarray, n_features_to_select: int = 80):
+    def __init__(self, X: np.ndarray, y: np.ndarray,
+                 n_features_to_select: int = 80,
+                 skip_feature_selection: bool = False):
+        """
+        Initialize GA
+
+        Args:
+            X: Training data
+            y: Training labels
+            n_features_to_select: Number of features to select
+            skip_feature_selection: If True, use all features (only optimize hyperparams)
+        """
         self.X = X
         self.y = y
         self.n_features = X.shape[1]
+        self.skip_feature_selection = skip_feature_selection
 
-        # Auto-adjust if requested features exceed available
-        if n_features_to_select > self.n_features:
-            print(
-                f"⚠️ Adjusted: {n_features_to_select} → {self.n_features} features")
+        if skip_feature_selection:
+            # Use ALL features, no selection
             self.n_select = self.n_features
+            print(
+                f"✅ GA will optimize: HYPERPARAMETERS ONLY (using all {self.n_features} features)")
         else:
-            self.n_select = n_features_to_select
+            # GA selects features
+            if n_features_to_select > self.n_features:
+                print(
+                    f"⚠️ Adjusted: {n_features_to_select} → {self.n_features} features")
+                self.n_select = self.n_features
+            else:
+                self.n_select = n_features_to_select
+            print(
+                f"✅ GA will optimize: FEATURES ({self.n_select}/{self.n_features}) + HYPERPARAMETERS")
 
         self.n_classes = len(np.unique(y))
 
@@ -63,14 +81,22 @@ class GeneticAlgorithm:
         print(log_entry)
 
     def create_chromosome(self) -> Dict:
-        """Create random chromosome with ALL hyperparameters including AdaBoost algorithm"""
-        chromosome = {
-            'feature_indices': np.sort(np.random.choice(
-                self.n_features, self.n_select, replace=False
+        """Create random chromosome"""
+
+        chromosome = {}
+
+        # Feature selection (skip if not optimizing features)
+        if self.skip_feature_selection:
+            # Use ALL features
+            chromosome['feature_indices'] = np.arange(self.n_features)
+        else:
+            # Select random subset
+            n_to_select = min(self.n_select, self.n_features)
+            chromosome['feature_indices'] = np.sort(np.random.choice(
+                self.n_features, n_to_select, replace=False
             ))
-        }
 
-        # Add ALL hyperparameters for each model
+        # Add hyperparameters for each model
        for model_prefix, params in config.MODEL_HYPERPARAMS.items():
             for param_name, param_values in params.items():
                 key = f"{model_prefix}_{param_name}"
@@ -87,23 +113,16 @@
 
     def fitness(self, chromosome: Dict, X_train: np.ndarray, y_train: np.ndarray,
                 X_val: np.ndarray, y_val: np.ndarray) -> float:
-        """
-        Calculate fitness using validation accuracy
-
-        Now optimizes AdaBoost algorithm ('SAMME' vs 'SAMME.R')
-        """
+        """Calculate fitness using validation accuracy"""
         try:
             feature_indices = chromosome['feature_indices']
 
-            # Keep as NumPy arrays - FAST and efficient
             X_train_selected = X_train[:, feature_indices]
             X_val_selected = X_val[:, feature_indices]
 
             models = []
 
-            # ================================================================
             # XGBoost
-            # ================================================================
             xgb = XGBClassifier(
                 n_estimators=chromosome.get('xgb_n_estimators', 100),
                 max_depth=chromosome.get('xgb_max_depth', 6),
@@ -121,9 +140,7 @@ class GeneticAlgorithm:
             xgb.fit(X_train_selected, y_train)
             models.append(xgb)
 
-            # ================================================================
             # LightGBM
-            # ================================================================
             lgbm = LGBMClassifier(
                 n_estimators=chromosome.get('lgbm_n_estimators', 100),
                 num_leaves=chromosome.get('lgbm_num_leaves', 31),
@@ -143,9 +160,7 @@
             lgbm.fit(X_train_selected, y_train)
             models.append(lgbm)
 
-            # ================================================================
             # Gradient Boosting
-            # ================================================================
             gb = GradientBoostingClassifier(
                 n_estimators=chromosome.get('gb_n_estimators', 100),
                 max_depth=chromosome.get('gb_max_depth', 5),
@@ -158,23 +173,17 @@
             gb.fit(X_train_selected, y_train)
             models.append(gb)
 
-            # ================================================================
-            # AdaBoost - NOW WITH ALGORITHM OPTIMIZATION
-            # ================================================================
-            ada_algorithm = chromosome.get(
-                'ada_algorithm', 'SAMME')  # ← GA optimizes this!
-
+            # AdaBoost
             ada = AdaBoostClassifier(
                 n_estimators=chromosome.get('ada_n_estimators', 100),
                 learning_rate=chromosome.get('ada_learning_rate', 1.0),
+                # algorithm=config.ADABOOST_ALGORITHM,
                 random_state=config.RANDOM_STATE
             )
             ada.fit(X_train_selected, y_train)
             models.append(ada)
 
-            # ================================================================
-            # Ensemble Prediction
-            # ================================================================
+            # Ensemble prediction
             predictions = [model.predict_proba(
                 X_val_selected) for model in models]
             weights = chromosome['weights']
@@ -198,31 +207,39 @@
         child1 = {}
         child2 = {}
 
-        # Feature crossover
-        mask = np.random.rand(self.n_select) < 0.5
-        child1_features = np.where(
-            mask, parent1['feature_indices'], parent2['feature_indices'])
-        child2_features = np.where(
-            mask, parent2['feature_indices'], parent1['feature_indices'])
-
-        child1_features = np.unique(child1_features)
-        child2_features = np.unique(child2_features)
-
-        # Fill to required size
-        while len(child1_features) < self.n_select:
-            new_feat = random.randint(0, self.n_features - 1)
-            if new_feat not in child1_features:
-                child1_features = np.append(child1_features, new_feat)
+        # Feature crossover (only if not skipping feature selection)
+        if self.skip_feature_selection:
+            # Keep all features
+            child1['feature_indices'] = parent1['feature_indices'].copy()
+            child2['feature_indices'] = parent2['feature_indices'].copy()
+        else:
+            # Crossover features
+            mask = np.random.rand(self.n_select) < 0.5
+            child1_features = np.where(
+                mask, parent1['feature_indices'], parent2['feature_indices'])
+            child2_features = np.where(
+                mask, parent2['feature_indices'], parent1['feature_indices'])
+
+            child1_features = np.unique(child1_features)
+            child2_features = np.unique(child2_features)
+
+            # Fill to required size
+            while len(child1_features) < self.n_select:
+                new_feat = random.randint(0, self.n_features - 1)
+                if new_feat not in child1_features:
+                    child1_features = np.append(child1_features, new_feat)
 
-        while len(child2_features) < self.n_select:
-            new_feat = random.randint(0, self.n_features - 1)
-            if new_feat not in child2_features:
-                child2_features = np.append(child2_features, new_feat)
+            while len(child2_features) < self.n_select:
+                new_feat = random.randint(0, self.n_features - 1)
+                if new_feat not in child2_features:
+                    child2_features = np.append(child2_features, new_feat)
 
-        child1['feature_indices'] = np.sort(child1_features[:self.n_select])
-        child2['feature_indices'] = np.sort(child2_features[:self.n_select])
+            child1['feature_indices'] = np.sort(
+                child1_features[:self.n_select])
+            child2['feature_indices'] = np.sort(
+                child2_features[:self.n_select])
 
-        # Hyperparameter crossover (including AdaBoost algorithm)
+        # Hyperparameter crossover
         for key in parent1.keys():
             if key != 'feature_indices':
                 if random.random() < 0.5:
@@ -238,21 +255,23 @@
         """Mutation operation"""
         mutated = chromosome.copy()
 
-        # Feature mutation
-        if random.random() < self.mutation_rate:
-            n_replace = random.randint(1, 5)
-            indices_to_replace = np.random.choice(
-                self.n_select, n_replace, replace=False)
+        # Feature mutation (only if not skipping feature selection)
+        if not self.skip_feature_selection:
+            if random.random() < self.mutation_rate:
+                n_replace = random.randint(1, min(5, self.n_select))
+                indices_to_replace = np.random.choice(
+                    self.n_select, n_replace, replace=False)
 
-            for idx in indices_to_replace:
-                new_feat = random.randint(0, self.n_features - 1)
-                while new_feat in mutated['feature_indices']:
-                    new_feat = random.randint(0, self.n_features - 1)
-                mutated['feature_indices'][idx] = new_feat
+                for idx in indices_to_replace:
+                    new_feat = random.randint(0, self.n_features - 1)
+                    while new_feat in mutated['feature_indices']:
+                        new_feat = random.randint(0, self.n_features - 1)
+                    mutated['feature_indices'][idx] = new_feat
 
-        mutated['feature_indices'] = np.sort(mutated['feature_indices'])
+                mutated['feature_indices'] = np.sort(
+                    mutated['feature_indices'])
 
-        # Hyperparameter mutation (including AdaBoost algorithm)
+        # Hyperparameter mutation
         if random.random() < self.mutation_rate:
             param_keys = [k for k in chromosome.keys() if k not in [
                 'feature_indices', 'weights']]
@@ -286,28 +305,17 @@
                X_val: np.ndarray, y_val: np.ndarray,
                progress_callback: Optional[Callable] = None,
                n_jobs: int = 2) -> Dict:
-        """
-        Main GA evolution loop with parallel evaluation, early stopping, and logging
-
-        Args:
-            X_train, y_train: Training data (NumPy arrays)
-            X_val, y_val: Validation data (NumPy arrays)
-            progress_callback: Optional callback for progress updates
-            n_jobs: Number of parallel jobs
-
-        Returns:
-            Best chromosome found
-        """
+        """Main GA evolution loop"""
 
         self.log("="*70)
         self.log("🧬 GENETIC ALGORITHM OPTIMIZATION")
         self.log("="*70)
         self.log(f"Population size: {self.population_size}")
         self.log(f"Generations: {self.n_generations}")
-        self.log(f"Features to select: {self.n_select}/{self.n_features}")
+        self.log(
+            f"Feature selection: {'DISABLED (hyperparams only)' if self.skip_feature_selection else f'ENABLED ({self.n_select}/{self.n_features})'}")
         self.log(f"Early stopping patience: {self.early_stopping_patience}")
         self.log(f"Parallel jobs: {n_jobs}")
-        self.log(f"Optimizing AdaBoost algorithm: SAMME vs SAMME.R")
         self.log("="*70)
 
         population = [self.create_chromosome()
@@ -339,13 +347,8 @@
                 self.best_chromosome = population[max_idx].copy()
                 no_improve_count = 0
                 improved = True
-
-                # Log best configuration
-                best_ada_algo = self.best_chromosome.get(
-                    'ada_algorithm', 'SAMME')
                 self.log(
                     f" ✨ NEW BEST: {max_fitness:.4f} (+{max_fitness - prev_best:.4f})")
-                self.log(f" AdaBoost algorithm: {best_ada_algo}")
             else:
                 no_improve_count += 1
                 self.log(
@@ -421,11 +424,6 @@
         self.log(f"Total time: {total_time/60:.1f} minutes")
         self.log(
             f"Average time per generation: {total_time/len(self.history):.1f}s")
-
-        if self.best_chromosome:
-            self.log(
-                f"\n🎯 Best AdaBoost Algorithm: {self.best_chromosome.get('ada_algorithm', 'SAMME')}")
-
         self.log("="*70)
 
         if self.best_chromosome is None:
 
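The net effect of the genetic_algorithm.py change is a new hyperparameters-only mode: with `skip_feature_selection=True`, every chromosome keeps all feature indices, and crossover and mutation touch only the model hyperparameters. A minimal usage sketch of the updated interface follows; the array shapes, class count, split, and GA settings are illustrative assumptions, not values from this repo:

```python
import numpy as np
from sklearn.model_selection import train_test_split
from src.genetic_algorithm import GeneticAlgorithm

# Stand-in data; the shapes and the 8-class label space are assumptions
X = np.random.rand(1440, 120).astype(np.float32)
y = np.random.randint(0, 8, size=1440)
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Hyperparameters-only mode: chromosome['feature_indices'] is np.arange(120),
# so crossover/mutation only vary the model hyperparameters
ga = GeneticAlgorithm(X_tr, y_tr, skip_feature_selection=True)

# Feature-selection mode: each chromosome carries 80 of the 120 indices
ga = GeneticAlgorithm(X_tr, y_tr, n_features_to_select=80)
ga.population_size = 20   # set externally, as training.py does
ga.n_generations = 10

best = ga.evolve(X_tr, y_tr, X_val, y_val, n_jobs=2)
selected = best['feature_indices']  # sorted indices of the winning subset
```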
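Inside `fitness()`, the four fitted boosters are combined by weighted soft voting over their `predict_proba` outputs, and the score is the validation accuracy of that vote. A self-contained sketch of the combination step, with random probability rows standing in for the fitted models' outputs:

```python
import numpy as np
from sklearn.metrics import accuracy_score

rng = np.random.default_rng(0)
n_val, n_classes = 288, 8                  # illustrative sizes
y_val = rng.integers(0, n_classes, size=n_val)

# Stand-ins for [model.predict_proba(X_val_selected) for model in models]
predictions = [rng.dirichlet(np.ones(n_classes), size=n_val) for _ in range(4)]
weights = np.array([0.4, 0.3, 0.2, 0.1])   # plays the role of chromosome['weights']

# Weighted average of class probabilities, then argmax per sample
ensemble_proba = sum(w * p for w, p in zip(weights, predictions))
y_pred = ensemble_proba.argmax(axis=1)
print(accuracy_score(y_val, y_pred))       # fitness = validation accuracy
```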
src/training.py CHANGED
@@ -16,9 +16,13 @@ from sklearn.metrics import accuracy_score, classification_report, confusion_mat
 from xgboost import XGBClassifier
 from lightgbm import LGBMClassifier
 from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
-
+import warnings
 import config
 from src.genetic_algorithm import GeneticAlgorithm
+# Suppress LightGBM feature name warnings
+warnings.filterwarnings(
+    'ignore', message='X does not have valid feature names')
+warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
 
 
 def train_models_with_ga(use_ga: bool = True,
@@ -203,7 +207,8 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
             desc=f"Fold {fold_idx}/{n_folds}: Running GA optimization...")
 
         ga = GeneticAlgorithm(X_train_ga, y_train_ga,
-                              n_features_to_select=n_features_select)
+                              n_features_to_select=n_features_select,
+                              skip_feature_selection=(not optimize_features) or (n_features_select == n_features_available))
         ga.population_size = ga_population
         ga.n_generations = ga_generations
 
@@ -701,7 +706,7 @@ def _train_all_models(X_train, y_train, X_test, y_test, n_classes, config_dict):
         ada = AdaBoostClassifier(
             n_estimators=config_dict['ada_n_estimators'],
             learning_rate=config_dict['ada_learning_rate'],
-            algorithm=config.ADABOOST_ALGORITHM,
+            # algorithm=config.ADABOOST_ALGORITHM,
             random_state=config.RANDOM_STATE
         )
         ada.fit(X_train, y_train)
@@ -772,7 +777,7 @@ def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
         ada = AdaBoostClassifier(
             n_estimators=100,
             learning_rate=1.0,
-            algorithm=config.ADABOOST_ALGORITHM,
+            # algorithm=config.ADABOOST_ALGORITHM,
             random_state=config.RANDOM_STATE
         )
         ada.fit(X_train, y_train)
 
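Both `_train_all_models` and `_train_all_models_default` now comment out `algorithm=config.ADABOOST_ALGORITHM`, consistent with scikit-learn's deprecation of AdaBoost's `algorithm` parameter (as I read the timeline: `SAMME.R` was deprecated in 1.4, and from 1.6 `SAMME` is the only implementation and the parameter itself is deprecated). If the option must still be passed on older environments, a version guard is one alternative to deleting it; this is a sketch under that assumption, not part of the commit:

```python
import sklearn
from sklearn.ensemble import AdaBoostClassifier
from packaging.version import Version  # assumes `packaging` is installed

kwargs = dict(n_estimators=100, learning_rate=1.0, random_state=42)
if Version(sklearn.__version__) < Version("1.6"):
    # Only releases before 1.6 accept `algorithm` without a deprecation warning
    kwargs["algorithm"] = "SAMME"

ada = AdaBoostClassifier(**kwargs)
```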