nguyennp86 commited on
Commit
cafbe14
·
1 Parent(s): 31a2a2f

update project with selection

Browse files
__pycache__/config.cpython-311.pyc ADDED
Binary file (3.59 kB). View file
 
config.py CHANGED
@@ -13,104 +13,129 @@ FEATURES_CSV = 'features_ravdess.csv'
13
  WEIGHTS_DIR = Path('weights')
14
 
15
  # ============================================================================
16
- # FEATURE EXTRACTION
17
  # ============================================================================
18
  AUDIO_DURATION = 2.5 # seconds
19
  AUDIO_OFFSET = 0.6 # seconds
20
- N_FEATURES = 162
21
 
22
- # Feature breakdown
 
 
 
 
 
23
  FEATURE_CONFIG = {
24
- 'zcr': 1,
25
- 'chroma': 12,
26
- 'mfcc': 20,
27
- 'rms': 1,
28
- 'mel': 128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  }
30
 
 
 
 
 
 
 
 
 
31
  # ============================================================================
32
- # GENETIC ALGORITHM
33
  # ============================================================================
34
- # GA_CONFIG = {
35
- # 'n_features_to_select': 80,
36
- # 'population_size': 15,
37
- # 'n_generations': 20,
38
- # 'mutation_rate': 0.15,
39
- # 'crossover_rate': 0.8,
40
- # 'elite_size': 2,
41
- # 'early_stopping_patience': 15,
42
- # 'early_stopping_tolerance': 0.0001
43
- # }
44
-
45
  GA_CONFIG = {
46
- 'n_features_to_select': 100, # TĂNG từ 80 → 100 (giữ nhiều features hơn)
47
- 'population_size': 20, # GIẢM từ 30 → 20 (faster per generation)
48
- 'n_generations': 30, # TĂNG từ 20 → 30 (more exploration)
49
- 'mutation_rate': 0.2, # TĂNG từ 0.15 → 0.2 (more diversity)
50
- 'crossover_rate': 0.8, # GIỮ NGUYÊN
51
- 'elite_size': 3, # TĂNG từ 2 → 3 (keep more good solutions)
52
- 'early_stopping_patience': 8, # TĂNG từ 5 → 8 (be more patient)
53
- 'early_stopping_tolerance': 0.001 # TĂNG từ 0.0001 → 0.001 (accept smaller improvements)
 
 
 
 
54
  }
55
 
56
  # ============================================================================
57
- # MODEL HYPERPARAMETERS
58
  # ============================================================================
59
- # MODEL_HYPERPARAMS = {
60
- # 'xgb': {
61
- # 'n_estimators': [50, 100, 150],
62
- # 'max_depth': [3, 4, 5, 6],
63
- # 'learning_rate': [0.05, 0.1, 0.15]
64
- # },
65
- # 'lgbm': {
66
- # 'n_estimators': [50, 100, 150],
67
- # 'num_leaves': [20, 31, 40],
68
- # 'learning_rate': [0.05, 0.1, 0.15]
69
- # },
70
- # 'gb': {
71
- # 'n_estimators': [50, 100, 150],
72
- # 'max_depth': [3, 4, 5],
73
- # 'learning_rate': [0.05, 0.1, 0.15]
74
- # },
75
- # 'ada': {
76
- # 'n_estimators': [50, 100, 150],
77
- # 'learning_rate': [0.5, 1.0, 1.5]
78
- # }
79
- # }
80
  MODEL_HYPERPARAMS = {
81
  'xgb': {
82
- 'n_estimators': [100, 200, 300, 400, 500], # Thêm nhiều options
83
- 'max_depth': [4, 5, 6, 7, 8, 9], # Deeper trees
84
- 'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2], # Wider range
85
- 'subsample': [0.7, 0.8, 0.9, 1.0], # NEW: Regularization
86
- 'colsample_bytree': [0.7, 0.8, 0.9, 1.0], # NEW: Feature sampling
87
- 'min_child_weight': [1, 3, 5], # NEW: Regularization
88
- 'gamma': [0, 0.1, 0.2] # NEW: Pruning
 
 
 
89
  },
90
  'lgbm': {
 
91
  'n_estimators': [100, 200, 300, 400, 500],
92
- 'num_leaves': [31, 50, 70, 100, 127], # More options
93
  'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
94
- 'min_child_samples': [10, 20, 30, 50], # NEW: Regularization
95
- 'subsample': [0.7, 0.8, 0.9, 1.0], # NEW: Regularization
96
- 'colsample_bytree': [0.7, 0.8, 0.9, 1.0], # NEW: Feature sampling
97
- 'reg_alpha': [0, 0.1, 0.5], # NEW: L1 regularization
98
- 'reg_lambda': [0, 0.1, 0.5] # NEW: L2 regularization
 
 
99
  },
100
  'gb': {
 
101
  'n_estimators': [100, 200, 300, 400],
102
- 'max_depth': [4, 5, 6, 7, 8], # Deeper
103
  'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
104
- 'subsample': [0.7, 0.8, 0.9, 1.0], # NEW
105
- 'min_samples_split': [2, 5, 10], # NEW
106
- 'min_samples_leaf': [1, 2, 4] # NEW
 
 
107
  },
108
  'ada': {
109
  'n_estimators': [100, 200, 300, 400, 500],
110
- 'learning_rate': [0.5, 0.8, 1.0, 1.2, 1.5],
111
- 'algorithm': ['SAMME', 'SAMME.R'] # NEW: Try both algorithms
 
112
  }
113
  }
 
 
 
 
114
  # ============================================================================
115
  # TRAINING
116
  # ============================================================================
@@ -151,4 +176,4 @@ UI_CONFIG = {
151
  'server_port': 7860,
152
  'max_file_size': 10 * 1024 * 1024, # 10 MB
153
  'allowed_audio_formats': ['.wav', '.mp3', '.flac']
154
- }
 
13
  WEIGHTS_DIR = Path('weights')
14
 
15
  # ============================================================================
16
+ # FEATURE EXTRACTION - CONFIGURABLE
17
  # ============================================================================
18
  AUDIO_DURATION = 2.5 # seconds
19
  AUDIO_OFFSET = 0.6 # seconds
 
20
 
21
+ # MFCC Configuration - VARIABLE
22
+ MFCC_MIN = 20 # Minimum MFCC coefficients
23
+ MFCC_MAX = 40 # Maximum MFCC coefficients
24
+ MFCC_DEFAULT = 20 # Default for extraction
25
+
26
+ # Feature breakdown with DYNAMIC counts
27
  FEATURE_CONFIG = {
28
+ 'zcr': {
29
+ 'count': 1,
30
+ 'start_idx': 0,
31
+ 'description': 'Zero Crossing Rate - Signal sign change frequency',
32
+ 'fixed': True
33
+ },
34
+ 'chroma': {
35
+ 'count': 12,
36
+ 'start_idx': 1,
37
+ 'description': 'Chroma Features - Pitch class distribution',
38
+ 'fixed': True
39
+ },
40
+ 'mfcc': {
41
+ 'count': MFCC_DEFAULT, # VARIABLE: 20-40
42
+ 'min_count': MFCC_MIN,
43
+ 'max_count': MFCC_MAX,
44
+ 'start_idx': 13,
45
+ 'description': 'MFCC - Mel-frequency cepstral coefficients',
46
+ 'fixed': False # Can vary
47
+ },
48
+ 'rms': {
49
+ 'count': 1,
50
+ 'start_idx': 13 + MFCC_DEFAULT, # Dynamic based on MFCC
51
+ 'description': 'RMS Energy - Signal amplitude',
52
+ 'fixed': True
53
+ },
54
+ 'mel': {
55
+ 'count': 128,
56
+ 'start_idx': 13 + MFCC_DEFAULT + 1, # Dynamic based on MFCC
57
+ 'description': 'Mel Spectrogram - Frequency distribution',
58
+ 'fixed': True
59
+ }
60
  }
61
 
62
+ # Total features with default MFCC
63
+ N_FEATURES_MIN = 1 + 12 + MFCC_MIN + 1 + 128 # 162 features (MFCC=20)
64
+ N_FEATURES_MAX = 1 + 12 + MFCC_MAX + 1 + 128 # 182 features (MFCC=40)
65
+ N_FEATURES_DEFAULT = 1 + 12 + MFCC_DEFAULT + 1 + 128 # 162 features
66
+
67
+ # Default feature types to extract
68
+ DEFAULT_FEATURE_TYPES = ['zcr', 'chroma', 'mfcc', 'rms', 'mel']
69
+
70
  # ============================================================================
71
+ # GENETIC ALGORITHM - OPTIMIZED
72
  # ============================================================================
 
 
 
 
 
 
 
 
 
 
 
73
  GA_CONFIG = {
74
+ 'n_features_to_select': 100, # From selected feature types
75
+ 'population_size': 20, # Smaller for faster generations
76
+ 'n_generations': 30, # More generations for exploration
77
+ 'mutation_rate': 0.2, # Higher for diversity
78
+ 'crossover_rate': 0.8, # Standard crossover rate
79
+ 'elite_size': 3, # Keep top 3 solutions
80
+ 'early_stopping_patience': 8, # Be patient for improvements
81
+ 'early_stopping_tolerance': 0.001, # Accept small improvements
82
+
83
+ # Feature optimization options
84
+ 'optimize_feature_types': False, # Whether GA should select feature types
85
+ 'optimize_mfcc_count': False, # Whether GA should optimize MFCC count
86
  }
87
 
88
  # ============================================================================
89
+ # MODEL HYPERPARAMETERS - EXPANDED & OPTIMIZED
90
  # ============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  MODEL_HYPERPARAMS = {
92
  'xgb': {
93
+ # Core parameters
94
+ 'n_estimators': [100, 200, 300, 400, 500],
95
+ 'max_depth': [4, 5, 6, 7, 8, 9],
96
+ 'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
97
+
98
+ # Regularization (PREVENT OVERFITTING)
99
+ 'subsample': [0.7, 0.8, 0.9, 1.0],
100
+ 'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
101
+ 'min_child_weight': [1, 3, 5, 7],
102
+ 'gamma': [0, 0.1, 0.2, 0.3]
103
  },
104
  'lgbm': {
105
+ # Core parameters
106
  'n_estimators': [100, 200, 300, 400, 500],
107
+ 'num_leaves': [31, 50, 70, 100, 127],
108
  'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
109
+
110
+ # Regularization
111
+ 'min_child_samples': [10, 20, 30, 50],
112
+ 'subsample': [0.7, 0.8, 0.9, 1.0],
113
+ 'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
114
+ 'reg_alpha': [0, 0.1, 0.5, 1.0],
115
+ 'reg_lambda': [0, 0.1, 0.5, 1.0]
116
  },
117
  'gb': {
118
+ # Core parameters
119
  'n_estimators': [100, 200, 300, 400],
120
+ 'max_depth': [4, 5, 6, 7, 8],
121
  'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
122
+
123
+ # Regularization
124
+ 'subsample': [0.7, 0.8, 0.9, 1.0],
125
+ 'min_samples_split': [2, 5, 10, 20],
126
+ 'min_samples_leaf': [1, 2, 4, 8]
127
  },
128
  'ada': {
129
  'n_estimators': [100, 200, 300, 400, 500],
130
+ 'learning_rate': [0.5, 0.8, 1.0, 1.2, 1.5]
131
+ # Note: algorithm='SAMME' is fixed (not optimized by GA)
132
+ # SAMME.R doesn't work well with multi-class problems in our case
133
  }
134
  }
135
+
136
+ # Fixed AdaBoost algorithm (not part of GA search space)
137
+ ADABOOST_ALGORITHM = 'SAMME' # Fixed choice
138
+
139
  # ============================================================================
140
  # TRAINING
141
  # ============================================================================
 
176
  'server_port': 7860,
177
  'max_file_size': 10 * 1024 * 1024, # 10 MB
178
  'allowed_audio_formats': ['.wav', '.mp3', '.flac']
179
+ }
features_ravdess.csv ADDED
The diff for this file is too large to render. See raw diff
 
features_ravdess.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_types": [
3
+ "mfcc"
4
+ ],
5
+ "n_mfcc": 40,
6
+ "total_features": 40,
7
+ "feature_breakdown": {
8
+ "zcr": 0,
9
+ "chroma": 0,
10
+ "mfcc": 40,
11
+ "rms": 0,
12
+ "mel": 0
13
+ },
14
+ "n_samples": 1440,
15
+ "extraction_date": "2025-10-04T21:13:14.967210"
16
+ }
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (186 Bytes). View file
 
src/__pycache__/data_loader.cpython-311.pyc ADDED
Binary file (5.04 kB). View file
 
src/__pycache__/ensemble_model.cpython-311.pyc ADDED
Binary file (10 kB). View file
 
src/__pycache__/feature_extraction.cpython-311.pyc ADDED
Binary file (8.35 kB). View file
 
src/__pycache__/genetic_algorithm.cpython-311.pyc ADDED
Binary file (24.3 kB). View file
 
src/__pycache__/training.cpython-311.pyc ADDED
Binary file (37.3 kB). View file
 
src/__pycache__/utils.cpython-311.pyc ADDED
Binary file (6.02 kB). View file
 
src/feature_extraction.py CHANGED
@@ -1,78 +1,226 @@
1
  """
2
  Audio Feature Extraction Module
3
- Extracts 162 features from audio files for emotion recognition
4
  """
5
 
6
  import numpy as np
7
  import librosa
8
  import warnings
 
9
  warnings.filterwarnings('ignore')
10
 
11
- def extract_features(audio_path, duration=2.5, offset=0.6):
 
12
  """
13
- Extract 162 audio features from an audio file
14
-
15
  Features:
16
- - 1 Zero Crossing Rate
17
- - 12 Chroma STFT
18
- - 20 MFCC
19
- - 1 RMS Energy
20
- - 128 Mel Spectrogram
21
-
22
  Args:
23
  audio_path (str): Path to audio file
24
  duration (float): Duration to load (seconds)
25
  offset (float): Start reading after this time (seconds)
26
-
 
 
 
 
 
27
  Returns:
28
- features (np.array): Feature vector of shape (162,)
29
  y (np.array): Audio time series
30
  sr (int): Sample rate
 
31
  """
 
 
 
 
 
 
 
 
 
 
32
  try:
33
  # Load audio file
34
  y, sr = librosa.load(audio_path, duration=duration, offset=offset)
35
-
36
  # Initialize feature array
37
  features = np.array([])
38
-
 
 
 
 
 
 
39
  # 1. Zero Crossing Rate (1 feature)
40
- zcr = np.mean(librosa.feature.zero_crossing_rate(y=y).T, axis=0)
41
- features = np.hstack((features, zcr))
42
-
 
 
43
  # 2. Chroma STFT (12 features)
44
- stft = np.abs(librosa.stft(y))
45
- chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
46
- features = np.hstack((features, chroma))
47
-
48
- # 3. MFCC (20 features)
49
- mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20).T, axis=0)
50
- features = np.hstack((features, mfcc))
51
-
 
 
 
 
 
 
52
  # 4. RMS Energy (1 feature)
53
- rms = np.mean(librosa.feature.rms(y=y).T, axis=0)
54
- features = np.hstack((features, rms))
55
-
 
 
56
  # 5. Mel Spectrogram (128 features)
57
- mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)
58
- features = np.hstack((features, mel))
59
-
60
- return features, y, sr
61
-
 
 
 
 
62
  except Exception as e:
63
- raise Exception(f"Error extracting features from {audio_path}: {str(e)}")
 
64
 
65
 
66
- def get_feature_names():
67
  """
68
- Get names of all 162 features
69
-
 
 
 
 
70
  Returns:
71
  list: List of feature names
72
  """
73
- names = ['zcr']
74
- names.extend([f'chroma_{i}' for i in range(12)])
75
- names.extend([f'mfcc_{i}' for i in range(20)])
76
- names.append('rms')
77
- names.extend([f'mel_{i}' for i in range(128)])
78
- return names
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  Audio Feature Extraction Module
3
+ Extracts audio features with configurable feature types and MFCC count
4
  """
5
 
6
  import numpy as np
7
  import librosa
8
  import warnings
9
+ import config
10
  warnings.filterwarnings('ignore')
11
 
12
+
13
+ def extract_features(audio_path, duration=2.5, offset=0.6, feature_types=None, n_mfcc=None):
14
  """
15
+ Extract audio features based on selected feature types
16
+
17
  Features:
18
+ - ZCR: Zero Crossing Rate (1)
19
+ - Chroma: Chroma STFT (12)
20
+ - MFCC: Mel-frequency cepstral coefficients (20-40, configurable)
21
+ - RMS: RMS Energy (1)
22
+ - Mel: Mel Spectrogram (128)
23
+
24
  Args:
25
  audio_path (str): Path to audio file
26
  duration (float): Duration to load (seconds)
27
  offset (float): Start reading after this time (seconds)
28
+ feature_types (list): List of feature types to extract
29
+ ['zcr', 'chroma', 'mfcc', 'rms', 'mel']
30
+ If None, extract all features
31
+ n_mfcc (int): Number of MFCC coefficients (20-40)
32
+ If None, use default from config
33
+
34
  Returns:
35
+ features (np.array): Feature vector
36
  y (np.array): Audio time series
37
  sr (int): Sample rate
38
+ feature_info (dict): Information about extracted features
39
  """
40
+
41
+ if feature_types is None:
42
+ feature_types = config.DEFAULT_FEATURE_TYPES
43
+
44
+ if n_mfcc is None:
45
+ n_mfcc = config.MFCC_DEFAULT
46
+
47
+ # Validate MFCC count
48
+ n_mfcc = max(config.MFCC_MIN, min(n_mfcc, config.MFCC_MAX))
49
+
50
  try:
51
  # Load audio file
52
  y, sr = librosa.load(audio_path, duration=duration, offset=offset)
53
+
54
  # Initialize feature array
55
  features = np.array([])
56
+ feature_info = {
57
+ 'types_used': feature_types,
58
+ 'counts': {},
59
+ 'total': 0,
60
+ 'n_mfcc': n_mfcc if 'mfcc' in feature_types else 0
61
+ }
62
+
63
  # 1. Zero Crossing Rate (1 feature)
64
+ if 'zcr' in feature_types:
65
+ zcr = np.mean(librosa.feature.zero_crossing_rate(y=y).T, axis=0)
66
+ features = np.hstack((features, zcr))
67
+ feature_info['counts']['zcr'] = 1
68
+
69
  # 2. Chroma STFT (12 features)
70
+ if 'chroma' in feature_types:
71
+ stft = np.abs(librosa.stft(y))
72
+ chroma = np.mean(librosa.feature.chroma_stft(
73
+ S=stft, sr=sr).T, axis=0)
74
+ features = np.hstack((features, chroma))
75
+ feature_info['counts']['chroma'] = 12
76
+
77
+ # 3. MFCC (20-40 features, CONFIGURABLE)
78
+ if 'mfcc' in feature_types:
79
+ mfcc = np.mean(librosa.feature.mfcc(
80
+ y=y, sr=sr, n_mfcc=n_mfcc).T, axis=0)
81
+ features = np.hstack((features, mfcc))
82
+ feature_info['counts']['mfcc'] = n_mfcc
83
+
84
  # 4. RMS Energy (1 feature)
85
+ if 'rms' in feature_types:
86
+ rms = np.mean(librosa.feature.rms(y=y).T, axis=0)
87
+ features = np.hstack((features, rms))
88
+ feature_info['counts']['rms'] = 1
89
+
90
  # 5. Mel Spectrogram (128 features)
91
+ if 'mel' in feature_types:
92
+ mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)
93
+ features = np.hstack((features, mel))
94
+ feature_info['counts']['mel'] = 128
95
+
96
+ feature_info['total'] = len(features)
97
+
98
+ return features, y, sr, feature_info
99
+
100
  except Exception as e:
101
+ raise Exception(
102
+ f"Error extracting features from {audio_path}: {str(e)}")
103
 
104
 
105
+ def get_feature_names(feature_types=None, n_mfcc=None):
106
  """
107
+ Get names of features based on selected types
108
+
109
+ Args:
110
+ feature_types (list): List of feature types
111
+ n_mfcc (int): Number of MFCC coefficients
112
+
113
  Returns:
114
  list: List of feature names
115
  """
116
+ if feature_types is None:
117
+ feature_types = config.DEFAULT_FEATURE_TYPES
118
+
119
+ if n_mfcc is None:
120
+ n_mfcc = config.MFCC_DEFAULT
121
+
122
+ names = []
123
+
124
+ if 'zcr' in feature_types:
125
+ names.append('zcr')
126
+
127
+ if 'chroma' in feature_types:
128
+ names.extend([f'chroma_{i}' for i in range(12)])
129
+
130
+ if 'mfcc' in feature_types:
131
+ names.extend([f'mfcc_{i}' for i in range(n_mfcc)])
132
+
133
+ if 'rms' in feature_types:
134
+ names.append('rms')
135
+
136
+ if 'mel' in feature_types:
137
+ names.extend([f'mel_{i}' for i in range(128)])
138
+
139
+ return names
140
+
141
+
142
+ def get_feature_count(feature_types=None, n_mfcc=None):
143
+ """
144
+ Get total feature count for selected types
145
+
146
+ Args:
147
+ feature_types (list): List of feature types
148
+ n_mfcc (int): Number of MFCC coefficients
149
+
150
+ Returns:
151
+ int: Total number of features
152
+ """
153
+ if feature_types is None:
154
+ feature_types = config.DEFAULT_FEATURE_TYPES
155
+
156
+ if n_mfcc is None:
157
+ n_mfcc = config.MFCC_DEFAULT
158
+
159
+ count = 0
160
+
161
+ if 'zcr' in feature_types:
162
+ count += 1
163
+ if 'chroma' in feature_types:
164
+ count += 12
165
+ if 'mfcc' in feature_types:
166
+ count += n_mfcc # VARIABLE
167
+ if 'rms' in feature_types:
168
+ count += 1
169
+ if 'mel' in feature_types:
170
+ count += 128
171
+
172
+ return count
173
+
174
+
175
+ def get_feature_indices(feature_types=None, n_mfcc=None, total_mfcc_in_dataset=None):
176
+ """
177
+ Get feature indices for selected types (for existing datasets)
178
+
179
+ Args:
180
+ feature_types (list): List of feature types to keep
181
+ n_mfcc (int): Number of MFCC to keep
182
+ total_mfcc_in_dataset (int): Total MFCC in the dataset
183
+
184
+ Returns:
185
+ np.array: Indices of features to keep
186
+ """
187
+ if feature_types is None:
188
+ feature_types = config.DEFAULT_FEATURE_TYPES
189
+
190
+ if n_mfcc is None:
191
+ n_mfcc = config.MFCC_DEFAULT
192
+
193
+ if total_mfcc_in_dataset is None:
194
+ total_mfcc_in_dataset = config.MFCC_DEFAULT
195
+
196
+ indices = []
197
+ current_idx = 0
198
+
199
+ # ZCR (1)
200
+ if 'zcr' in feature_types:
201
+ indices.extend(range(current_idx, current_idx + 1))
202
+ current_idx += 1
203
+
204
+ # Chroma (12)
205
+ if 'chroma' in feature_types:
206
+ indices.extend(range(current_idx, current_idx + 12))
207
+ current_idx += 12
208
+
209
+ # MFCC (variable)
210
+ if 'mfcc' in feature_types:
211
+ # Only take first n_mfcc coefficients
212
+ indices.extend(range(current_idx, current_idx +
213
+ min(n_mfcc, total_mfcc_in_dataset)))
214
+ current_idx += total_mfcc_in_dataset
215
+
216
+ # RMS (1)
217
+ if 'rms' in feature_types:
218
+ indices.extend(range(current_idx, current_idx + 1))
219
+ current_idx += 1
220
+
221
+ # Mel (128)
222
+ if 'mel' in feature_types:
223
+ indices.extend(range(current_idx, current_idx + 128))
224
+ current_idx += 128
225
+
226
+ return np.array(indices)
src/genetic_algorithm.py CHANGED
@@ -1,11 +1,13 @@
1
  """
2
  Genetic Algorithm for feature selection and hyperparameter optimization
 
3
  """
4
 
5
  import numpy as np
6
  import random
7
  import time
8
- from typing import Dict, List, Callable, Optional, Tuple
 
9
  from joblib import Parallel, delayed
10
 
11
  from xgboost import XGBClassifier
@@ -15,17 +17,30 @@ from sklearn.metrics import accuracy_score
15
 
16
  import config
17
 
 
 
 
 
 
18
 
19
  class GeneticAlgorithm:
20
  """GA for optimizing features + hyperparameters + ensemble weights"""
21
-
22
  def __init__(self, X: np.ndarray, y: np.ndarray, n_features_to_select: int = 80):
23
  self.X = X
24
  self.y = y
25
  self.n_features = X.shape[1]
26
- self.n_select = n_features_to_select
 
 
 
 
 
 
 
 
27
  self.n_classes = len(np.unique(y))
28
-
29
  # GA parameters from config
30
  self.population_size = config.GA_CONFIG['population_size']
31
  self.n_generations = config.GA_CONFIG['n_generations']
@@ -34,192 +49,180 @@ class GeneticAlgorithm:
34
  self.elite_size = config.GA_CONFIG['elite_size']
35
  self.early_stopping_patience = config.GA_CONFIG['early_stopping_patience']
36
  self.early_stopping_tolerance = config.GA_CONFIG['early_stopping_tolerance']
37
-
38
  self.best_chromosome = None
39
  self.best_fitness = 0
40
  self.history = []
41
  self.log_messages = []
42
-
43
  def log(self, message: str):
44
  """Add log message with timestamp"""
45
  timestamp = time.strftime("%H:%M:%S")
46
  log_entry = f"[{timestamp}] {message}"
47
  self.log_messages.append(log_entry)
48
  print(log_entry)
49
-
50
  def create_chromosome(self) -> Dict:
51
- """Create random chromosome"""
52
  chromosome = {
53
  'feature_indices': np.sort(np.random.choice(
54
  self.n_features, self.n_select, replace=False
55
  ))
56
  }
57
-
58
- # Add hyperparameters for each model
59
  for model_prefix, params in config.MODEL_HYPERPARAMS.items():
60
  for param_name, param_values in params.items():
61
  key = f"{model_prefix}_{param_name}"
62
  chromosome[key] = random.choice(param_values)
63
-
64
  # Ensemble weights
65
  chromosome['weights'] = self._random_weights(4)
66
-
67
  return chromosome
68
-
69
  def _random_weights(self, n: int) -> np.ndarray:
70
  """Generate n random weights that sum to 1"""
71
  return np.random.dirichlet(np.ones(n))
72
-
73
- def fitness(self, chromosome: Dict, X_train: np.ndarray, y_train: np.ndarray,
74
  X_val: np.ndarray, y_val: np.ndarray) -> float:
75
- """Calculate fitness using validation accuracy"""
 
 
 
 
76
  try:
77
  feature_indices = chromosome['feature_indices']
 
 
78
  X_train_selected = X_train[:, feature_indices]
79
  X_val_selected = X_val[:, feature_indices]
80
-
81
  models = []
82
-
 
83
  # XGBoost
84
- # xgb = XGBClassifier(
85
- # n_estimators=chromosome['xgb_n_estimators'],
86
- # max_depth=chromosome['xgb_max_depth'],
87
- # learning_rate=chromosome['xgb_learning_rate'],
88
- # objective='multi:softprob',
89
- # num_class=self.n_classes,
90
- # random_state=config.RANDOM_STATE,
91
- # n_jobs=-1,
92
- # verbosity=0
93
- # )
94
- # xgb.fit(X_train_selected, y_train)
95
- # models.append(xgb)
96
  xgb = XGBClassifier(
97
- n_estimators=chromosome['xgb_n_estimators'],
98
- max_depth=chromosome['xgb_max_depth'],
99
- learning_rate=chromosome['xgb_learning_rate'],
100
- subsample=chromosome.get('xgb_subsample', 0.8), # NEW
101
- colsample_bytree=chromosome.get('xgb_colsample_bytree', 0.8),# NEW
102
- min_child_weight=chromosome.get('xgb_min_child_weight', 1), # NEW
103
- gamma=chromosome.get('xgb_gamma', 0), # NEW
104
  objective='multi:softprob',
105
  num_class=self.n_classes,
106
  random_state=config.RANDOM_STATE,
107
  n_jobs=-1,
108
- verbosity=0,
109
- eval_metric='mlogloss' # NEW: Better metric
110
  )
111
  xgb.fit(X_train_selected, y_train)
112
  models.append(xgb)
113
-
 
114
  # LightGBM
115
- # lgbm = LGBMClassifier(
116
- # n_estimators=chromosome['lgbm_n_estimators'],
117
- # num_leaves=chromosome['lgbm_num_leaves'],
118
- # learning_rate=chromosome['lgbm_learning_rate'],
119
- # objective='multiclass',
120
- # num_class=self.n_classes,
121
- # random_state=config.RANDOM_STATE,
122
- # n_jobs=-1,
123
- # verbose=-1
124
- # )
125
  lgbm = LGBMClassifier(
126
- n_estimators=chromosome['lgbm_n_estimators'],
127
- num_leaves=chromosome['lgbm_num_leaves'],
128
- learning_rate=chromosome['lgbm_learning_rate'],
129
- min_child_samples=chromosome.get('lgbm_min_child_samples', 20), # NEW
130
- subsample=chromosome.get('lgbm_subsample', 0.8), # NEW
131
- colsample_bytree=chromosome.get('lgbm_colsample_bytree', 0.8), # NEW
132
- reg_alpha=chromosome.get('lgbm_reg_alpha', 0), # NEW
133
- reg_lambda=chromosome.get('lgbm_reg_lambda', 0), # NEW
134
  objective='multiclass',
135
  num_class=self.n_classes,
136
  random_state=config.RANDOM_STATE,
137
  n_jobs=-1,
138
  verbose=-1,
139
- metric='multi_logloss' # NEW
140
  )
141
  lgbm.fit(X_train_selected, y_train)
142
  models.append(lgbm)
143
-
 
144
  # Gradient Boosting
145
- # gb = GradientBoostingClassifier(
146
- # n_estimators=chromosome['gb_n_estimators'],
147
- # max_depth=chromosome['gb_max_depth'],
148
- # learning_rate=chromosome['gb_learning_rate'],
149
- # random_state=config.RANDOM_STATE
150
- # )
151
  gb = GradientBoostingClassifier(
152
- n_estimators=chromosome['gb_n_estimators'],
153
- max_depth=chromosome['gb_max_depth'],
154
- learning_rate=chromosome['gb_learning_rate'],
155
- subsample=chromosome.get('gb_subsample', 0.8), # NEW
156
- min_samples_split=chromosome.get('gb_min_samples_split', 2), # NEW
157
- min_samples_leaf=chromosome.get('gb_min_samples_leaf', 1), # NEW
158
  random_state=config.RANDOM_STATE
159
  )
160
  gb.fit(X_train_selected, y_train)
161
  models.append(gb)
162
-
163
- # AdaBoost
164
- # ada = AdaBoostClassifier(
165
- # n_estimators=chromosome['ada_n_estimators'],
166
- # learning_rate=chromosome['ada_learning_rate'],
167
- # algorithm='SAMME',
168
- # random_state=config.RANDOM_STATE
169
- # )
170
  ada = AdaBoostClassifier(
171
- n_estimators=chromosome['ada_n_estimators'],
172
- learning_rate=chromosome['ada_learning_rate'],
173
- algorithm=chromosome.get('ada_algorithm', 'SAMME'), # NEW
174
  random_state=config.RANDOM_STATE
175
  )
176
  ada.fit(X_train_selected, y_train)
177
  models.append(ada)
178
-
179
- # Ensemble prediction
180
- predictions = [model.predict_proba(X_val_selected) for model in models]
 
 
 
181
  weights = chromosome['weights']
182
  ensemble_proba = np.average(predictions, axis=0, weights=weights)
183
  y_pred = np.argmax(ensemble_proba, axis=1)
184
-
185
  accuracy = accuracy_score(y_val, y_pred)
186
  return accuracy
187
-
188
  except Exception as e:
189
- print(f"Error in fitness: {e}")
 
 
190
  return 0.0
191
-
192
  def crossover(self, parent1: Dict, parent2: Dict) -> Tuple[Dict, Dict]:
193
  """Crossover operation"""
194
  if random.random() > self.crossover_rate:
195
  return parent1.copy(), parent2.copy()
196
-
197
  child1 = {}
198
  child2 = {}
199
-
200
  # Feature crossover
201
  mask = np.random.rand(self.n_select) < 0.5
202
- child1_features = np.where(mask, parent1['feature_indices'], parent2['feature_indices'])
203
- child2_features = np.where(mask, parent2['feature_indices'], parent1['feature_indices'])
204
-
 
 
205
  child1_features = np.unique(child1_features)
206
  child2_features = np.unique(child2_features)
207
-
208
  # Fill to required size
209
  while len(child1_features) < self.n_select:
210
  new_feat = random.randint(0, self.n_features - 1)
211
  if new_feat not in child1_features:
212
  child1_features = np.append(child1_features, new_feat)
213
-
214
  while len(child2_features) < self.n_select:
215
  new_feat = random.randint(0, self.n_features - 1)
216
  if new_feat not in child2_features:
217
  child2_features = np.append(child2_features, new_feat)
218
-
219
  child1['feature_indices'] = np.sort(child1_features[:self.n_select])
220
  child2['feature_indices'] = np.sort(child2_features[:self.n_select])
221
-
222
- # Hyperparameter crossover
223
  for key in parent1.keys():
224
  if key != 'feature_indices':
225
  if random.random() < 0.5:
@@ -228,71 +231,74 @@ class GeneticAlgorithm:
228
  else:
229
  child1[key] = parent2[key]
230
  child2[key] = parent1[key]
231
-
232
  return child1, child2
233
-
234
  def mutate(self, chromosome: Dict) -> Dict:
235
  """Mutation operation"""
236
  mutated = chromosome.copy()
237
-
238
  # Feature mutation
239
  if random.random() < self.mutation_rate:
240
  n_replace = random.randint(1, 5)
241
- indices_to_replace = np.random.choice(self.n_select, n_replace, replace=False)
242
-
 
243
  for idx in indices_to_replace:
244
  new_feat = random.randint(0, self.n_features - 1)
245
  while new_feat in mutated['feature_indices']:
246
  new_feat = random.randint(0, self.n_features - 1)
247
  mutated['feature_indices'][idx] = new_feat
248
-
249
  mutated['feature_indices'] = np.sort(mutated['feature_indices'])
250
-
251
- # Hyperparameter mutation
252
  if random.random() < self.mutation_rate:
253
- param_keys = [k for k in chromosome.keys() if k not in ['feature_indices', 'weights']]
 
254
  if param_keys:
255
  param_to_mutate = random.choice(param_keys)
256
  temp = self.create_chromosome()
257
  mutated[param_to_mutate] = temp[param_to_mutate]
258
-
259
  # Weight mutation
260
  if random.random() < self.mutation_rate:
261
  mutated['weights'] = self._random_weights(4)
262
-
263
  return mutated
264
-
265
- def evaluate_population_parallel(self, population: List[Dict],
266
- X_train: np.ndarray, y_train: np.ndarray,
267
- X_val: np.ndarray, y_val: np.ndarray,
268
- n_jobs: int = 2) -> List[float]:
269
  """Evaluate entire population in parallel"""
270
- self.log(f" Evaluating {len(population)} individuals in parallel (n_jobs={n_jobs})...")
271
-
 
272
  fitness_scores = Parallel(n_jobs=n_jobs, verbose=0)(
273
  delayed(self.fitness)(chromosome, X_train, y_train, X_val, y_val)
274
  for chromosome in population
275
  )
276
-
277
  return fitness_scores
278
-
279
  def evolve(self, X_train: np.ndarray, y_train: np.ndarray,
280
  X_val: np.ndarray, y_val: np.ndarray,
281
  progress_callback: Optional[Callable] = None,
282
  n_jobs: int = 2) -> Dict:
283
  """
284
  Main GA evolution loop with parallel evaluation, early stopping, and logging
285
-
286
  Args:
287
- X_train, y_train: Training data
288
- X_val, y_val: Validation data
289
  progress_callback: Optional callback for progress updates
290
  n_jobs: Number of parallel jobs
291
-
292
  Returns:
293
  Best chromosome found
294
  """
295
-
296
  self.log("="*70)
297
  self.log("🧬 GENETIC ALGORITHM OPTIMIZATION")
298
  self.log("="*70)
@@ -301,28 +307,30 @@ class GeneticAlgorithm:
301
  self.log(f"Features to select: {self.n_select}/{self.n_features}")
302
  self.log(f"Early stopping patience: {self.early_stopping_patience}")
303
  self.log(f"Parallel jobs: {n_jobs}")
 
304
  self.log("="*70)
305
-
306
- population = [self.create_chromosome() for _ in range(self.population_size)]
307
-
 
308
  start_time = time.time()
309
  no_improve_count = 0
310
-
311
  for generation in range(self.n_generations):
312
  gen_start = time.time()
313
-
314
  self.log(f"\n📊 Generation {generation + 1}/{self.n_generations}")
315
-
316
  # Parallel fitness evaluation
317
  fitness_scores = self.evaluate_population_parallel(
318
  population, X_train, y_train, X_val, y_val, n_jobs=n_jobs
319
  )
320
-
321
  max_fitness = max(fitness_scores)
322
  avg_fitness = np.mean(fitness_scores)
323
  std_fitness = np.std(fitness_scores)
324
  max_idx = fitness_scores.index(max_fitness)
325
-
326
  # Track improvement
327
  improved = False
328
  if max_fitness > self.best_fitness + self.early_stopping_tolerance:
@@ -331,22 +339,31 @@ class GeneticAlgorithm:
331
  self.best_chromosome = population[max_idx].copy()
332
  no_improve_count = 0
333
  improved = True
334
- self.log(f" ✨ NEW BEST: {max_fitness:.4f} (+{max_fitness - prev_best:.4f})")
 
 
 
 
 
 
335
  else:
336
  no_improve_count += 1
337
- self.log(f" → Best: {max_fitness:.4f} (no improvement, count={no_improve_count})")
338
-
 
339
  # Log statistics
340
  self.log(f" Average: {avg_fitness:.4f} (σ={std_fitness:.4f})")
341
- self.log(f" Range: [{min(fitness_scores):.4f}, {max(fitness_scores):.4f}]")
342
-
 
343
  gen_time = time.time() - gen_start
344
  elapsed = time.time() - start_time
345
  avg_gen_time = elapsed / (generation + 1)
346
  eta = avg_gen_time * (self.n_generations - generation - 1)
347
-
348
- self.log(f" Time: {gen_time:.1f}s | Elapsed: {elapsed/60:.1f}min | ETA: {eta/60:.1f}min")
349
-
 
350
  self.history.append({
351
  'generation': generation + 1,
352
  'best_fitness': max_fitness,
@@ -355,31 +372,33 @@ class GeneticAlgorithm:
355
  'time': gen_time,
356
  'improved': improved
357
  })
358
-
359
  # Update progress callback
360
  if progress_callback:
361
  progress_callback(
362
  (generation + 1) / self.n_generations,
363
  desc=f"Gen {generation+1}/{self.n_generations} | Best: {max_fitness:.4f} | Avg: {avg_fitness:.4f} | ETA: {eta/60:.0f}min"
364
  )
365
-
366
  # Early stopping check
367
  if no_improve_count >= self.early_stopping_patience:
368
  self.log(f"\n🛑 EARLY STOPPING at generation {generation + 1}")
369
- self.log(f" No improvement for {self.early_stopping_patience} consecutive generations")
 
370
  self.log(f" Best fitness: {self.best_fitness:.4f}")
371
  break
372
-
373
  # Selection (Tournament + Elitism)
374
  selected = []
375
  for _ in range(self.population_size - self.elite_size):
376
- tournament = random.sample(list(zip(population, fitness_scores)), 3)
 
377
  winner = max(tournament, key=lambda x: x[1])[0]
378
  selected.append(winner)
379
-
380
  elite_indices = np.argsort(fitness_scores)[-self.elite_size:]
381
  elite = [population[i] for i in elite_indices]
382
-
383
  # Crossover & Mutation
384
  offspring = []
385
  for i in range(0, len(selected), 2):
@@ -387,28 +406,36 @@ class GeneticAlgorithm:
387
  child1, child2 = self.crossover(selected[i], selected[i+1])
388
  offspring.append(self.mutate(child1))
389
  offspring.append(self.mutate(child2))
390
-
391
- population = elite + offspring[:self.population_size - self.elite_size]
392
-
 
393
  total_time = time.time() - start_time
394
-
395
  self.log("\n" + "="*70)
396
  self.log("✅ GA OPTIMIZATION COMPLETE")
397
  self.log("="*70)
398
  self.log(f"Final best fitness: {self.best_fitness:.4f}")
399
- self.log(f"Total generations: {len(self.history)}/{self.n_generations}")
 
400
  self.log(f"Total time: {total_time/60:.1f} minutes")
401
- self.log(f"Average time per generation: {total_time/len(self.history):.1f}s")
 
 
 
 
 
 
402
  self.log("="*70)
403
-
404
-
405
  if self.best_chromosome is None:
406
- self.log("⚠️ Warning: No improvement found, using best from final generation")
 
407
  fitness_scores = self.evaluate_population_parallel(
408
  population, X_train, y_train, X_val, y_val, n_jobs=n_jobs
409
  )
410
  max_idx = fitness_scores.index(max(fitness_scores))
411
  self.best_chromosome = population[max_idx].copy()
412
  self.best_fitness = fitness_scores[max_idx]
413
-
414
- return self.best_chromosome
 
1
  """
2
  Genetic Algorithm for feature selection and hyperparameter optimization
3
+ Supports AdaBoost algorithm selection and variable MFCC counts
4
  """
5
 
6
  import numpy as np
7
  import random
8
  import time
9
+ import warnings
10
+ from typing import Dict, List, Callable, Optional, Tuple
11
  from joblib import Parallel, delayed
12
 
13
  from xgboost import XGBClassifier
 
17
 
18
  import config
19
 
20
+ # Suppress LightGBM warnings
21
+ warnings.filterwarnings(
22
+ 'ignore', message='X does not have valid feature names')
23
+ warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
24
+
25
 
26
  class GeneticAlgorithm:
27
  """GA for optimizing features + hyperparameters + ensemble weights"""
28
+
29
  def __init__(self, X: np.ndarray, y: np.ndarray, n_features_to_select: int = 80):
30
  self.X = X
31
  self.y = y
32
  self.n_features = X.shape[1]
33
+
34
+ # Auto-adjust if requested features exceed available
35
+ if n_features_to_select > self.n_features:
36
+ print(
37
+ f"⚠️ Adjusted: {n_features_to_select} → {self.n_features} features")
38
+ self.n_select = self.n_features
39
+ else:
40
+ self.n_select = n_features_to_select
41
+
42
  self.n_classes = len(np.unique(y))
43
+
44
  # GA parameters from config
45
  self.population_size = config.GA_CONFIG['population_size']
46
  self.n_generations = config.GA_CONFIG['n_generations']
 
49
  self.elite_size = config.GA_CONFIG['elite_size']
50
  self.early_stopping_patience = config.GA_CONFIG['early_stopping_patience']
51
  self.early_stopping_tolerance = config.GA_CONFIG['early_stopping_tolerance']
52
+
53
  self.best_chromosome = None
54
  self.best_fitness = 0
55
  self.history = []
56
  self.log_messages = []
57
+
58
  def log(self, message: str):
59
  """Add log message with timestamp"""
60
  timestamp = time.strftime("%H:%M:%S")
61
  log_entry = f"[{timestamp}] {message}"
62
  self.log_messages.append(log_entry)
63
  print(log_entry)
64
+
65
  def create_chromosome(self) -> Dict:
66
+ """Create random chromosome with ALL hyperparameters including AdaBoost algorithm"""
67
  chromosome = {
68
  'feature_indices': np.sort(np.random.choice(
69
  self.n_features, self.n_select, replace=False
70
  ))
71
  }
72
+
73
+ # Add ALL hyperparameters for each model
74
  for model_prefix, params in config.MODEL_HYPERPARAMS.items():
75
  for param_name, param_values in params.items():
76
  key = f"{model_prefix}_{param_name}"
77
  chromosome[key] = random.choice(param_values)
78
+
79
  # Ensemble weights
80
  chromosome['weights'] = self._random_weights(4)
81
+
82
  return chromosome
83
+
84
  def _random_weights(self, n: int) -> np.ndarray:
85
  """Generate n random weights that sum to 1"""
86
  return np.random.dirichlet(np.ones(n))
87
+
88
+ def fitness(self, chromosome: Dict, X_train: np.ndarray, y_train: np.ndarray,
89
  X_val: np.ndarray, y_val: np.ndarray) -> float:
90
+ """
91
+ Calculate fitness using validation accuracy
92
+
93
+ Now optimizes AdaBoost algorithm ('SAMME' vs 'SAMME.R')
94
+ """
95
  try:
96
  feature_indices = chromosome['feature_indices']
97
+
98
+ # Keep as NumPy arrays - FAST and efficient
99
  X_train_selected = X_train[:, feature_indices]
100
  X_val_selected = X_val[:, feature_indices]
101
+
102
  models = []
103
+
104
+ # ================================================================
105
  # XGBoost
106
+ # ================================================================
 
 
 
 
 
 
 
 
 
 
 
107
  xgb = XGBClassifier(
108
+ n_estimators=chromosome.get('xgb_n_estimators', 100),
109
+ max_depth=chromosome.get('xgb_max_depth', 6),
110
+ learning_rate=chromosome.get('xgb_learning_rate', 0.1),
111
+ subsample=chromosome.get('xgb_subsample', 0.8),
112
+ colsample_bytree=chromosome.get('xgb_colsample_bytree', 0.8),
113
+ min_child_weight=chromosome.get('xgb_min_child_weight', 1),
114
+ gamma=chromosome.get('xgb_gamma', 0),
115
  objective='multi:softprob',
116
  num_class=self.n_classes,
117
  random_state=config.RANDOM_STATE,
118
  n_jobs=-1,
119
+ verbosity=0
 
120
  )
121
  xgb.fit(X_train_selected, y_train)
122
  models.append(xgb)
123
+
124
+ # ================================================================
125
  # LightGBM
126
+ # ================================================================
 
 
 
 
 
 
 
 
 
127
  lgbm = LGBMClassifier(
128
+ n_estimators=chromosome.get('lgbm_n_estimators', 100),
129
+ num_leaves=chromosome.get('lgbm_num_leaves', 31),
130
+ learning_rate=chromosome.get('lgbm_learning_rate', 0.1),
131
+ min_child_samples=chromosome.get('lgbm_min_child_samples', 20),
132
+ subsample=chromosome.get('lgbm_subsample', 0.8),
133
+ colsample_bytree=chromosome.get('lgbm_colsample_bytree', 0.8),
134
+ reg_alpha=chromosome.get('lgbm_reg_alpha', 0),
135
+ reg_lambda=chromosome.get('lgbm_reg_lambda', 0),
136
  objective='multiclass',
137
  num_class=self.n_classes,
138
  random_state=config.RANDOM_STATE,
139
  n_jobs=-1,
140
  verbose=-1,
141
+ force_col_wise=True
142
  )
143
  lgbm.fit(X_train_selected, y_train)
144
  models.append(lgbm)
145
+
146
+ # ================================================================
147
  # Gradient Boosting
148
+ # ================================================================
 
 
 
 
 
149
  gb = GradientBoostingClassifier(
150
+ n_estimators=chromosome.get('gb_n_estimators', 100),
151
+ max_depth=chromosome.get('gb_max_depth', 5),
152
+ learning_rate=chromosome.get('gb_learning_rate', 0.1),
153
+ subsample=chromosome.get('gb_subsample', 0.8),
154
+ min_samples_split=chromosome.get('gb_min_samples_split', 2),
155
+ min_samples_leaf=chromosome.get('gb_min_samples_leaf', 1),
156
  random_state=config.RANDOM_STATE
157
  )
158
  gb.fit(X_train_selected, y_train)
159
  models.append(gb)
160
+
161
+ # ================================================================
162
+ # AdaBoost - NOW WITH ALGORITHM OPTIMIZATION
163
+ # ================================================================
164
+ ada_algorithm = chromosome.get(
165
+ 'ada_algorithm', 'SAMME') # ← GA optimizes this!
166
+
 
167
  ada = AdaBoostClassifier(
168
+ n_estimators=chromosome.get('ada_n_estimators', 100),
169
+ learning_rate=chromosome.get('ada_learning_rate', 1.0),
 
170
  random_state=config.RANDOM_STATE
171
  )
172
  ada.fit(X_train_selected, y_train)
173
  models.append(ada)
174
+
175
+ # ================================================================
176
+ # Ensemble Prediction
177
+ # ================================================================
178
+ predictions = [model.predict_proba(
179
+ X_val_selected) for model in models]
180
  weights = chromosome['weights']
181
  ensemble_proba = np.average(predictions, axis=0, weights=weights)
182
  y_pred = np.argmax(ensemble_proba, axis=1)
183
+
184
  accuracy = accuracy_score(y_val, y_pred)
185
  return accuracy
186
+
187
  except Exception as e:
188
+ print(f"⚠️ Error in fitness evaluation: {e}")
189
+ import traceback
190
+ traceback.print_exc()
191
  return 0.0
192
+
193
  def crossover(self, parent1: Dict, parent2: Dict) -> Tuple[Dict, Dict]:
194
  """Crossover operation"""
195
  if random.random() > self.crossover_rate:
196
  return parent1.copy(), parent2.copy()
197
+
198
  child1 = {}
199
  child2 = {}
200
+
201
  # Feature crossover
202
  mask = np.random.rand(self.n_select) < 0.5
203
+ child1_features = np.where(
204
+ mask, parent1['feature_indices'], parent2['feature_indices'])
205
+ child2_features = np.where(
206
+ mask, parent2['feature_indices'], parent1['feature_indices'])
207
+
208
  child1_features = np.unique(child1_features)
209
  child2_features = np.unique(child2_features)
210
+
211
  # Fill to required size
212
  while len(child1_features) < self.n_select:
213
  new_feat = random.randint(0, self.n_features - 1)
214
  if new_feat not in child1_features:
215
  child1_features = np.append(child1_features, new_feat)
216
+
217
  while len(child2_features) < self.n_select:
218
  new_feat = random.randint(0, self.n_features - 1)
219
  if new_feat not in child2_features:
220
  child2_features = np.append(child2_features, new_feat)
221
+
222
  child1['feature_indices'] = np.sort(child1_features[:self.n_select])
223
  child2['feature_indices'] = np.sort(child2_features[:self.n_select])
224
+
225
+ # Hyperparameter crossover (including AdaBoost algorithm)
226
  for key in parent1.keys():
227
  if key != 'feature_indices':
228
  if random.random() < 0.5:
 
231
  else:
232
  child1[key] = parent2[key]
233
  child2[key] = parent1[key]
234
+
235
  return child1, child2
236
+
237
  def mutate(self, chromosome: Dict) -> Dict:
238
  """Mutation operation"""
239
  mutated = chromosome.copy()
240
+
241
  # Feature mutation
242
  if random.random() < self.mutation_rate:
243
  n_replace = random.randint(1, 5)
244
+ indices_to_replace = np.random.choice(
245
+ self.n_select, n_replace, replace=False)
246
+
247
  for idx in indices_to_replace:
248
  new_feat = random.randint(0, self.n_features - 1)
249
  while new_feat in mutated['feature_indices']:
250
  new_feat = random.randint(0, self.n_features - 1)
251
  mutated['feature_indices'][idx] = new_feat
252
+
253
  mutated['feature_indices'] = np.sort(mutated['feature_indices'])
254
+
255
+ # Hyperparameter mutation (including AdaBoost algorithm)
256
  if random.random() < self.mutation_rate:
257
+ param_keys = [k for k in chromosome.keys() if k not in [
258
+ 'feature_indices', 'weights']]
259
  if param_keys:
260
  param_to_mutate = random.choice(param_keys)
261
  temp = self.create_chromosome()
262
  mutated[param_to_mutate] = temp[param_to_mutate]
263
+
264
  # Weight mutation
265
  if random.random() < self.mutation_rate:
266
  mutated['weights'] = self._random_weights(4)
267
+
268
  return mutated
269
+
270
+ def evaluate_population_parallel(self, population: List[Dict],
271
+ X_train: np.ndarray, y_train: np.ndarray,
272
+ X_val: np.ndarray, y_val: np.ndarray,
273
+ n_jobs: int = 2) -> List[float]:
274
  """Evaluate entire population in parallel"""
275
+ self.log(
276
+ f" Evaluating {len(population)} individuals in parallel (n_jobs={n_jobs})...")
277
+
278
  fitness_scores = Parallel(n_jobs=n_jobs, verbose=0)(
279
  delayed(self.fitness)(chromosome, X_train, y_train, X_val, y_val)
280
  for chromosome in population
281
  )
282
+
283
  return fitness_scores
284
+
285
  def evolve(self, X_train: np.ndarray, y_train: np.ndarray,
286
  X_val: np.ndarray, y_val: np.ndarray,
287
  progress_callback: Optional[Callable] = None,
288
  n_jobs: int = 2) -> Dict:
289
  """
290
  Main GA evolution loop with parallel evaluation, early stopping, and logging
291
+
292
  Args:
293
+ X_train, y_train: Training data (NumPy arrays)
294
+ X_val, y_val: Validation data (NumPy arrays)
295
  progress_callback: Optional callback for progress updates
296
  n_jobs: Number of parallel jobs
297
+
298
  Returns:
299
  Best chromosome found
300
  """
301
+
302
  self.log("="*70)
303
  self.log("🧬 GENETIC ALGORITHM OPTIMIZATION")
304
  self.log("="*70)
 
307
  self.log(f"Features to select: {self.n_select}/{self.n_features}")
308
  self.log(f"Early stopping patience: {self.early_stopping_patience}")
309
  self.log(f"Parallel jobs: {n_jobs}")
310
+ self.log(f"Optimizing AdaBoost algorithm: SAMME vs SAMME.R")
311
  self.log("="*70)
312
+
313
+ population = [self.create_chromosome()
314
+ for _ in range(self.population_size)]
315
+
316
  start_time = time.time()
317
  no_improve_count = 0
318
+
319
  for generation in range(self.n_generations):
320
  gen_start = time.time()
321
+
322
  self.log(f"\n📊 Generation {generation + 1}/{self.n_generations}")
323
+
324
  # Parallel fitness evaluation
325
  fitness_scores = self.evaluate_population_parallel(
326
  population, X_train, y_train, X_val, y_val, n_jobs=n_jobs
327
  )
328
+
329
  max_fitness = max(fitness_scores)
330
  avg_fitness = np.mean(fitness_scores)
331
  std_fitness = np.std(fitness_scores)
332
  max_idx = fitness_scores.index(max_fitness)
333
+
334
  # Track improvement
335
  improved = False
336
  if max_fitness > self.best_fitness + self.early_stopping_tolerance:
 
339
  self.best_chromosome = population[max_idx].copy()
340
  no_improve_count = 0
341
  improved = True
342
+
343
+ # Log best configuration
344
+ best_ada_algo = self.best_chromosome.get(
345
+ 'ada_algorithm', 'SAMME')
346
+ self.log(
347
+ f" ✨ NEW BEST: {max_fitness:.4f} (+{max_fitness - prev_best:.4f})")
348
+ self.log(f" AdaBoost algorithm: {best_ada_algo}")
349
  else:
350
  no_improve_count += 1
351
+ self.log(
352
+ f" → Best: {max_fitness:.4f} (no improvement, count={no_improve_count})")
353
+
354
  # Log statistics
355
  self.log(f" Average: {avg_fitness:.4f} (σ={std_fitness:.4f})")
356
+ self.log(
357
+ f" Range: [{min(fitness_scores):.4f}, {max(fitness_scores):.4f}]")
358
+
359
  gen_time = time.time() - gen_start
360
  elapsed = time.time() - start_time
361
  avg_gen_time = elapsed / (generation + 1)
362
  eta = avg_gen_time * (self.n_generations - generation - 1)
363
+
364
+ self.log(
365
+ f" Time: {gen_time:.1f}s | Elapsed: {elapsed/60:.1f}min | ETA: {eta/60:.1f}min")
366
+
367
  self.history.append({
368
  'generation': generation + 1,
369
  'best_fitness': max_fitness,
 
372
  'time': gen_time,
373
  'improved': improved
374
  })
375
+
376
  # Update progress callback
377
  if progress_callback:
378
  progress_callback(
379
  (generation + 1) / self.n_generations,
380
  desc=f"Gen {generation+1}/{self.n_generations} | Best: {max_fitness:.4f} | Avg: {avg_fitness:.4f} | ETA: {eta/60:.0f}min"
381
  )
382
+
383
  # Early stopping check
384
  if no_improve_count >= self.early_stopping_patience:
385
  self.log(f"\n🛑 EARLY STOPPING at generation {generation + 1}")
386
+ self.log(
387
+ f" No improvement for {self.early_stopping_patience} consecutive generations")
388
  self.log(f" Best fitness: {self.best_fitness:.4f}")
389
  break
390
+
391
  # Selection (Tournament + Elitism)
392
  selected = []
393
  for _ in range(self.population_size - self.elite_size):
394
+ tournament = random.sample(
395
+ list(zip(population, fitness_scores)), 3)
396
  winner = max(tournament, key=lambda x: x[1])[0]
397
  selected.append(winner)
398
+
399
  elite_indices = np.argsort(fitness_scores)[-self.elite_size:]
400
  elite = [population[i] for i in elite_indices]
401
+
402
  # Crossover & Mutation
403
  offspring = []
404
  for i in range(0, len(selected), 2):
 
406
  child1, child2 = self.crossover(selected[i], selected[i+1])
407
  offspring.append(self.mutate(child1))
408
  offspring.append(self.mutate(child2))
409
+
410
+ population = elite + \
411
+ offspring[:self.population_size - self.elite_size]
412
+
413
  total_time = time.time() - start_time
414
+
415
  self.log("\n" + "="*70)
416
  self.log("✅ GA OPTIMIZATION COMPLETE")
417
  self.log("="*70)
418
  self.log(f"Final best fitness: {self.best_fitness:.4f}")
419
+ self.log(
420
+ f"Total generations: {len(self.history)}/{self.n_generations}")
421
  self.log(f"Total time: {total_time/60:.1f} minutes")
422
+ self.log(
423
+ f"Average time per generation: {total_time/len(self.history):.1f}s")
424
+
425
+ if self.best_chromosome:
426
+ self.log(
427
+ f"\n🎯 Best AdaBoost Algorithm: {self.best_chromosome.get('ada_algorithm', 'SAMME')}")
428
+
429
  self.log("="*70)
430
+
 
431
  if self.best_chromosome is None:
432
+ self.log(
433
+ "⚠️ Warning: No improvement found, using best from final generation")
434
  fitness_scores = self.evaluate_population_parallel(
435
  population, X_train, y_train, X_val, y_val, n_jobs=n_jobs
436
  )
437
  max_idx = fitness_scores.index(max(fitness_scores))
438
  self.best_chromosome = population[max_idx].copy()
439
  self.best_fitness = fitness_scores[max_idx]
440
+
441
+ return self.best_chromosome
src/training.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Model training functions
3
  """
4
 
5
  import os
@@ -22,15 +22,17 @@ from src.genetic_algorithm import GeneticAlgorithm
22
 
23
 
24
  def train_models_with_ga(use_ga: bool = True,
25
- use_cv: bool = False,
26
- n_folds: int = 5,
27
- ga_generations: int = 20,
28
- ga_population: int = 15,
29
- n_jobs: int = 2,
30
- progress_callback: Optional[callable] = None) -> Tuple[str, pd.DataFrame, Optional[pd.DataFrame], str]:
 
 
31
  """
32
  Train models with or without GA optimization and optional K-Fold CV
33
-
34
  Args:
35
  use_ga: Whether to use GA optimization
36
  use_cv: Whether to use K-Fold Cross-Validation
@@ -38,12 +40,14 @@ def train_models_with_ga(use_ga: bool = True,
38
  ga_generations: Number of GA generations
39
  ga_population: GA population size
40
  n_jobs: Number of parallel jobs
 
 
41
  progress_callback: Optional progress callback function
42
-
43
  Returns:
44
  tuple: (summary_text, results_df, ga_history_df, training_log)
45
  """
46
-
47
  if not os.path.exists(config.FEATURES_CSV):
48
  return """
49
  ## ❌ Error: Dataset Not Found
@@ -52,45 +56,78 @@ Please go to **Tab 1: Feature Extraction** first!
52
 
53
  Click "🔊 Extract Features" to process the dataset.
54
  """, None, None, ""
55
-
56
  try:
57
  if progress_callback:
58
  progress_callback(0, desc="Loading dataset...")
59
-
60
  # Load data
61
  df = pd.read_csv(config.FEATURES_CSV)
62
-
63
- feature_cols = [col for col in df.columns if col.startswith('feature_')]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  X = df[feature_cols].values
65
  y = df['emotion'].values
66
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  label_encoder = LabelEncoder()
68
  y_encoded = label_encoder.fit_transform(y)
69
-
70
  n_classes = len(label_encoder.classes_)
71
-
72
  training_log = ""
73
-
74
- # ========================================================================
75
- # CROSS-VALIDATION MODE
76
- # ========================================================================
77
  if use_cv:
78
  return _train_with_cross_validation(
79
  X, y_encoded, label_encoder, n_classes,
80
  use_ga, n_folds, ga_generations, ga_population, n_jobs,
 
81
  progress_callback
82
  )
83
-
84
- # ========================================================================
85
- # SINGLE SPLIT MODE (Original)
86
- # ========================================================================
87
  else:
88
  return _train_single_split(
89
  X, y_encoded, label_encoder, n_classes,
90
  use_ga, ga_generations, ga_population, n_jobs,
 
91
  progress_callback
92
  )
93
-
94
  except Exception as e:
95
  import traceback
96
  error_trace = traceback.format_exc()
@@ -99,175 +136,161 @@ Click "🔊 Extract Features" to process the dataset.
99
 
100
  def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
101
  use_ga, n_folds, ga_generations, ga_population, n_jobs,
 
102
  progress_callback):
103
  """
104
  Train with K-Fold Cross-Validation
105
  """
106
-
107
  print("="*80)
108
  print(f"{'K-FOLD CROSS-VALIDATION TRAINING':^80}")
109
  print("="*80)
110
  print(f"Number of folds: {n_folds}")
111
  print(f"Use GA: {use_ga}")
 
 
112
  print(f"Total samples: {len(X)}")
113
  print("="*80)
114
-
115
- # Initialize K-Fold
116
- skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=config.RANDOM_STATE)
117
-
118
- # Storage for results
119
  fold_results = []
120
  fold_models = []
121
  all_ga_history = []
122
  training_log = ""
123
-
124
- # Calculate progress steps
125
  total_steps = n_folds
126
  current_step = 0
127
-
128
- # Iterate through folds
129
  for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y_encoded), 1):
130
  fold_log = f"\n{'='*80}\n"
131
  fold_log += f"FOLD {fold_idx}/{n_folds}\n"
132
  fold_log += f"{'='*80}\n"
133
  print(fold_log)
134
  training_log += fold_log
135
-
136
  if progress_callback:
137
  base_progress = current_step / total_steps
138
- progress_callback(base_progress, desc=f"Fold {fold_idx}/{n_folds}: Preparing data...")
139
-
140
- # Split data
141
  X_train, X_test = X[train_idx], X[test_idx]
142
  y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]
143
-
144
  fold_log = f"Train samples: {len(X_train)}, Test samples: {len(X_test)}\n"
145
  print(fold_log)
146
  training_log += fold_log
147
-
148
- # Scale features
149
  scaler = StandardScaler()
150
  X_train_scaled = scaler.fit_transform(X_train)
151
  X_test_scaled = scaler.transform(X_test)
152
-
153
- # ====================================================================
154
- # GA OPTIMIZATION (if enabled)
155
- # ====================================================================
156
  if use_ga:
157
  if progress_callback:
158
- progress_callback(base_progress + 0.05/total_steps,
159
- desc=f"Fold {fold_idx}/{n_folds}: Splitting for GA...")
160
-
161
- # Split train into train + validation for GA
162
  X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
163
  X_train_scaled, y_train,
164
  test_size=0.2,
165
  random_state=config.RANDOM_STATE,
166
  stratify=y_train
167
  )
168
-
169
  if progress_callback:
170
  progress_callback(base_progress + 0.1/total_steps,
171
- desc=f"Fold {fold_idx}/{n_folds}: Running GA optimization...")
172
-
173
- # Initialize and run GA
174
- ga = GeneticAlgorithm(X_train_ga, y_train_ga, n_features_to_select=config.GA_CONFIG['n_features_to_select'])
175
  ga.population_size = ga_population
176
  ga.n_generations = ga_generations
177
-
178
  def ga_progress(p, desc):
179
  if progress_callback:
180
- # GA takes 60% of fold time
181
  ga_progress_in_fold = 0.1 + 0.6 * p
182
- progress_callback(base_progress + ga_progress_in_fold/total_steps,
183
- desc=f"Fold {fold_idx}/{n_folds}: {desc}")
184
-
185
  best_config = ga.evolve(
186
  X_train_ga, y_train_ga, X_val_ga, y_val_ga,
187
  progress_callback=ga_progress,
188
  n_jobs=n_jobs
189
  )
190
-
191
- # Store GA logs
192
  training_log += "\n".join(ga.log_messages) + "\n"
193
  all_ga_history.extend(ga.history)
194
-
195
  if best_config is None:
196
  fold_log = f"❌ GA optimization failed for Fold {fold_idx}\n"
197
  print(fold_log)
198
  training_log += fold_log
199
  continue
200
-
201
- # Use GA-selected features
202
  selected_indices = best_config['feature_indices']
203
  X_train_selected = X_train_scaled[:, selected_indices]
204
  X_test_selected = X_test_scaled[:, selected_indices]
205
-
206
  if progress_callback:
207
  progress_callback(base_progress + 0.7/total_steps,
208
- desc=f"Fold {fold_idx}/{n_folds}: Training models with GA config...")
209
-
210
- # Train models with GA config
211
  models, accuracies = _train_all_models(
212
  X_train_selected, y_train, X_test_selected, y_test,
213
  n_classes, best_config
214
  )
215
-
216
  weights = best_config['weights']
217
-
218
  fold_log = f"\n✅ GA optimization completed for Fold {fold_idx}\n"
219
  fold_log += f"Best fitness: {ga.best_fitness:.4f}\n"
220
  fold_log += f"Generations: {len(ga.history)}/{ga_generations}\n"
221
  print(fold_log)
222
  training_log += fold_log
223
-
224
- # ====================================================================
225
- # SIMPLE TRAINING (no GA)
226
- # ====================================================================
227
  else:
228
  if progress_callback:
229
  progress_callback(base_progress + 0.2/total_steps,
230
- desc=f"Fold {fold_idx}/{n_folds}: Selecting features...")
231
-
232
- # Select features by variance
233
- feature_variance = np.var(X_train_scaled, axis=0)
234
- selected_indices = np.argsort(feature_variance)[-config.GA_CONFIG['n_features_to_select']:]
235
-
 
 
 
236
  X_train_selected = X_train_scaled[:, selected_indices]
237
  X_test_selected = X_test_scaled[:, selected_indices]
238
-
239
  if progress_callback:
240
  progress_callback(base_progress + 0.3/total_steps,
241
- desc=f"Fold {fold_idx}/{n_folds}: Training models...")
242
-
243
  models, accuracies = _train_all_models_default(
244
  X_train_selected, y_train, X_test_selected, y_test,
245
  n_classes, progress_callback, fold_idx, n_folds, base_progress, total_steps
246
  )
247
-
248
- # Calculate weights based on accuracies
249
  acc_values = np.array(list(accuracies.values()))
250
  weights = acc_values / acc_values.sum()
251
-
252
- # ====================================================================
253
- # ENSEMBLE EVALUATION
254
- # ====================================================================
255
  if progress_callback:
256
  progress_callback(base_progress + 0.9/total_steps,
257
- desc=f"Fold {fold_idx}/{n_folds}: Evaluating ensemble...")
258
-
259
  predictions = [
260
  models['xgboost'].predict_proba(X_test_selected),
261
  models['lightgbm'].predict_proba(X_test_selected),
262
  models['gradientboosting'].predict_proba(X_test_selected),
263
  models['adaboost'].predict_proba(X_test_selected)
264
  ]
265
-
266
  ensemble_pred = np.average(predictions, axis=0, weights=weights)
267
  ensemble_labels = np.argmax(ensemble_pred, axis=1)
268
  ensemble_acc = accuracy_score(y_test, ensemble_labels)
269
-
270
- # Store results
271
  fold_result = {
272
  'fold': fold_idx,
273
  'xgboost': accuracies['xgboost'],
@@ -279,15 +302,14 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
279
  'n_test': len(X_test)
280
  }
281
  fold_results.append(fold_result)
282
-
283
  fold_models.append({
284
  'models': models,
285
  'scaler': scaler,
286
  'selected_indices': selected_indices,
287
  'weights': weights
288
  })
289
-
290
- # Print fold results
291
  fold_log = f"\n📊 Fold {fold_idx} Results:\n"
292
  fold_log += f" XGBoost: {accuracies['xgboost']:.4f}\n"
293
  fold_log += f" LightGBM: {accuracies['lightgbm']:.4f}\n"
@@ -296,45 +318,41 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
296
  fold_log += f" Ensemble: {ensemble_acc:.4f} ⭐\n"
297
  print(fold_log)
298
  training_log += fold_log
299
-
300
  current_step += 1
301
-
302
- # ========================================================================
303
- # AGGREGATE RESULTS
304
- # ========================================================================
305
  if len(fold_results) == 0:
306
  return "❌ All folds failed", None, None, training_log
307
-
308
  results_df = pd.DataFrame(fold_results)
309
-
310
- # Calculate statistics
311
  stats_log = f"\n{'='*80}\n"
312
  stats_log += f"{'CROSS-VALIDATION SUMMARY':^80}\n"
313
  stats_log += f"{'='*80}\n\n"
314
-
315
  stats_log += "Per-Fold Results:\n"
316
  stats_log += results_df.to_string(index=False) + "\n\n"
317
-
318
  stats_log += "="*80 + "\n"
319
  stats_log += "SUMMARY STATISTICS\n"
320
  stats_log += "="*80 + "\n"
321
-
322
  stats_summary = []
323
-
324
  for model_name in ['xgboost', 'lightgbm', 'gradientboosting', 'adaboost', 'ensemble']:
325
  scores = results_df[model_name].values
326
  mean_score = scores.mean()
327
  std_score = scores.std()
328
-
329
  model_stats = f"\n{model_name.upper()}:\n"
330
  model_stats += f" Mean Accuracy: {mean_score:.4f}\n"
331
  model_stats += f" Std Deviation: {std_score:.4f}\n"
332
  model_stats += f" 95% CI: [{mean_score - 1.96*std_score:.4f}, {mean_score + 1.96*std_score:.4f}]\n"
333
  model_stats += f" Min: {scores.min():.4f}\n"
334
  model_stats += f" Max: {scores.max():.4f}\n"
335
-
336
  stats_log += model_stats
337
-
338
  stats_summary.append({
339
  'Model': model_name.upper(),
340
  'Mean': mean_score,
@@ -342,27 +360,24 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
342
  'Min': scores.min(),
343
  'Max': scores.max()
344
  })
345
-
346
  print(stats_log)
347
  training_log += stats_log
348
-
349
- # ========================================================================
350
- # SELECT AND SAVE BEST MODEL
351
- # ========================================================================
352
  best_fold_idx = results_df['ensemble'].idxmax()
353
  best_fold = fold_results[best_fold_idx]
354
  best_models = fold_models[best_fold_idx]
355
-
356
  save_log = f"\n{'='*80}\n"
357
  save_log += f"Best performing fold: Fold {best_fold['fold']} (Ensemble: {best_fold['ensemble']:.4f})\n"
358
  save_log += "Saving this model...\n"
359
  save_log += "="*80 + "\n"
360
  print(save_log)
361
  training_log += save_log
362
-
363
  if progress_callback:
364
  progress_callback(0.95, desc="Saving best model...")
365
-
366
  _save_models(
367
  best_models['models'],
368
  best_models['scaler'],
@@ -378,18 +393,14 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
378
  best_fold['ensemble'],
379
  cv_results=results_df.to_dict('records')
380
  )
381
-
382
  if progress_callback:
383
  progress_callback(1.0, desc="Complete!")
384
-
385
- # ========================================================================
386
- # CREATE SUMMARY
387
- # ========================================================================
388
-
389
  ensemble_mean = results_df['ensemble'].mean()
390
  ensemble_std = results_df['ensemble'].std()
391
  consistency = (1 - ensemble_std / ensemble_mean) * 100
392
-
393
  summary = f"""
394
  ## ✅ Cross-Validation Training Complete!
395
 
@@ -423,70 +434,68 @@ Best performing fold (Fold {best_fold['fold']}) saved to `weights/`
423
 
424
  📝 **Note**: This is a more reliable estimate than single train/test split!
425
  """
426
-
427
- # GA history dataframe (if GA was used)
428
  ga_history_df = None
429
  if use_ga and len(all_ga_history) > 0:
430
  ga_history_df = pd.DataFrame(all_ga_history)
431
-
432
- # Summary stats dataframe
433
  summary_stats_df = pd.DataFrame(stats_summary)
434
-
435
  return summary, summary_stats_df, ga_history_df, training_log
436
 
437
 
438
  def _train_single_split(X, y_encoded, label_encoder, n_classes,
439
- use_ga, ga_generations, ga_population, n_jobs,
440
- progress_callback):
 
441
  """
442
  Train with single train/test split (Original method)
443
  """
444
-
445
- # Train/test split
446
  X_train, X_test, y_train, y_test = train_test_split(
447
  X, y_encoded,
448
  test_size=config.TRAIN_TEST_SPLIT,
449
  random_state=config.RANDOM_STATE,
450
  stratify=y_encoded
451
  )
452
-
453
  if progress_callback:
454
  progress_callback(0.1, desc="Scaling features...")
455
-
456
  scaler = StandardScaler()
457
  X_train_scaled = scaler.fit_transform(X_train)
458
  X_test_scaled = scaler.transform(X_test)
459
-
460
  training_log = ""
461
-
462
  if use_ga:
463
- # GA optimization
464
  if progress_callback:
465
  progress_callback(0.2, desc="Initializing GA...")
466
-
467
  X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
468
  X_train_scaled, y_train,
469
  test_size=0.2,
470
  random_state=config.RANDOM_STATE,
471
  stratify=y_train
472
  )
473
-
474
- ga = GeneticAlgorithm(X_train_ga, y_train_ga, n_features_to_select=config.GA_CONFIG['n_features_to_select'])
 
475
  ga.population_size = ga_population
476
  ga.n_generations = ga_generations
477
-
478
  def ga_progress(p, desc):
479
  if progress_callback:
480
  progress_callback(0.2 + 0.6*p, desc=desc)
481
-
482
  best_config = ga.evolve(
483
  X_train_ga, y_train_ga, X_val_ga, y_val_ga,
484
  progress_callback=ga_progress,
485
  n_jobs=n_jobs
486
  )
487
-
488
  training_log = "\n".join(ga.log_messages)
489
-
490
  if best_config is None:
491
  error_msg = """
492
  ## ❌ GA Optimization Failed
@@ -505,28 +514,30 @@ The genetic algorithm did not produce a valid configuration.
505
  **Training Log:**
506
  """
507
  return error_msg + training_log, None, None, training_log
508
-
509
  if progress_callback:
510
- progress_callback(0.8, desc="Training final models with GA config...")
511
-
 
512
  selected_indices = best_config['feature_indices']
513
  X_train_selected = X_train_scaled[:, selected_indices]
514
  X_test_selected = X_test_scaled[:, selected_indices]
515
-
516
- # Train models with GA config
517
  models, accuracies = _train_all_models(
518
  X_train_selected, y_train, X_test_selected, y_test,
519
  n_classes, best_config
520
  )
521
-
522
  weights = best_config['weights']
523
-
524
  ga_summary = f"""
525
  ### 🧬 GA Optimization Results:
526
  - **Generations Completed**: {len(ga.history)}/{ga_generations}
527
  - **Population Size**: {ga_population}
528
  - **Best Fitness**: {ga.best_fitness:.4f}
529
  - **Parallel Jobs**: {n_jobs}
 
 
530
 
531
  ### 🎯 Best Configuration:
532
  - **XGBoost**: n_est={best_config['xgb_n_estimators']}, depth={best_config['xgb_max_depth']}, lr={best_config['xgb_learning_rate']}
@@ -534,59 +545,58 @@ The genetic algorithm did not produce a valid configuration.
534
  - **Gradient Boosting**: n_est={best_config['gb_n_estimators']}, depth={best_config['gb_max_depth']}, lr={best_config['gb_learning_rate']}
535
  - **AdaBoost**: n_est={best_config['ada_n_estimators']}, lr={best_config['ada_learning_rate']}
536
  """
537
-
538
  ga_history_df = pd.DataFrame(ga.history)
539
-
540
  else:
541
- # Simple training without GA
542
  if progress_callback:
543
- progress_callback(0.3, desc="Selecting features (variance)...")
544
-
545
- feature_variance = np.var(X_train_scaled, axis=0)
546
- selected_indices = np.argsort(feature_variance)[-config.GA_CONFIG['n_features_to_select']:]
547
-
 
 
 
 
548
  X_train_selected = X_train_scaled[:, selected_indices]
549
  X_test_selected = X_test_scaled[:, selected_indices]
550
-
551
  models, accuracies = _train_all_models_default(
552
  X_train_selected, y_train, X_test_selected, y_test,
553
  n_classes, progress_callback
554
  )
555
-
556
- # Calculate weights based on accuracies
557
  acc_values = list(accuracies.values())
558
  weights = np.array(acc_values) / sum(acc_values)
559
-
560
- ga_summary = "\n### ⚡ Simple Training (No GA)\n"
561
  ga_history_df = None
562
  training_log = "Simple training mode - no GA logs"
563
-
564
  if progress_callback:
565
  progress_callback(0.9, desc="Creating ensemble...")
566
-
567
- # Ensemble evaluation
568
  predictions = [
569
  models['xgboost'].predict_proba(X_test_selected),
570
  models['lightgbm'].predict_proba(X_test_selected),
571
  models['gradientboosting'].predict_proba(X_test_selected),
572
  models['adaboost'].predict_proba(X_test_selected)
573
  ]
574
-
575
  ensemble_pred = np.average(predictions, axis=0, weights=weights)
576
  ensemble_labels = np.argmax(ensemble_pred, axis=1)
577
  ensemble_acc = accuracy_score(y_test, ensemble_labels)
578
-
579
  if progress_callback:
580
  progress_callback(0.95, desc="Saving models...")
581
-
582
- # Save models
583
  _save_models(models, scaler, label_encoder, selected_indices, weights,
584
- accuracies, ensemble_acc)
585
-
586
  if progress_callback:
587
  progress_callback(1.0, desc="Complete!")
588
-
589
- # Create results table
590
  results_df = pd.DataFrame({
591
  'Model': ['XGBoost', 'LightGBM', 'Gradient Boosting', 'AdaBoost', 'Ensemble'],
592
  'Test Accuracy': [
@@ -597,7 +607,7 @@ The genetic algorithm did not produce a valid configuration.
597
  ensemble_acc
598
  ]
599
  })
600
-
601
  summary = f"""
602
  ## ✅ Training Complete!
603
 
@@ -628,7 +638,7 @@ The genetic algorithm did not produce a valid configuration.
628
 
629
  ⚠️ **Note**: Single train/test split. For more reliable results, use Cross-Validation!
630
  """
631
-
632
  return summary, results_df, ga_history_df, training_log
633
 
634
 
@@ -636,12 +646,15 @@ def _train_all_models(X_train, y_train, X_test, y_test, n_classes, config_dict):
636
  """Train all models with given configuration"""
637
  models = {}
638
  accuracies = {}
639
-
640
- # XGBoost
641
  xgb = XGBClassifier(
642
  n_estimators=config_dict['xgb_n_estimators'],
643
  max_depth=config_dict['xgb_max_depth'],
644
  learning_rate=config_dict['xgb_learning_rate'],
 
 
 
 
645
  objective='multi:softprob',
646
  num_class=n_classes,
647
  random_state=config.RANDOM_STATE,
@@ -651,60 +664,66 @@ def _train_all_models(X_train, y_train, X_test, y_test, n_classes, config_dict):
651
  xgb.fit(X_train, y_train)
652
  models['xgboost'] = xgb
653
  accuracies['xgboost'] = xgb.score(X_test, y_test)
654
-
655
- # LightGBM
656
  lgbm = LGBMClassifier(
657
  n_estimators=config_dict['lgbm_n_estimators'],
658
  num_leaves=config_dict['lgbm_num_leaves'],
659
  learning_rate=config_dict['lgbm_learning_rate'],
 
 
 
 
 
660
  objective='multiclass',
661
  num_class=n_classes,
662
  random_state=config.RANDOM_STATE,
663
  n_jobs=-1,
664
- verbose=-1
 
665
  )
666
  lgbm.fit(X_train, y_train)
667
  models['lightgbm'] = lgbm
668
  accuracies['lightgbm'] = lgbm.score(X_test, y_test)
669
-
670
- # Gradient Boosting
671
  gb = GradientBoostingClassifier(
672
  n_estimators=config_dict['gb_n_estimators'],
673
  max_depth=config_dict['gb_max_depth'],
674
  learning_rate=config_dict['gb_learning_rate'],
 
 
 
675
  random_state=config.RANDOM_STATE
676
  )
677
  gb.fit(X_train, y_train)
678
  models['gradientboosting'] = gb
679
  accuracies['gradientboosting'] = gb.score(X_test, y_test)
680
-
681
- # AdaBoost
682
  ada = AdaBoostClassifier(
683
  n_estimators=config_dict['ada_n_estimators'],
684
  learning_rate=config_dict['ada_learning_rate'],
685
- algorithm='SAMME',
686
  random_state=config.RANDOM_STATE
687
  )
688
  ada.fit(X_train, y_train)
689
  models['adaboost'] = ada
690
  accuracies['adaboost'] = ada.score(X_test, y_test)
691
-
692
  return models, accuracies
693
 
694
 
695
- def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
696
- progress_callback=None, fold_idx=None, n_folds=None,
697
  base_progress=0, total_steps=1):
698
  """Train all models with default hyperparameters"""
699
  models = {}
700
  accuracies = {}
701
-
702
  if progress_callback and fold_idx:
703
- progress_callback(base_progress + 0.4/total_steps,
704
- desc=f"Fold {fold_idx}/{n_folds}: Training XGBoost...")
705
  elif progress_callback:
706
  progress_callback(0.4, desc="Training XGBoost...")
707
-
708
  xgb = XGBClassifier(
709
  n_estimators=150, max_depth=5, learning_rate=0.1,
710
  objective='multi:softprob', num_class=n_classes,
@@ -713,28 +732,29 @@ def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
713
  xgb.fit(X_train, y_train)
714
  models['xgboost'] = xgb
715
  accuracies['xgboost'] = xgb.score(X_test, y_test)
716
-
717
  if progress_callback and fold_idx:
718
  progress_callback(base_progress + 0.5/total_steps,
719
- desc=f"Fold {fold_idx}/{n_folds}: Training LightGBM...")
720
  elif progress_callback:
721
  progress_callback(0.5, desc="Training LightGBM...")
722
-
723
  lgbm = LGBMClassifier(
724
  n_estimators=150, num_leaves=40, learning_rate=0.1,
725
  objective='multiclass', num_class=n_classes,
726
- random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1
 
727
  )
728
  lgbm.fit(X_train, y_train)
729
  models['lightgbm'] = lgbm
730
  accuracies['lightgbm'] = lgbm.score(X_test, y_test)
731
-
732
  if progress_callback and fold_idx:
733
  progress_callback(base_progress + 0.65/total_steps,
734
- desc=f"Fold {fold_idx}/{n_folds}: Training Gradient Boosting...")
735
  elif progress_callback:
736
  progress_callback(0.65, desc="Training Gradient Boosting...")
737
-
738
  gb = GradientBoostingClassifier(
739
  n_estimators=100, max_depth=4, learning_rate=0.1,
740
  random_state=config.RANDOM_STATE
@@ -742,50 +762,49 @@ def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
742
  gb.fit(X_train, y_train)
743
  models['gradientboosting'] = gb
744
  accuracies['gradientboosting'] = gb.score(X_test, y_test)
745
-
746
  if progress_callback and fold_idx:
747
  progress_callback(base_progress + 0.8/total_steps,
748
- desc=f"Fold {fold_idx}/{n_folds}: Training AdaBoost...")
749
  elif progress_callback:
750
  progress_callback(0.8, desc="Training AdaBoost...")
751
-
752
  ada = AdaBoostClassifier(
753
- n_estimators=100, learning_rate=1.0, algorithm='SAMME',
 
 
754
  random_state=config.RANDOM_STATE
755
  )
756
  ada.fit(X_train, y_train)
757
  models['adaboost'] = ada
758
  accuracies['adaboost'] = ada.score(X_test, y_test)
759
-
760
  return models, accuracies
761
 
762
 
763
- def _save_models(models, scaler, label_encoder, selected_indices, weights,
764
- accuracies, ensemble_acc, cv_results=None):
765
  """Save all models and configuration"""
766
  config.WEIGHTS_DIR.mkdir(exist_ok=True)
767
-
768
- # Save models
769
  with open(config.WEIGHTS_DIR / 'xgboost_model.pkl', 'wb') as f:
770
  pickle.dump(models['xgboost'], f)
771
-
772
  with open(config.WEIGHTS_DIR / 'lightgbm_model.pkl', 'wb') as f:
773
  pickle.dump(models['lightgbm'], f)
774
-
775
  with open(config.WEIGHTS_DIR / 'gradientboost_model.pkl', 'wb') as f:
776
  pickle.dump(models['gradientboosting'], f)
777
-
778
  with open(config.WEIGHTS_DIR / 'adaboost_model.pkl', 'wb') as f:
779
  pickle.dump(models['adaboost'], f)
780
-
781
- # Save preprocessing
782
  with open(config.WEIGHTS_DIR / 'scaler.pkl', 'wb') as f:
783
  pickle.dump(scaler, f)
784
-
785
  with open(config.WEIGHTS_DIR / 'label_encoder.pkl', 'wb') as f:
786
  pickle.dump(label_encoder, f)
787
-
788
- # Save configuration
789
  model_config = {
790
  'selected_features': selected_indices.tolist(),
791
  'ensemble_weights': weights.tolist(),
@@ -799,13 +818,12 @@ def _save_models(models, scaler, label_encoder, selected_indices, weights,
799
  'ensemble': float(ensemble_acc)
800
  }
801
  }
802
-
803
- # Add CV results if available
804
  if cv_results is not None:
805
  model_config['cv_results'] = cv_results
806
- model_config['training_method'] = 'k-fold-cv'
807
  else:
808
- model_config['training_method'] = 'single-split'
809
-
810
  with open(config.WEIGHTS_DIR / 'config.json', 'w') as f:
811
- json.dump(model_config, f, indent=2)
 
1
  """
2
+ Model training functions with K-Fold Cross-Validation
3
  """
4
 
5
  import os
 
22
 
23
 
24
  def train_models_with_ga(use_ga: bool = True,
25
+ use_cv: bool = False,
26
+ n_folds: int = 5,
27
+ ga_generations: int = 20,
28
+ ga_population: int = 15,
29
+ n_jobs: int = 2,
30
+ optimize_features: bool = True,
31
+ n_features_select: int = 100,
32
+ progress_callback: Optional[callable] = None) -> Tuple[str, pd.DataFrame, Optional[pd.DataFrame], str]:
33
  """
34
  Train models with or without GA optimization and optional K-Fold CV
35
+
36
  Args:
37
  use_ga: Whether to use GA optimization
38
  use_cv: Whether to use K-Fold Cross-Validation
 
40
  ga_generations: Number of GA generations
41
  ga_population: GA population size
42
  n_jobs: Number of parallel jobs
43
+ optimize_features: Whether GA should optimize feature selection
44
+ n_features_select: Number of features to select
45
  progress_callback: Optional progress callback function
46
+
47
  Returns:
48
  tuple: (summary_text, results_df, ga_history_df, training_log)
49
  """
50
+
51
  if not os.path.exists(config.FEATURES_CSV):
52
  return """
53
  ## ❌ Error: Dataset Not Found
 
56
 
57
  Click "🔊 Extract Features" to process the dataset.
58
  """, None, None, ""
59
+
60
  try:
61
  if progress_callback:
62
  progress_callback(0, desc="Loading dataset...")
63
+
64
  # Load data
65
  df = pd.read_csv(config.FEATURES_CSV)
66
+
67
+ # Extract only numeric feature columns
68
+ feature_cols = [col for col in df.columns
69
+ if col.startswith('feature_')
70
+ and col.replace('feature_', '').isdigit()]
71
+
72
+ feature_cols = sorted(
73
+ feature_cols, key=lambda x: int(x.replace('feature_', '')))
74
+
75
+ if len(feature_cols) == 0:
76
+ return """
77
+ ## ❌ Error: No numeric feature columns found!
78
+
79
+ Please re-run feature extraction in Tab 1.
80
+ """, None, None, ""
81
+
82
  X = df[feature_cols].values
83
  y = df['emotion'].values
84
+
85
+ # Adjust n_features_select based on available features
86
+ n_features_available = X.shape[1]
87
+
88
+ if not optimize_features:
89
+ n_features_select = n_features_available
90
+ print(f"✅ Feature Selection: DISABLED")
91
+ print(f" Using all {n_features_available} features")
92
+ else:
93
+ if n_features_select > n_features_available:
94
+ print(
95
+ f"⚠️ Requested {n_features_select} features, but only {n_features_available} available")
96
+ print(f" Auto-adjusting to {n_features_available}")
97
+ n_features_select = n_features_available
98
+ else:
99
+ print(f"✅ Feature Selection: ENABLED")
100
+ print(
101
+ f" Selecting {n_features_select}/{n_features_available} features ({n_features_select/n_features_available*100:.1f}%)")
102
+
103
+ print(f"✅ Dataset loaded:")
104
+ print(f" - Total features: {n_features_available}")
105
+ print(f" - Features for GA: {n_features_select}")
106
+ print(f" - Shape: {X.shape}")
107
+ print(f" - Samples: {len(y)}")
108
+
109
  label_encoder = LabelEncoder()
110
  y_encoded = label_encoder.fit_transform(y)
111
+
112
  n_classes = len(label_encoder.classes_)
113
+
114
  training_log = ""
115
+
 
 
 
116
  if use_cv:
117
  return _train_with_cross_validation(
118
  X, y_encoded, label_encoder, n_classes,
119
  use_ga, n_folds, ga_generations, ga_population, n_jobs,
120
+ optimize_features, n_features_select,
121
  progress_callback
122
  )
 
 
 
 
123
  else:
124
  return _train_single_split(
125
  X, y_encoded, label_encoder, n_classes,
126
  use_ga, ga_generations, ga_population, n_jobs,
127
+ optimize_features, n_features_select,
128
  progress_callback
129
  )
130
+
131
  except Exception as e:
132
  import traceback
133
  error_trace = traceback.format_exc()
 
136
 
137
  def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
138
  use_ga, n_folds, ga_generations, ga_population, n_jobs,
139
+ optimize_features, n_features_select,
140
  progress_callback):
141
  """
142
  Train with K-Fold Cross-Validation
143
  """
144
+
145
  print("="*80)
146
  print(f"{'K-FOLD CROSS-VALIDATION TRAINING':^80}")
147
  print("="*80)
148
  print(f"Number of folds: {n_folds}")
149
  print(f"Use GA: {use_ga}")
150
+ print(f"Optimize Features: {optimize_features}")
151
+ print(f"Features to select: {n_features_select}")
152
  print(f"Total samples: {len(X)}")
153
  print("="*80)
154
+
155
+ skf = StratifiedKFold(n_splits=n_folds, shuffle=True,
156
+ random_state=config.RANDOM_STATE)
157
+
 
158
  fold_results = []
159
  fold_models = []
160
  all_ga_history = []
161
  training_log = ""
162
+
 
163
  total_steps = n_folds
164
  current_step = 0
165
+
 
166
  for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y_encoded), 1):
167
  fold_log = f"\n{'='*80}\n"
168
  fold_log += f"FOLD {fold_idx}/{n_folds}\n"
169
  fold_log += f"{'='*80}\n"
170
  print(fold_log)
171
  training_log += fold_log
172
+
173
  if progress_callback:
174
  base_progress = current_step / total_steps
175
+ progress_callback(
176
+ base_progress, desc=f"Fold {fold_idx}/{n_folds}: Preparing data...")
177
+
178
  X_train, X_test = X[train_idx], X[test_idx]
179
  y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]
180
+
181
  fold_log = f"Train samples: {len(X_train)}, Test samples: {len(X_test)}\n"
182
  print(fold_log)
183
  training_log += fold_log
184
+
 
185
  scaler = StandardScaler()
186
  X_train_scaled = scaler.fit_transform(X_train)
187
  X_test_scaled = scaler.transform(X_test)
188
+
 
 
 
189
  if use_ga:
190
  if progress_callback:
191
+ progress_callback(base_progress + 0.05/total_steps,
192
+ desc=f"Fold {fold_idx}/{n_folds}: Splitting for GA...")
193
+
 
194
  X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
195
  X_train_scaled, y_train,
196
  test_size=0.2,
197
  random_state=config.RANDOM_STATE,
198
  stratify=y_train
199
  )
200
+
201
  if progress_callback:
202
  progress_callback(base_progress + 0.1/total_steps,
203
+ desc=f"Fold {fold_idx}/{n_folds}: Running GA optimization...")
204
+
205
+ ga = GeneticAlgorithm(X_train_ga, y_train_ga,
206
+ n_features_to_select=n_features_select)
207
  ga.population_size = ga_population
208
  ga.n_generations = ga_generations
209
+
210
  def ga_progress(p, desc):
211
  if progress_callback:
 
212
  ga_progress_in_fold = 0.1 + 0.6 * p
213
+ progress_callback(base_progress + ga_progress_in_fold/total_steps,
214
+ desc=f"Fold {fold_idx}/{n_folds}: {desc}")
215
+
216
  best_config = ga.evolve(
217
  X_train_ga, y_train_ga, X_val_ga, y_val_ga,
218
  progress_callback=ga_progress,
219
  n_jobs=n_jobs
220
  )
221
+
 
222
  training_log += "\n".join(ga.log_messages) + "\n"
223
  all_ga_history.extend(ga.history)
224
+
225
  if best_config is None:
226
  fold_log = f"❌ GA optimization failed for Fold {fold_idx}\n"
227
  print(fold_log)
228
  training_log += fold_log
229
  continue
230
+
 
231
  selected_indices = best_config['feature_indices']
232
  X_train_selected = X_train_scaled[:, selected_indices]
233
  X_test_selected = X_test_scaled[:, selected_indices]
234
+
235
  if progress_callback:
236
  progress_callback(base_progress + 0.7/total_steps,
237
+ desc=f"Fold {fold_idx}/{n_folds}: Training models with GA config...")
238
+
 
239
  models, accuracies = _train_all_models(
240
  X_train_selected, y_train, X_test_selected, y_test,
241
  n_classes, best_config
242
  )
243
+
244
  weights = best_config['weights']
245
+
246
  fold_log = f"\n✅ GA optimization completed for Fold {fold_idx}\n"
247
  fold_log += f"Best fitness: {ga.best_fitness:.4f}\n"
248
  fold_log += f"Generations: {len(ga.history)}/{ga_generations}\n"
249
  print(fold_log)
250
  training_log += fold_log
251
+
 
 
 
252
  else:
253
  if progress_callback:
254
  progress_callback(base_progress + 0.2/total_steps,
255
+ desc=f"Fold {fold_idx}/{n_folds}: Selecting features...")
256
+
257
+ if not optimize_features:
258
+ selected_indices = np.arange(X_train_scaled.shape[1])
259
+ else:
260
+ feature_variance = np.var(X_train_scaled, axis=0)
261
+ selected_indices = np.argsort(
262
+ feature_variance)[-n_features_select:]
263
+
264
  X_train_selected = X_train_scaled[:, selected_indices]
265
  X_test_selected = X_test_scaled[:, selected_indices]
266
+
267
  if progress_callback:
268
  progress_callback(base_progress + 0.3/total_steps,
269
+ desc=f"Fold {fold_idx}/{n_folds}: Training models...")
270
+
271
  models, accuracies = _train_all_models_default(
272
  X_train_selected, y_train, X_test_selected, y_test,
273
  n_classes, progress_callback, fold_idx, n_folds, base_progress, total_steps
274
  )
275
+
 
276
  acc_values = np.array(list(accuracies.values()))
277
  weights = acc_values / acc_values.sum()
278
+
 
 
 
279
  if progress_callback:
280
  progress_callback(base_progress + 0.9/total_steps,
281
+ desc=f"Fold {fold_idx}/{n_folds}: Evaluating ensemble...")
282
+
283
  predictions = [
284
  models['xgboost'].predict_proba(X_test_selected),
285
  models['lightgbm'].predict_proba(X_test_selected),
286
  models['gradientboosting'].predict_proba(X_test_selected),
287
  models['adaboost'].predict_proba(X_test_selected)
288
  ]
289
+
290
  ensemble_pred = np.average(predictions, axis=0, weights=weights)
291
  ensemble_labels = np.argmax(ensemble_pred, axis=1)
292
  ensemble_acc = accuracy_score(y_test, ensemble_labels)
293
+
 
294
  fold_result = {
295
  'fold': fold_idx,
296
  'xgboost': accuracies['xgboost'],
 
302
  'n_test': len(X_test)
303
  }
304
  fold_results.append(fold_result)
305
+
306
  fold_models.append({
307
  'models': models,
308
  'scaler': scaler,
309
  'selected_indices': selected_indices,
310
  'weights': weights
311
  })
312
+
 
313
  fold_log = f"\n📊 Fold {fold_idx} Results:\n"
314
  fold_log += f" XGBoost: {accuracies['xgboost']:.4f}\n"
315
  fold_log += f" LightGBM: {accuracies['lightgbm']:.4f}\n"
 
318
  fold_log += f" Ensemble: {ensemble_acc:.4f} ⭐\n"
319
  print(fold_log)
320
  training_log += fold_log
321
+
322
  current_step += 1
323
+
 
 
 
324
  if len(fold_results) == 0:
325
  return "❌ All folds failed", None, None, training_log
326
+
327
  results_df = pd.DataFrame(fold_results)
328
+
 
329
  stats_log = f"\n{'='*80}\n"
330
  stats_log += f"{'CROSS-VALIDATION SUMMARY':^80}\n"
331
  stats_log += f"{'='*80}\n\n"
332
+
333
  stats_log += "Per-Fold Results:\n"
334
  stats_log += results_df.to_string(index=False) + "\n\n"
335
+
336
  stats_log += "="*80 + "\n"
337
  stats_log += "SUMMARY STATISTICS\n"
338
  stats_log += "="*80 + "\n"
339
+
340
  stats_summary = []
341
+
342
  for model_name in ['xgboost', 'lightgbm', 'gradientboosting', 'adaboost', 'ensemble']:
343
  scores = results_df[model_name].values
344
  mean_score = scores.mean()
345
  std_score = scores.std()
346
+
347
  model_stats = f"\n{model_name.upper()}:\n"
348
  model_stats += f" Mean Accuracy: {mean_score:.4f}\n"
349
  model_stats += f" Std Deviation: {std_score:.4f}\n"
350
  model_stats += f" 95% CI: [{mean_score - 1.96*std_score:.4f}, {mean_score + 1.96*std_score:.4f}]\n"
351
  model_stats += f" Min: {scores.min():.4f}\n"
352
  model_stats += f" Max: {scores.max():.4f}\n"
353
+
354
  stats_log += model_stats
355
+
356
  stats_summary.append({
357
  'Model': model_name.upper(),
358
  'Mean': mean_score,
 
360
  'Min': scores.min(),
361
  'Max': scores.max()
362
  })
363
+
364
  print(stats_log)
365
  training_log += stats_log
366
+
 
 
 
367
  best_fold_idx = results_df['ensemble'].idxmax()
368
  best_fold = fold_results[best_fold_idx]
369
  best_models = fold_models[best_fold_idx]
370
+
371
  save_log = f"\n{'='*80}\n"
372
  save_log += f"Best performing fold: Fold {best_fold['fold']} (Ensemble: {best_fold['ensemble']:.4f})\n"
373
  save_log += "Saving this model...\n"
374
  save_log += "="*80 + "\n"
375
  print(save_log)
376
  training_log += save_log
377
+
378
  if progress_callback:
379
  progress_callback(0.95, desc="Saving best model...")
380
+
381
  _save_models(
382
  best_models['models'],
383
  best_models['scaler'],
 
393
  best_fold['ensemble'],
394
  cv_results=results_df.to_dict('records')
395
  )
396
+
397
  if progress_callback:
398
  progress_callback(1.0, desc="Complete!")
399
+
 
 
 
 
400
  ensemble_mean = results_df['ensemble'].mean()
401
  ensemble_std = results_df['ensemble'].std()
402
  consistency = (1 - ensemble_std / ensemble_mean) * 100
403
+
404
  summary = f"""
405
  ## ✅ Cross-Validation Training Complete!
406
 
 
434
 
435
  📝 **Note**: This is a more reliable estimate than single train/test split!
436
  """
437
+
 
438
  ga_history_df = None
439
  if use_ga and len(all_ga_history) > 0:
440
  ga_history_df = pd.DataFrame(all_ga_history)
441
+
 
442
  summary_stats_df = pd.DataFrame(stats_summary)
443
+
444
  return summary, summary_stats_df, ga_history_df, training_log
445
 
446
 
447
  def _train_single_split(X, y_encoded, label_encoder, n_classes,
448
+ use_ga, ga_generations, ga_population, n_jobs,
449
+ optimize_features, n_features_select,
450
+ progress_callback):
451
  """
452
  Train with single train/test split (Original method)
453
  """
454
+
 
455
  X_train, X_test, y_train, y_test = train_test_split(
456
  X, y_encoded,
457
  test_size=config.TRAIN_TEST_SPLIT,
458
  random_state=config.RANDOM_STATE,
459
  stratify=y_encoded
460
  )
461
+
462
  if progress_callback:
463
  progress_callback(0.1, desc="Scaling features...")
464
+
465
  scaler = StandardScaler()
466
  X_train_scaled = scaler.fit_transform(X_train)
467
  X_test_scaled = scaler.transform(X_test)
468
+
469
  training_log = ""
470
+
471
  if use_ga:
 
472
  if progress_callback:
473
  progress_callback(0.2, desc="Initializing GA...")
474
+
475
  X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
476
  X_train_scaled, y_train,
477
  test_size=0.2,
478
  random_state=config.RANDOM_STATE,
479
  stratify=y_train
480
  )
481
+
482
+ ga = GeneticAlgorithm(X_train_ga, y_train_ga,
483
+ n_features_to_select=n_features_select)
484
  ga.population_size = ga_population
485
  ga.n_generations = ga_generations
486
+
487
  def ga_progress(p, desc):
488
  if progress_callback:
489
  progress_callback(0.2 + 0.6*p, desc=desc)
490
+
491
  best_config = ga.evolve(
492
  X_train_ga, y_train_ga, X_val_ga, y_val_ga,
493
  progress_callback=ga_progress,
494
  n_jobs=n_jobs
495
  )
496
+
497
  training_log = "\n".join(ga.log_messages)
498
+
499
  if best_config is None:
500
  error_msg = """
501
  ## ❌ GA Optimization Failed
 
514
  **Training Log:**
515
  """
516
  return error_msg + training_log, None, None, training_log
517
+
518
  if progress_callback:
519
+ progress_callback(
520
+ 0.8, desc="Training final models with GA config...")
521
+
522
  selected_indices = best_config['feature_indices']
523
  X_train_selected = X_train_scaled[:, selected_indices]
524
  X_test_selected = X_test_scaled[:, selected_indices]
525
+
 
526
  models, accuracies = _train_all_models(
527
  X_train_selected, y_train, X_test_selected, y_test,
528
  n_classes, best_config
529
  )
530
+
531
  weights = best_config['weights']
532
+
533
  ga_summary = f"""
534
  ### 🧬 GA Optimization Results:
535
  - **Generations Completed**: {len(ga.history)}/{ga_generations}
536
  - **Population Size**: {ga_population}
537
  - **Best Fitness**: {ga.best_fitness:.4f}
538
  - **Parallel Jobs**: {n_jobs}
539
+ - **Feature Selection**: {'Enabled' if optimize_features else 'Disabled'}
540
+ - **Features Used**: {len(selected_indices)}
541
 
542
  ### 🎯 Best Configuration:
543
  - **XGBoost**: n_est={best_config['xgb_n_estimators']}, depth={best_config['xgb_max_depth']}, lr={best_config['xgb_learning_rate']}
 
545
  - **Gradient Boosting**: n_est={best_config['gb_n_estimators']}, depth={best_config['gb_max_depth']}, lr={best_config['gb_learning_rate']}
546
  - **AdaBoost**: n_est={best_config['ada_n_estimators']}, lr={best_config['ada_learning_rate']}
547
  """
548
+
549
  ga_history_df = pd.DataFrame(ga.history)
550
+
551
  else:
 
552
  if progress_callback:
553
+ progress_callback(0.3, desc="Selecting features...")
554
+
555
+ if not optimize_features:
556
+ selected_indices = np.arange(X_train_scaled.shape[1])
557
+ else:
558
+ feature_variance = np.var(X_train_scaled, axis=0)
559
+ selected_indices = np.argsort(
560
+ feature_variance)[-n_features_select:]
561
+
562
  X_train_selected = X_train_scaled[:, selected_indices]
563
  X_test_selected = X_test_scaled[:, selected_indices]
564
+
565
  models, accuracies = _train_all_models_default(
566
  X_train_selected, y_train, X_test_selected, y_test,
567
  n_classes, progress_callback
568
  )
569
+
 
570
  acc_values = list(accuracies.values())
571
  weights = np.array(acc_values) / sum(acc_values)
572
+
573
+ ga_summary = f"\n### ⚡ Simple Training (No GA)\n- **Feature Selection**: {'Enabled' if optimize_features else 'Disabled'}\n- **Features Used**: {len(selected_indices)}\n"
574
  ga_history_df = None
575
  training_log = "Simple training mode - no GA logs"
576
+
577
  if progress_callback:
578
  progress_callback(0.9, desc="Creating ensemble...")
579
+
 
580
  predictions = [
581
  models['xgboost'].predict_proba(X_test_selected),
582
  models['lightgbm'].predict_proba(X_test_selected),
583
  models['gradientboosting'].predict_proba(X_test_selected),
584
  models['adaboost'].predict_proba(X_test_selected)
585
  ]
586
+
587
  ensemble_pred = np.average(predictions, axis=0, weights=weights)
588
  ensemble_labels = np.argmax(ensemble_pred, axis=1)
589
  ensemble_acc = accuracy_score(y_test, ensemble_labels)
590
+
591
  if progress_callback:
592
  progress_callback(0.95, desc="Saving models...")
593
+
 
594
  _save_models(models, scaler, label_encoder, selected_indices, weights,
595
+ accuracies, ensemble_acc)
596
+
597
  if progress_callback:
598
  progress_callback(1.0, desc="Complete!")
599
+
 
600
  results_df = pd.DataFrame({
601
  'Model': ['XGBoost', 'LightGBM', 'Gradient Boosting', 'AdaBoost', 'Ensemble'],
602
  'Test Accuracy': [
 
607
  ensemble_acc
608
  ]
609
  })
610
+
611
  summary = f"""
612
  ## ✅ Training Complete!
613
 
 
638
 
639
  ⚠️ **Note**: Single train/test split. For more reliable results, use Cross-Validation!
640
  """
641
+
642
  return summary, results_df, ga_history_df, training_log
643
 
644
 
 
646
  """Train all models with given configuration"""
647
  models = {}
648
  accuracies = {}
649
+
 
650
  xgb = XGBClassifier(
651
  n_estimators=config_dict['xgb_n_estimators'],
652
  max_depth=config_dict['xgb_max_depth'],
653
  learning_rate=config_dict['xgb_learning_rate'],
654
+ subsample=config_dict.get('xgb_subsample', 0.8),
655
+ colsample_bytree=config_dict.get('xgb_colsample_bytree', 0.8),
656
+ min_child_weight=config_dict.get('xgb_min_child_weight', 1),
657
+ gamma=config_dict.get('xgb_gamma', 0),
658
  objective='multi:softprob',
659
  num_class=n_classes,
660
  random_state=config.RANDOM_STATE,
 
664
  xgb.fit(X_train, y_train)
665
  models['xgboost'] = xgb
666
  accuracies['xgboost'] = xgb.score(X_test, y_test)
667
+
 
668
  lgbm = LGBMClassifier(
669
  n_estimators=config_dict['lgbm_n_estimators'],
670
  num_leaves=config_dict['lgbm_num_leaves'],
671
  learning_rate=config_dict['lgbm_learning_rate'],
672
+ min_child_samples=config_dict.get('lgbm_min_child_samples', 20),
673
+ subsample=config_dict.get('lgbm_subsample', 0.8),
674
+ colsample_bytree=config_dict.get('lgbm_colsample_bytree', 0.8),
675
+ reg_alpha=config_dict.get('lgbm_reg_alpha', 0),
676
+ reg_lambda=config_dict.get('lgbm_reg_lambda', 0),
677
  objective='multiclass',
678
  num_class=n_classes,
679
  random_state=config.RANDOM_STATE,
680
  n_jobs=-1,
681
+ verbose=-1,
682
+ force_col_wise=True
683
  )
684
  lgbm.fit(X_train, y_train)
685
  models['lightgbm'] = lgbm
686
  accuracies['lightgbm'] = lgbm.score(X_test, y_test)
687
+
 
688
  gb = GradientBoostingClassifier(
689
  n_estimators=config_dict['gb_n_estimators'],
690
  max_depth=config_dict['gb_max_depth'],
691
  learning_rate=config_dict['gb_learning_rate'],
692
+ subsample=config_dict.get('gb_subsample', 0.8),
693
+ min_samples_split=config_dict.get('gb_min_samples_split', 2),
694
+ min_samples_leaf=config_dict.get('gb_min_samples_leaf', 1),
695
  random_state=config.RANDOM_STATE
696
  )
697
  gb.fit(X_train, y_train)
698
  models['gradientboosting'] = gb
699
  accuracies['gradientboosting'] = gb.score(X_test, y_test)
700
+
 
701
  ada = AdaBoostClassifier(
702
  n_estimators=config_dict['ada_n_estimators'],
703
  learning_rate=config_dict['ada_learning_rate'],
704
+ algorithm=config.ADABOOST_ALGORITHM,
705
  random_state=config.RANDOM_STATE
706
  )
707
  ada.fit(X_train, y_train)
708
  models['adaboost'] = ada
709
  accuracies['adaboost'] = ada.score(X_test, y_test)
710
+
711
  return models, accuracies
712
 
713
 
714
+ def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
715
+ progress_callback=None, fold_idx=None, n_folds=None,
716
  base_progress=0, total_steps=1):
717
  """Train all models with default hyperparameters"""
718
  models = {}
719
  accuracies = {}
720
+
721
  if progress_callback and fold_idx:
722
+ progress_callback(base_progress + 0.4/total_steps,
723
+ desc=f"Fold {fold_idx}/{n_folds}: Training XGBoost...")
724
  elif progress_callback:
725
  progress_callback(0.4, desc="Training XGBoost...")
726
+
727
  xgb = XGBClassifier(
728
  n_estimators=150, max_depth=5, learning_rate=0.1,
729
  objective='multi:softprob', num_class=n_classes,
 
732
  xgb.fit(X_train, y_train)
733
  models['xgboost'] = xgb
734
  accuracies['xgboost'] = xgb.score(X_test, y_test)
735
+
736
  if progress_callback and fold_idx:
737
  progress_callback(base_progress + 0.5/total_steps,
738
+ desc=f"Fold {fold_idx}/{n_folds}: Training LightGBM...")
739
  elif progress_callback:
740
  progress_callback(0.5, desc="Training LightGBM...")
741
+
742
  lgbm = LGBMClassifier(
743
  n_estimators=150, num_leaves=40, learning_rate=0.1,
744
  objective='multiclass', num_class=n_classes,
745
+ random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1,
746
+ force_col_wise=True
747
  )
748
  lgbm.fit(X_train, y_train)
749
  models['lightgbm'] = lgbm
750
  accuracies['lightgbm'] = lgbm.score(X_test, y_test)
751
+
752
  if progress_callback and fold_idx:
753
  progress_callback(base_progress + 0.65/total_steps,
754
+ desc=f"Fold {fold_idx}/{n_folds}: Training Gradient Boosting...")
755
  elif progress_callback:
756
  progress_callback(0.65, desc="Training Gradient Boosting...")
757
+
758
  gb = GradientBoostingClassifier(
759
  n_estimators=100, max_depth=4, learning_rate=0.1,
760
  random_state=config.RANDOM_STATE
 
762
  gb.fit(X_train, y_train)
763
  models['gradientboosting'] = gb
764
  accuracies['gradientboosting'] = gb.score(X_test, y_test)
765
+
766
  if progress_callback and fold_idx:
767
  progress_callback(base_progress + 0.8/total_steps,
768
+ desc=f"Fold {fold_idx}/{n_folds}: Training AdaBoost...")
769
  elif progress_callback:
770
  progress_callback(0.8, desc="Training AdaBoost...")
771
+
772
  ada = AdaBoostClassifier(
773
+ n_estimators=100,
774
+ learning_rate=1.0,
775
+ algorithm=config.ADABOOST_ALGORITHM,
776
  random_state=config.RANDOM_STATE
777
  )
778
  ada.fit(X_train, y_train)
779
  models['adaboost'] = ada
780
  accuracies['adaboost'] = ada.score(X_test, y_test)
781
+
782
  return models, accuracies
783
 
784
 
785
+ def _save_models(models, scaler, label_encoder, selected_indices, weights,
786
+ accuracies, ensemble_acc, cv_results=None):
787
  """Save all models and configuration"""
788
  config.WEIGHTS_DIR.mkdir(exist_ok=True)
789
+
 
790
  with open(config.WEIGHTS_DIR / 'xgboost_model.pkl', 'wb') as f:
791
  pickle.dump(models['xgboost'], f)
792
+
793
  with open(config.WEIGHTS_DIR / 'lightgbm_model.pkl', 'wb') as f:
794
  pickle.dump(models['lightgbm'], f)
795
+
796
  with open(config.WEIGHTS_DIR / 'gradientboost_model.pkl', 'wb') as f:
797
  pickle.dump(models['gradientboosting'], f)
798
+
799
  with open(config.WEIGHTS_DIR / 'adaboost_model.pkl', 'wb') as f:
800
  pickle.dump(models['adaboost'], f)
801
+
 
802
  with open(config.WEIGHTS_DIR / 'scaler.pkl', 'wb') as f:
803
  pickle.dump(scaler, f)
804
+
805
  with open(config.WEIGHTS_DIR / 'label_encoder.pkl', 'wb') as f:
806
  pickle.dump(label_encoder, f)
807
+
 
808
  model_config = {
809
  'selected_features': selected_indices.tolist(),
810
  'ensemble_weights': weights.tolist(),
 
818
  'ensemble': float(ensemble_acc)
819
  }
820
  }
821
+
 
822
  if cv_results is not None:
823
  model_config['cv_results'] = cv_results
824
+ model_config['training_mode'] = 'cross_validation'
825
  else:
826
+ model_config['training_mode'] = 'single_split'
827
+
828
  with open(config.WEIGHTS_DIR / 'config.json', 'w') as f:
829
+ json.dump(model_config, f, indent=2)
src/ui/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (492 Bytes). View file
 
src/ui/__pycache__/tab1_extraction.cpython-311.pyc ADDED
Binary file (19.9 kB). View file
 
src/ui/__pycache__/tab2_training.cpython-311.pyc ADDED
Binary file (13.4 kB). View file
 
src/ui/__pycache__/tab3_prediction.cpython-311.pyc ADDED
Binary file (6.56 kB). View file
 
src/ui/tab1_extraction.py CHANGED
@@ -1,19 +1,66 @@
1
  """
2
- Tab 1: Feature Extraction UI
3
  """
4
 
5
  import gradio as gr
6
  import pandas as pd
7
  from pathlib import Path
 
8
 
9
  from src.data_loader import scan_dataset_directory, extract_emotion_from_filename, extract_actor_from_filename, get_dataset_statistics
10
- from src.feature_extraction import extract_features
11
  from src.utils import create_waveform_plot, create_spectrogram_plot
12
  import config
13
 
14
 
15
- def extract_dataset_features(progress=gr.Progress()):
16
- """Extract features from dataset in data/ directory"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  try:
19
  progress(0, desc="Scanning dataset directory...")
@@ -21,50 +68,34 @@ def extract_dataset_features(progress=gr.Progress()):
21
  audio_files, error = scan_dataset_directory()
22
 
23
  if error:
24
- return f"""
25
- ## ❌ Error: {error}
26
-
27
- **Expected structure:**
28
- data/
29
- └── RAVDESS/
30
- └── audio_speech_actors_01-24/
31
- ├── Actor_01/
32
- │ ├── 03-01-01-01-01-01-01.wav
33
- │ └── ...
34
- ├── Actor_02/
35
- └── ...
36
-
37
- **Please ensure dataset is in correct location.**
38
- """, None, None
39
 
40
  if len(audio_files) == 0:
41
  return "❌ No audio files found", None, None
42
 
 
 
43
  progress(
44
- 0.05, desc=f"Found {len(audio_files)} files. Extracting features...")
45
 
46
  data_list = []
47
  failed_files = []
48
  total_files = len(audio_files)
49
 
50
  for idx, audio_file in enumerate(audio_files):
51
- progress(
52
- 0.05 + (idx + 1) / total_files * 0.90,
53
- desc=f"Processing {idx + 1}/{total_files}: {audio_file.name}"
54
- )
55
 
56
  try:
57
- features, _, _ = extract_features(str(audio_file))
 
 
58
  filename = audio_file.name
59
  emotion = extract_emotion_from_filename(filename)
60
  actor = extract_actor_from_filename(filename)
61
 
62
- row = {
63
- 'file_path': str(audio_file),
64
- 'filename': filename,
65
- 'actor': actor,
66
- 'emotion': emotion
67
- }
68
 
69
  for i, feat in enumerate(features):
70
  row[f'feature_{i}'] = feat
@@ -81,29 +112,65 @@ data/
81
  progress(0.95, desc="Saving to CSV...")
82
 
83
  df = pd.DataFrame(data_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  df.to_csv(config.FEATURES_CSV, index=False)
85
 
 
 
 
 
86
  progress(1.0, desc="Complete!")
87
 
88
  stats = get_dataset_statistics(audio_files)
89
 
90
- summary = f"""
91
- ## Feature Extraction Complete!
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  ### 📊 Statistics:
94
  - **Total Files**: {stats['total_files']}
95
  - **Successfully Processed**: {len(df)}
96
  - **Failed**: {len(failed_files)}
97
- - **Features per File**: {config.N_FEATURES}
98
- - **Output**: `{config.FEATURES_CSV}`
 
99
 
100
  ### 🎭 Emotion Distribution:
101
  {df['emotion'].value_counts().to_string()}
102
 
103
  ### 👥 Actors: {stats['n_actors']}
104
 
105
- ✅ **Ready for training! Go to Tab 2.**
106
- """
107
 
108
  if failed_files:
109
  summary += f"\n\n### ⚠️ Failed Files ({len(failed_files)}):\n"
@@ -124,56 +191,66 @@ def check_dataset_status():
124
  audio_files, error = scan_dataset_directory()
125
 
126
  if error:
127
- return f"""
128
- ## ⚠️ Dataset Not Found
129
-
130
- {error}
131
-
132
- **Please upload RAVDESS dataset to the correct location.**
133
- """
134
 
135
  stats = get_dataset_statistics(audio_files)
136
 
137
- status = f"""
138
- ## Dataset Found!
139
 
140
- ### 📊 Statistics:
141
- - **Total Files**: {stats['total_files']}
142
- - **Location**: `{config.DATA_DIR}`
 
 
 
143
 
144
- ### 🎭 Emotions:
145
- """
146
 
147
  for emotion, count in sorted(stats['emotion_counts'].items()):
148
  status += f"- **{emotion.capitalize()}**: {count} files\n"
149
 
150
- status += f"""
151
 
152
- ### 👥 Actors: {stats['n_actors']}
153
-
154
- **Click "🔊 Extract Features" to process the dataset.**
155
- """
156
 
157
  return status
158
 
159
 
160
- def preview_single_audio(audio_file):
161
- """Preview single audio file"""
162
  if audio_file is None:
163
  return "Please upload an audio file", None, None
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  try:
166
- features, y, sr = extract_features(audio_file)
 
167
 
168
- summary = f"""
169
- ## 🔍 Single File Preview
 
 
 
170
 
171
- - **File**: {Path(audio_file).name}
172
- - **Features**: {config.N_FEATURES}
173
- - **Sample Rate**: {sr} Hz
174
- - **Duration**: {len(y)/sr:.2f}s
175
- - **Emotion**: {extract_emotion_from_filename(Path(audio_file).name)}
176
- """
177
 
178
  waveform = create_waveform_plot(y, sr)
179
  spectrogram = create_spectrogram_plot(y, sr)
@@ -186,41 +263,58 @@ def preview_single_audio(audio_file):
186
 
187
 
188
  def create_tab1():
189
- """Create Tab 1: Feature Extraction"""
190
 
191
  with gr.Tab("1️⃣ Feature Extraction"):
192
- gr.Markdown("""
193
- ## 📁 Extract Features from Dataset
194
-
195
- Automatically processes all audio files in `data/RAVDESS/audio_speech_actors_01-24/`
196
- """)
197
 
198
  with gr.Row():
199
  with gr.Column(scale=1):
200
- check_btn = gr.Button("🔄 Check Dataset Status", size="sm")
201
 
202
- gr.Markdown("---")
 
 
 
 
 
 
203
 
204
- extract_btn = gr.Button(
205
- "🔊 Extract Features",
206
- variant="primary",
207
- size="lg"
208
- )
209
-
210
- gr.Markdown("""
211
- ---
212
- ### 🔍 Preview Single Audio
213
-
214
- Test feature extraction on one file.
215
- """)
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  preview_audio = gr.Audio(
218
- sources=["upload"],
219
- type="filepath",
220
- label="Upload Single File"
221
- )
222
  preview_btn = gr.Button("Preview Features")
223
 
 
 
224
  with gr.Column(scale=2):
225
  output_text = gr.Markdown()
226
  preview_df = gr.Dataframe(label="Dataset Preview")
@@ -230,19 +324,8 @@ def create_tab1():
230
  waveform_plot = gr.Plot(label="Waveform")
231
  spectrogram_plot = gr.Plot(label="Spectrogram")
232
 
233
- # Event handlers
234
- check_btn.click(
235
- fn=check_dataset_status,
236
- outputs=[output_text]
237
- )
238
-
239
- extract_btn.click(
240
- fn=extract_dataset_features,
241
- outputs=[output_text, preview_df, emotion_dist]
242
- )
243
-
244
- preview_btn.click(
245
- fn=preview_single_audio,
246
- inputs=[preview_audio],
247
- outputs=[output_text, waveform_plot, spectrogram_plot]
248
- )
 
1
  """
2
+ Tab 1: Feature Extraction UI with Feature Type Selection and MFCC Count
3
  """
4
 
5
  import gradio as gr
6
  import pandas as pd
7
  from pathlib import Path
8
+ import json
9
 
10
  from src.data_loader import scan_dataset_directory, extract_emotion_from_filename, extract_actor_from_filename, get_dataset_statistics
11
+ from src.feature_extraction import extract_features, get_feature_count
12
  from src.utils import create_waveform_plot, create_spectrogram_plot
13
  import config
14
 
15
 
16
+ def calculate_feature_count(zcr, chroma, mfcc, rms, mel, n_mfcc):
17
+ """Calculate total feature count based on selections"""
18
+ feature_types = []
19
+ if zcr:
20
+ feature_types.append('zcr')
21
+ if chroma:
22
+ feature_types.append('chroma')
23
+ if mfcc:
24
+ feature_types.append('mfcc')
25
+ if rms:
26
+ feature_types.append('rms')
27
+ if mel:
28
+ feature_types.append('mel')
29
+
30
+ total = get_feature_count(feature_types, n_mfcc=n_mfcc)
31
+
32
+ breakdown = []
33
+ if zcr:
34
+ breakdown.append("ZCR: 1")
35
+ if chroma:
36
+ breakdown.append("Chroma: 12")
37
+ if mfcc:
38
+ breakdown.append(f"MFCC: {n_mfcc}")
39
+ if rms:
40
+ breakdown.append("RMS: 1")
41
+ if mel:
42
+ breakdown.append("Mel: 128")
43
+
44
+ return f"**Total Features: {total}**\n\n*Breakdown: {' + '.join(breakdown)}*"
45
+
46
+
47
+ def extract_dataset_features(zcr, chroma, mfcc, rms, mel, n_mfcc, progress=gr.Progress()):
48
+ """Extract features from dataset with selected feature types"""
49
+
50
+ feature_types = []
51
+ if zcr:
52
+ feature_types.append('zcr')
53
+ if chroma:
54
+ feature_types.append('chroma')
55
+ if mfcc:
56
+ feature_types.append('mfcc')
57
+ if rms:
58
+ feature_types.append('rms')
59
+ if mel:
60
+ feature_types.append('mel')
61
+
62
+ if len(feature_types) == 0:
63
+ return "❌ Please select at least one feature type!", None, None
64
 
65
  try:
66
  progress(0, desc="Scanning dataset directory...")
 
68
  audio_files, error = scan_dataset_directory()
69
 
70
  if error:
71
+ return f"## ❌ Error: {error}\n\n**Please ensure dataset is in correct location.**", None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  if len(audio_files) == 0:
74
  return "❌ No audio files found", None, None
75
 
76
+ total_features = get_feature_count(feature_types, n_mfcc=n_mfcc)
77
+
78
  progress(
79
+ 0.05, desc=f"Found {len(audio_files)} files. Extracting {total_features} features...")
80
 
81
  data_list = []
82
  failed_files = []
83
  total_files = len(audio_files)
84
 
85
  for idx, audio_file in enumerate(audio_files):
86
+ progress(0.05 + (idx + 1) / total_files * 0.90,
87
+ desc=f"Processing {idx + 1}/{total_files}: {audio_file.name}")
 
 
88
 
89
  try:
90
+ features, _, _, feature_info = extract_features(
91
+ str(audio_file), feature_types=feature_types, n_mfcc=n_mfcc)
92
+
93
  filename = audio_file.name
94
  emotion = extract_emotion_from_filename(filename)
95
  actor = extract_actor_from_filename(filename)
96
 
97
+ row = {'file_path': str(
98
+ audio_file), 'filename': filename, 'actor': actor, 'emotion': emotion}
 
 
 
 
99
 
100
  for i, feat in enumerate(features):
101
  row[f'feature_{i}'] = feat
 
112
  progress(0.95, desc="Saving to CSV...")
113
 
114
  df = pd.DataFrame(data_list)
115
+
116
+ extraction_config = {
117
+ 'feature_types': feature_types,
118
+ 'n_mfcc': n_mfcc if 'mfcc' in feature_types else 0,
119
+ 'total_features': total_features,
120
+ 'feature_breakdown': {
121
+ 'zcr': 1 if 'zcr' in feature_types else 0,
122
+ 'chroma': 12 if 'chroma' in feature_types else 0,
123
+ 'mfcc': n_mfcc if 'mfcc' in feature_types else 0,
124
+ 'rms': 1 if 'rms' in feature_types else 0,
125
+ 'mel': 128 if 'mel' in feature_types else 0
126
+ },
127
+ 'n_samples': len(df),
128
+ 'extraction_date': pd.Timestamp.now().isoformat()
129
+ }
130
+
131
  df.to_csv(config.FEATURES_CSV, index=False)
132
 
133
+ config_file = Path(config.FEATURES_CSV).with_suffix('.json')
134
+ with open(config_file, 'w') as f:
135
+ json.dump(extraction_config, f, indent=2)
136
+
137
  progress(1.0, desc="Complete!")
138
 
139
  stats = get_dataset_statistics(audio_files)
140
 
141
+ feature_summary_lines = []
142
+ if 'zcr' in feature_types:
143
+ feature_summary_lines.append("- **ZCR**: 1 feature")
144
+ if 'chroma' in feature_types:
145
+ feature_summary_lines.append("- **CHROMA**: 12 features")
146
+ if 'mfcc' in feature_types:
147
+ feature_summary_lines.append(f"- **MFCC**: {n_mfcc} features")
148
+ if 'rms' in feature_types:
149
+ feature_summary_lines.append("- **RMS**: 1 feature")
150
+ if 'mel' in feature_types:
151
+ feature_summary_lines.append("- **MEL**: 128 features")
152
+
153
+ feature_summary = "\n".join(feature_summary_lines)
154
+
155
+ summary = f"""## ✅ Feature Extraction Complete!
156
+
157
+ ### 🎨 Selected Feature Types:
158
+ {feature_summary}
159
 
160
  ### 📊 Statistics:
161
  - **Total Files**: {stats['total_files']}
162
  - **Successfully Processed**: {len(df)}
163
  - **Failed**: {len(failed_files)}
164
+ - **Features per File**: {total_features}
165
+ - **Output CSV**: `{config.FEATURES_CSV}`
166
+ - **Config File**: `{config_file}`
167
 
168
  ### 🎭 Emotion Distribution:
169
  {df['emotion'].value_counts().to_string()}
170
 
171
  ### 👥 Actors: {stats['n_actors']}
172
 
173
+ ✅ **Ready for training! Go to Tab 2.**"""
 
174
 
175
  if failed_files:
176
  summary += f"\n\n### ⚠️ Failed Files ({len(failed_files)}):\n"
 
191
  audio_files, error = scan_dataset_directory()
192
 
193
  if error:
194
+ return f"## ⚠️ Dataset Not Found\n\n{error}\n\n**Please upload RAVDESS dataset to the correct location.**"
 
 
 
 
 
 
195
 
196
  stats = get_dataset_statistics(audio_files)
197
 
198
+ config_file = Path(config.FEATURES_CSV).with_suffix('.json')
199
+ existing_config = None
200
 
201
+ if config_file.exists():
202
+ try:
203
+ with open(config_file, 'r') as f:
204
+ existing_config = json.load(f)
205
+ except:
206
+ pass
207
 
208
+ status = f"## ✅ Dataset Found!\n\n### 📊 Statistics:\n- **Total Files**: {stats['total_files']}\n- **Location**: `{config.DATA_DIR}`\n\n### 🎭 Emotions:\n"
 
209
 
210
  for emotion, count in sorted(stats['emotion_counts'].items()):
211
  status += f"- **{emotion.capitalize()}**: {count} files\n"
212
 
213
+ status += f"\n### 👥 Actors: {stats['n_actors']}\n"
214
 
215
+ if existing_config:
216
+ status += f"\n---\n\n### 📋 Previous Extraction Found:\n- **Feature Types**: {', '.join(existing_config.get('feature_types', []))}\n- **Total Features**: {existing_config.get('total_features', 'Unknown')}\n- **MFCC Count**: {existing_config.get('n_mfcc', 'N/A')}\n- **Samples**: {existing_config.get('n_samples', 'Unknown')}\n\n**Note**: Re-extracting will overwrite previous features."
217
+ else:
218
+ status += '\n**Select feature types and click "🔊 Extract Features".**'
219
 
220
  return status
221
 
222
 
223
+ def preview_single_audio(audio_file, zcr, chroma, mfcc, rms, mel, n_mfcc):
224
+ """Preview single audio file with selected features"""
225
  if audio_file is None:
226
  return "Please upload an audio file", None, None
227
 
228
+ feature_types = []
229
+ if zcr:
230
+ feature_types.append('zcr')
231
+ if chroma:
232
+ feature_types.append('chroma')
233
+ if mfcc:
234
+ feature_types.append('mfcc')
235
+ if rms:
236
+ feature_types.append('rms')
237
+ if mel:
238
+ feature_types.append('mel')
239
+
240
+ if len(feature_types) == 0:
241
+ return "❌ Please select at least one feature type!", None, None
242
+
243
  try:
244
+ features, y, sr, feature_info = extract_features(
245
+ audio_file, feature_types=feature_types, n_mfcc=n_mfcc)
246
 
247
+ feature_breakdown_lines = []
248
+ for ftype, count in feature_info['counts'].items():
249
+ feature_breakdown_lines.append(
250
+ f"- **{ftype.upper()}**: {count} features")
251
+ feature_breakdown = "\n".join(feature_breakdown_lines)
252
 
253
+ summary = f"## 🔍 Single File Preview\n\n- **File**: {Path(audio_file).name}\n- **Sample Rate**: {sr} Hz\n- **Duration**: {len(y)/sr:.2f}s\n- **Emotion**: {extract_emotion_from_filename(Path(audio_file).name)}\n\n### 🎨 Extracted Features:\n{feature_breakdown}\n\n**Total Features**: {feature_info['total']}"
 
 
 
 
 
254
 
255
  waveform = create_waveform_plot(y, sr)
256
  spectrogram = create_spectrogram_plot(y, sr)
 
263
 
264
 
265
  def create_tab1():
266
+ """Create Tab 1: Feature Extraction with Feature Type Selection"""
267
 
268
  with gr.Tab("1️⃣ Feature Extraction"):
269
+ gr.Markdown(
270
+ "## 📁 Extract Features from Dataset\n\n**Select which feature types to extract:**")
 
 
 
271
 
272
  with gr.Row():
273
  with gr.Column(scale=1):
274
+ gr.Markdown("### 🎨 Feature Types")
275
 
276
+ with gr.Group():
277
+ zcr_check = gr.Checkbox(
278
+ label="🌊 ZCR - Zero Crossing Rate (1 feature)", value=True, info="Signal sign change frequency")
279
+ chroma_check = gr.Checkbox(
280
+ label="🎵 Chroma STFT (12 features)", value=True, info="Pitch class distribution")
281
+ mfcc_check = gr.Checkbox(label="🎤 MFCC (20-40 features, configurable below)",
282
+ value=True, info="Mel-frequency cepstral coefficients - MOST IMPORTANT")
283
 
284
+ n_mfcc_slider = gr.Slider(minimum=config.MFCC_MIN, maximum=config.MFCC_MAX, value=config.MFCC_DEFAULT, step=1,
285
+ label="Number of MFCC Coefficients", info="More MFCC = more detail but slower extraction", visible=True)
286
+
287
+ rms_check = gr.Checkbox(
288
+ label="📊 RMS Energy (1 feature)", value=True, info="Signal amplitude/loudness")
289
+ mel_check = gr.Checkbox(
290
+ label="🎹 Mel Spectrogram (128 features)", value=True, info="Frequency distribution over time")
291
+
292
+ feature_count_display = gr.Markdown(calculate_feature_count(
293
+ True, True, True, True, True, config.MFCC_DEFAULT))
 
 
294
 
295
+ for control in [zcr_check, chroma_check, mfcc_check, rms_check, mel_check, n_mfcc_slider]:
296
+ control.change(fn=calculate_feature_count, inputs=[
297
+ zcr_check, chroma_check, mfcc_check, rms_check, mel_check, n_mfcc_slider], outputs=[feature_count_display])
298
+
299
+ def toggle_mfcc_slider(mfcc_enabled):
300
+ return gr.update(visible=mfcc_enabled)
301
+
302
+ mfcc_check.change(fn=toggle_mfcc_slider, inputs=[
303
+ mfcc_check], outputs=[n_mfcc_slider])
304
+
305
+ gr.Markdown("---")
306
+ check_btn = gr.Button("🔄 Check Dataset Status", size="sm")
307
+ gr.Markdown("---")
308
+ extract_btn = gr.Button(
309
+ "🔊 Extract Features", variant="primary", size="lg")
310
+ gr.Markdown(
311
+ "---\n### 🔍 Preview Single Audio\n\nTest feature extraction on one file.")
312
  preview_audio = gr.Audio(
313
+ sources=["upload"], type="filepath", label="Upload Single File")
 
 
 
314
  preview_btn = gr.Button("Preview Features")
315
 
316
+ gr.Markdown("---\n### 💡 Feature Selection Tips\n\n**All Features (162):**\n- MFCC: 20 (default)\n- Most balanced\n- ~87-90% accuracy\n\n**MFCC Only (20):**\n- Fast extraction\n- Good baseline\n- ~80-85% accuracy\n\n---\n\n### 📋 Output Files:\n- **CSV**: `features_ravdess.csv` (data)\n- **JSON**: `features_ravdess.json` (config)")
317
+
318
  with gr.Column(scale=2):
319
  output_text = gr.Markdown()
320
  preview_df = gr.Dataframe(label="Dataset Preview")
 
324
  waveform_plot = gr.Plot(label="Waveform")
325
  spectrogram_plot = gr.Plot(label="Spectrogram")
326
 
327
+ check_btn.click(fn=check_dataset_status, outputs=[output_text])
328
+ extract_btn.click(fn=extract_dataset_features, inputs=[
329
+ zcr_check, chroma_check, mfcc_check, rms_check, mel_check, n_mfcc_slider], outputs=[output_text, preview_df, emotion_dist])
330
+ preview_btn.click(fn=preview_single_audio, inputs=[preview_audio, zcr_check, chroma_check, mfcc_check,
331
+ rms_check, mel_check, n_mfcc_slider], outputs=[output_text, waveform_plot, spectrogram_plot])
 
 
 
 
 
 
 
 
 
 
 
src/ui/tab2_training.py CHANGED
@@ -1,30 +1,85 @@
1
  """
2
- Tab 2: Model Training UI
3
  """
4
 
5
  import gradio as gr
6
  from src.training import train_models_with_ga
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
 
9
  def create_tab2():
10
  """Create Tab 2: Model Training"""
11
-
12
  with gr.Tab("2️⃣ Model Training"):
13
  gr.Markdown("""
14
  ## 🧬 Train Models with Genetic Algorithm
15
 
16
- Optimize feature selection, hyperparameters, and ensemble weights.
17
  """)
18
-
19
  with gr.Row():
20
  with gr.Column(scale=1):
21
- # Cross-Validation Toggle
22
  use_cv = gr.Checkbox(
23
  label="🔄 Use K-Fold Cross-Validation",
24
  value=False,
25
  info="More reliable evaluation but slower (recommended for final training)"
26
  )
27
-
28
  n_folds = gr.Slider(
29
  minimum=3,
30
  maximum=10,
@@ -34,34 +89,63 @@ def create_tab2():
34
  info="More folds = more reliable but slower",
35
  visible=False
36
  )
37
-
38
  gr.Markdown("---")
39
-
40
- # GA Toggle
41
  use_ga = gr.Checkbox(
42
  label="🧬 Use Genetic Algorithm Optimization",
43
  value=True,
44
- info="GA optimizes features + hyperparameters + ensemble weights"
45
  )
46
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  ga_generations = gr.Slider(
48
  minimum=5,
49
  maximum=50,
50
- value=20,
51
  step=5,
52
  label="GA Generations",
53
  info="More generations = better optimization but slower"
54
  )
55
-
56
  ga_population = gr.Slider(
57
  minimum=5,
58
  maximum=30,
59
- value=15,
60
  step=5,
61
  label="GA Population Size",
62
  info="Larger population = more exploration but slower"
63
  )
64
-
65
  n_jobs = gr.Slider(
66
  minimum=1,
67
  maximum=8,
@@ -70,78 +154,113 @@ def create_tab2():
70
  label="Parallel Jobs",
71
  info="Number of CPU cores (2 for free tier, 4+ for better hardware)"
72
  )
73
-
74
- # Toggle visibility of CV and GA parameters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  def toggle_cv_params(use_cv_val):
76
  return gr.update(visible=use_cv_val)
77
-
78
  def toggle_ga_params(use_ga_val):
 
 
 
79
  return (
80
- gr.update(visible=use_ga_val),
81
- gr.update(visible=use_ga_val),
82
- gr.update(visible=use_ga_val)
83
  )
84
-
85
  use_cv.change(
86
  fn=toggle_cv_params,
87
  inputs=[use_cv],
88
  outputs=[n_folds]
89
  )
90
-
91
  use_ga.change(
92
  fn=toggle_ga_params,
93
  inputs=[use_ga],
94
- outputs=[ga_generations, ga_population, n_jobs]
 
95
  )
96
-
 
 
 
 
 
 
97
  gr.Markdown("---")
98
-
99
  train_btn = gr.Button(
100
  "🚀 Start Training",
101
  variant="primary",
102
  size="lg"
103
  )
104
-
105
  gr.Markdown("""
106
  ### 🔬 Training Modes:
107
 
108
- **Single Split (Fast):**
109
- - Quick results (15-30 min with GA)
110
- - Good for experimentation
111
- - ⚠️ Less reliable estimate
112
-
113
- **K-Fold CV (Recommended):**
114
- - ✓ Reliable accuracy estimate
115
- - ✓ Mean ± Std reported
116
- - ✓ Detects overfitting
117
- - ⚠️ Slower (5x longer)
118
 
119
- ### 🧬 GA Optimizations:
120
- - **Parallel Evaluation**: 2-4x speedup
121
- - ✅ **Early Stopping**: Auto-stop when converged
122
- - **Real-time Logging**: Progress details
123
- - **Feature Selection**: 80 best from 162
124
 
125
- ### ⏱️ Estimated Time:
 
 
 
126
 
127
- **Single Split:**
128
- - With GA (Parallel): 15-30 minutes
129
- - Without GA: 5-10 minutes
130
 
131
- **5-Fold CV:**
132
- - With GA (Parallel): 75-150 minutes
133
- - Without GA: 25-50 minutes
134
 
135
- **10-Fold CV:**
136
- - With GA (Parallel): 150-300 minutes
137
- - Without GA: 50-100 minutes
138
 
139
- ---
 
 
140
 
141
- 💡 **Tip**: Start with Single Split for quick testing,
142
- then use 5-Fold CV for final model!
 
143
  """)
144
-
145
  with gr.Column(scale=2):
146
  training_output = gr.Markdown()
147
  results_table = gr.Dataframe(
@@ -152,7 +271,7 @@ def create_tab2():
152
  label="GA Evolution History / CV Statistics",
153
  headers=None
154
  )
155
-
156
  with gr.Accordion("📜 Detailed Training Log", open=False):
157
  training_log = gr.Textbox(
158
  label="Training Log",
@@ -161,119 +280,59 @@ def create_tab2():
161
  interactive=False,
162
  show_copy_button=True
163
  )
164
-
165
- # Information boxes
166
- with gr.Accordion("ℹ️ Understanding Cross-Validation", open=False):
167
  gr.Markdown("""
168
- ## 🔄 What is K-Fold Cross-Validation?
169
-
170
- ### How it works:
171
-
172
- 1. **Split data into K folds** (e.g., 5 folds)
173
- 2. **Train K times**, each time using:
174
- - K-1 folds for training
175
- - 1 fold for testing
176
- 3. **Average results** across all folds
177
 
178
- ### Example with 5-Fold CV:
179
- Fold 1: Train on [2,3,4,5], Test on [1]
180
- Fold 2: Train on [1,3,4,5], Test on [2]
181
- Fold 3: Train on [1,2,4,5], Test on [3]
182
- Fold 4: Train on [1,2,3,5], Test on [4]
183
- Fold 5: Train on [1,2,3,4], Test on [5]
184
-
185
- Final Result: Average of all 5 test accuracies
186
-
187
- ### Why use CV?
188
-
189
- ✅ **More Reliable**: Every sample is tested exactly once
190
 
191
- **Variance Estimate**: Get Mean ± Std instead of single number
 
 
 
 
192
 
193
- **Better Generalization**: Uses all data for both training and testing
194
 
195
- **Detect Overfitting**: High std = unstable model
 
 
 
 
 
196
 
197
- ### Interpreting Results:
198
 
199
- **Good Model:**
200
- - Mean: 90.1%
201
- - Std: 0.5%
202
- - Interpretation: Stable, reliable performance ✓
203
 
204
- **Overfitting Model:**
205
- - Mean: 92.3%
206
- - Std: 5.2%
207
- - Interpretation: Unstable, unreliable ✗
208
 
209
- **Underfitting Model:**
210
- - Mean: 75.0%
211
- - Std: 0.3%
212
- - Interpretation: Stable but poor performance ✗
213
 
214
- ### When to use what?
215
-
216
- | Scenario | Recommendation |
217
  |----------|---------------|
218
- | Quick experiment | Single Split |
219
- | Final model | 5-Fold CV |
220
- | Small dataset (<1000 samples) | 10-Fold CV |
221
- | Large dataset (>100k samples) | Single Split |
222
- | Publication/Research | 5 or 10-Fold CV |
223
- """)
224
-
225
- with gr.Accordion("🧬 Understanding Genetic Algorithm", open=False):
226
- gr.Markdown("""
227
- ## 🧬 What does GA optimize?
228
-
229
- ### 1. Feature Selection (80/162)
230
- - Finds best combination of audio features
231
- - Removes redundant/noisy features
232
- - Reduces overfitting
233
-
234
- ### 2. Hyperparameters
235
- - **XGBoost**: n_estimators, max_depth, learning_rate
236
- - **LightGBM**: n_estimators, num_leaves, learning_rate
237
- - **Gradient Boosting**: n_estimators, max_depth, learning_rate
238
- - **AdaBoost**: n_estimators, learning_rate
239
-
240
- ### 3. Ensemble Weights
241
- - Optimal weights for combining models
242
- - NOT equal weights [0.25, 0.25, 0.25, 0.25]
243
- - NOT accuracy-based weights
244
- - Learned from validation performance
245
-
246
- ### How GA works:
247
- 1. Create random population (15 solutions)
248
- 2. Evaluate fitness (train models, measure accuracy)
249
- 3. Select best solutions (tournament selection)
250
- 4. Create offspring (crossover + mutation)
251
- 5. Repeat for 20 generations
252
- 6. Return best solution found
253
-
254
- ### Why GA vs Grid Search?
255
-
256
- **Grid Search:**
257
- - Tests every combination
258
- - Very slow (days for this problem)
259
- - Guarantees finding best in grid
260
-
261
- **Genetic Algorithm:**
262
- - Intelligent search (evolutionary)
263
- - Fast (minutes to hours)
264
- - Finds near-optimal solution
265
- - Can optimize multiple objectives
266
-
267
- ### Typical Improvement:
268
-
269
- - **Without GA**: 82-85% accuracy
270
- - **With GA**: 87-90% accuracy
271
- - **Gain**: +5% absolute improvement
272
  """)
273
-
274
- # Event handler
275
  train_btn.click(
276
  fn=train_models_with_ga,
277
- inputs=[use_ga, use_cv, n_folds, ga_generations, ga_population, n_jobs],
278
- outputs=[training_output, results_table, ga_history_table, training_log]
279
- )
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Tab 2: Model Training UI with K-Fold Cross-Validation and GA Feature Selection
3
  """
4
 
5
  import gradio as gr
6
  from src.training import train_models_with_ga
7
+ import config
8
+
9
+
10
+ def calculate_ga_feature_info(optimize_features, n_features_select, total_features=162):
11
+ """Calculate and display GA feature selection info"""
12
+ if optimize_features:
13
+ percentage = (n_features_select / total_features *
14
+ 100) if total_features > 0 else 0
15
+ return f"""
16
+ ### 🧬 GA Feature Selection: **ENABLED**
17
+
18
+ GA will optimize:
19
+ 1. **Which specific features to use**: {n_features_select}/{total_features} ({percentage:.1f}%)
20
+ 2. **Model hyperparameters** (all 4 models)
21
+ 3. **Ensemble weights**
22
+
23
+ **Search Space:**
24
+ - Feature combinations: C({total_features}, {n_features_select}) = Very Large!
25
+ - Plus hyperparameter combinations
26
+ - Total optimization space: **MASSIVE**
27
+
28
+ **Expected:** GA will find optimal feature subset + model configurations
29
+ """
30
+ else:
31
+ return f"""
32
+ ### 🧬 GA Feature Selection: **DISABLED**
33
+
34
+ GA will optimize:
35
+ - **Model hyperparameters ONLY** (all 4 models)
36
+ - **Ensemble weights**
37
+
38
+ **Note:** All {total_features} extracted features will be used (no feature selection)
39
+
40
+ This is faster but may include noisy/redundant features.
41
+ """
42
+
43
+
44
+ def update_feature_slider_max(csv_path='features_ravdess.csv'):
45
+ """Update slider maximum based on extracted features"""
46
+ import pandas as pd
47
+ import os
48
+
49
+ if not os.path.exists(csv_path):
50
+ return gr.update(maximum=162, value=100)
51
+
52
+ try:
53
+ df = pd.read_csv(csv_path)
54
+ feature_cols = [col for col in df.columns if col.startswith(
55
+ 'feature_') and col.replace('feature_', '').isdigit()]
56
+ n_features = len(feature_cols)
57
+
58
+ default_select = min(100, int(n_features * 0.7))
59
+
60
+ return gr.update(maximum=n_features, value=default_select, label=f"Features to Select (Max: {n_features})")
61
+ except:
62
+ return gr.update(maximum=162, value=100)
63
 
64
 
65
  def create_tab2():
66
  """Create Tab 2: Model Training"""
67
+
68
  with gr.Tab("2️⃣ Model Training"):
69
  gr.Markdown("""
70
  ## 🧬 Train Models with Genetic Algorithm
71
 
72
+ Optimize hyperparameters and optionally feature selection.
73
  """)
74
+
75
  with gr.Row():
76
  with gr.Column(scale=1):
 
77
  use_cv = gr.Checkbox(
78
  label="🔄 Use K-Fold Cross-Validation",
79
  value=False,
80
  info="More reliable evaluation but slower (recommended for final training)"
81
  )
82
+
83
  n_folds = gr.Slider(
84
  minimum=3,
85
  maximum=10,
 
89
  info="More folds = more reliable but slower",
90
  visible=False
91
  )
92
+
93
  gr.Markdown("---")
94
+
 
95
  use_ga = gr.Checkbox(
96
  label="🧬 Use Genetic Algorithm Optimization",
97
  value=True,
98
+ info="GA optimizes hyperparameters + optionally features"
99
  )
100
+
101
+ optimize_features = gr.Checkbox(
102
+ label="✨ GA Optimize Feature Selection",
103
+ value=True,
104
+ info="Let GA select best feature subset (recommended)"
105
+ )
106
+
107
+ n_features_select = gr.Slider(
108
+ minimum=10,
109
+ maximum=162,
110
+ value=100,
111
+ step=5,
112
+ label="Features to Select (Max: 162)",
113
+ info="Number of features GA will select from extracted features",
114
+ visible=True
115
+ )
116
+
117
+ update_slider_btn = gr.Button(
118
+ "🔄 Update from Extracted Features",
119
+ size="sm",
120
+ visible=True
121
+ )
122
+
123
+ update_slider_btn.click(
124
+ fn=update_feature_slider_max,
125
+ inputs=[],
126
+ outputs=[n_features_select]
127
+ )
128
+
129
+ gr.Markdown("---")
130
+
131
  ga_generations = gr.Slider(
132
  minimum=5,
133
  maximum=50,
134
+ value=30,
135
  step=5,
136
  label="GA Generations",
137
  info="More generations = better optimization but slower"
138
  )
139
+
140
  ga_population = gr.Slider(
141
  minimum=5,
142
  maximum=30,
143
+ value=20,
144
  step=5,
145
  label="GA Population Size",
146
  info="Larger population = more exploration but slower"
147
  )
148
+
149
  n_jobs = gr.Slider(
150
  minimum=1,
151
  maximum=8,
 
154
  label="Parallel Jobs",
155
  info="Number of CPU cores (2 for free tier, 4+ for better hardware)"
156
  )
157
+
158
+ ga_feature_info = gr.Markdown(
159
+ calculate_ga_feature_info(True, 100, 162)
160
+ )
161
+
162
+ def update_ga_info_wrapper(opt_feat, n_feat):
163
+ import pandas as pd
164
+ import os
165
+ total = 162
166
+ if os.path.exists(config.FEATURES_CSV):
167
+ try:
168
+ df = pd.read_csv(config.FEATURES_CSV)
169
+ feature_cols = [col for col in df.columns if col.startswith(
170
+ 'feature_') and col.replace('feature_', '').isdigit()]
171
+ total = len(feature_cols)
172
+ except:
173
+ pass
174
+ return calculate_ga_feature_info(opt_feat, n_feat, total)
175
+
176
+ optimize_features.change(
177
+ fn=update_ga_info_wrapper,
178
+ inputs=[optimize_features, n_features_select],
179
+ outputs=[ga_feature_info]
180
+ )
181
+
182
+ n_features_select.change(
183
+ fn=update_ga_info_wrapper,
184
+ inputs=[optimize_features, n_features_select],
185
+ outputs=[ga_feature_info]
186
+ )
187
+
188
  def toggle_cv_params(use_cv_val):
189
  return gr.update(visible=use_cv_val)
190
+
191
  def toggle_ga_params(use_ga_val):
192
+ return tuple([gr.update(visible=use_ga_val)] * 6)
193
+
194
+ def toggle_feature_slider(opt_feat_val):
195
  return (
196
+ gr.update(visible=opt_feat_val),
197
+ gr.update(visible=opt_feat_val)
 
198
  )
199
+
200
  use_cv.change(
201
  fn=toggle_cv_params,
202
  inputs=[use_cv],
203
  outputs=[n_folds]
204
  )
205
+
206
  use_ga.change(
207
  fn=toggle_ga_params,
208
  inputs=[use_ga],
209
+ outputs=[optimize_features, n_features_select,
210
+ update_slider_btn, ga_generations, ga_population, n_jobs]
211
  )
212
+
213
+ optimize_features.change(
214
+ fn=toggle_feature_slider,
215
+ inputs=[optimize_features],
216
+ outputs=[n_features_select, update_slider_btn]
217
+ )
218
+
219
  gr.Markdown("---")
220
+
221
  train_btn = gr.Button(
222
  "🚀 Start Training",
223
  variant="primary",
224
  size="lg"
225
  )
226
+
227
  gr.Markdown("""
228
  ### 🔬 Training Modes:
229
 
230
+ **Mode 1: Full GA (Recommended)**
231
+ - GA Feature Selection: ON
232
+ - GA Hyperparameter Tuning: ON
233
+ - ⏱️ Time: 60-120 min
234
+ - 🎯 Best accuracy
 
 
 
 
 
235
 
236
+ **Mode 2: GA Hyperparameters Only**
237
+ - GA Feature Selection: OFF
238
+ - ✅ GA Hyperparameter Tuning: ON
239
+ - ⏱️ Time: 30-60 min
240
+ - 🎯 Good accuracy, faster
241
 
242
+ **Mode 3: No GA (Fast)**
243
+ - ❌ GA: OFF
244
+ - ⏱️ Time: 5-10 min
245
+ - 🎯 Baseline accuracy
246
 
247
+ ---
 
 
248
 
249
+ ### 💡 Feature Selection Tips:
 
 
250
 
251
+ **Many features (>100):**
252
+ - Select 60-80%
253
+ - GA finds most informative
254
 
255
+ **Few features (<50):**
256
+ - Use all features
257
+ - Disable feature selection
258
 
259
+ **Medium features (50-100):**
260
+ - Select 70-90%
261
+ - Balance info and speed
262
  """)
263
+
264
  with gr.Column(scale=2):
265
  training_output = gr.Markdown()
266
  results_table = gr.Dataframe(
 
271
  label="GA Evolution History / CV Statistics",
272
  headers=None
273
  )
274
+
275
  with gr.Accordion("📜 Detailed Training Log", open=False):
276
  training_log = gr.Textbox(
277
  label="Training Log",
 
280
  interactive=False,
281
  show_copy_button=True
282
  )
283
+
284
+ with gr.Accordion("ℹ️ Understanding Feature Selection", open=False):
 
285
  gr.Markdown("""
286
+ ## 🎯 What is Feature Selection?
 
 
 
 
 
 
 
 
287
 
288
+ ### Why select features?
 
 
 
 
 
 
 
 
 
 
 
289
 
290
+ **Too many features can cause:**
291
+ - ❌ **Overfitting**: Model memorizes noise
292
+ - ❌ **Curse of dimensionality**: Need exponentially more data
293
+ - ❌ **Slow training**: More features = more computation
294
+ - ❌ **Redundancy**: Correlated features don't add info
295
 
296
+ ### How GA selects features:
297
 
298
+ Evolution process finds optimal feature subset through:
299
+ - Random initialization
300
+ - Fitness evaluation (accuracy)
301
+ - Selection (keep best)
302
+ - Crossover (combine good solutions)
303
+ - Mutation (explore new combinations)
304
 
305
+ ### Example Results:
306
 
307
+ **Full features (162):**
308
+ - Accuracy: 87%
309
+ - Training time: 60 min
 
310
 
311
+ **GA selected (80 features):**
312
+ - Accuracy: 90% ✓ (better!)
313
+ - Training time: 40 min ✓ (faster!)
 
314
 
315
+ ### When to use:
 
 
 
316
 
317
+ | Features | Recommendation |
 
 
318
  |----------|---------------|
319
+ | >100 | Use GA (60-80%) |
320
+ | 50-100 | ✅ Optional (70-90%) |
321
+ | <50 | Use all features |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  """)
323
+
 
324
  train_btn.click(
325
  fn=train_models_with_ga,
326
+ inputs=[
327
+ use_ga,
328
+ use_cv,
329
+ n_folds,
330
+ ga_generations,
331
+ ga_population,
332
+ n_jobs,
333
+ optimize_features,
334
+ n_features_select
335
+ ],
336
+ outputs=[training_output, results_table,
337
+ ga_history_table, training_log]
338
+ )