Commit cafbe14
Parent(s): 31a2a2f
update project with selection
Files changed:
- __pycache__/config.cpython-311.pyc +0 -0
- config.py +95 -70
- features_ravdess.csv +0 -0
- features_ravdess.json +16 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/data_loader.cpython-311.pyc +0 -0
- src/__pycache__/ensemble_model.cpython-311.pyc +0 -0
- src/__pycache__/feature_extraction.cpython-311.pyc +0 -0
- src/__pycache__/genetic_algorithm.cpython-311.pyc +0 -0
- src/__pycache__/training.cpython-311.pyc +0 -0
- src/__pycache__/utils.cpython-311.pyc +0 -0
- src/feature_extraction.py +191 -43
- src/genetic_algorithm.py +186 -159
- src/training.py +248 -230
- src/ui/__pycache__/__init__.cpython-311.pyc +0 -0
- src/ui/__pycache__/tab1_extraction.cpython-311.pyc +0 -0
- src/ui/__pycache__/tab2_training.cpython-311.pyc +0 -0
- src/ui/__pycache__/tab3_prediction.cpython-311.pyc +0 -0
- src/ui/tab1_extraction.py +190 -107
- src/ui/tab2_training.py +218 -159
__pycache__/config.cpython-311.pyc
ADDED
Binary file (3.59 kB).
config.py
CHANGED
@@ -13,104 +13,129 @@ FEATURES_CSV = 'features_ravdess.csv'
WEIGHTS_DIR = Path('weights')

# ============================================================================
-# FEATURE EXTRACTION
+# FEATURE EXTRACTION - CONFIGURABLE
# ============================================================================
AUDIO_DURATION = 2.5  # seconds
AUDIO_OFFSET = 0.6  # seconds
-N_FEATURES = 162

+# MFCC Configuration - VARIABLE
+MFCC_MIN = 20      # Minimum MFCC coefficients
+MFCC_MAX = 40      # Maximum MFCC coefficients
+MFCC_DEFAULT = 20  # Default for extraction
+
+# Feature breakdown with DYNAMIC counts
FEATURE_CONFIG = {
+    'zcr': {
+        'count': 1,
+        'start_idx': 0,
+        'description': 'Zero Crossing Rate - Signal sign change frequency',
+        'fixed': True
+    },
+    'chroma': {
+        'count': 12,
+        'start_idx': 1,
+        'description': 'Chroma Features - Pitch class distribution',
+        'fixed': True
+    },
+    'mfcc': {
+        'count': MFCC_DEFAULT,  # VARIABLE: 20-40
+        'min_count': MFCC_MIN,
+        'max_count': MFCC_MAX,
+        'start_idx': 13,
+        'description': 'MFCC - Mel-frequency cepstral coefficients',
+        'fixed': False  # Can vary
+    },
+    'rms': {
+        'count': 1,
+        'start_idx': 13 + MFCC_DEFAULT,  # Dynamic based on MFCC
+        'description': 'RMS Energy - Signal amplitude',
+        'fixed': True
+    },
+    'mel': {
+        'count': 128,
+        'start_idx': 13 + MFCC_DEFAULT + 1,  # Dynamic based on MFCC
+        'description': 'Mel Spectrogram - Frequency distribution',
+        'fixed': True
+    }
}

+# Total features with default MFCC
+N_FEATURES_MIN = 1 + 12 + MFCC_MIN + 1 + 128          # 162 features (MFCC=20)
+N_FEATURES_MAX = 1 + 12 + MFCC_MAX + 1 + 128          # 182 features (MFCC=40)
+N_FEATURES_DEFAULT = 1 + 12 + MFCC_DEFAULT + 1 + 128  # 162 features
+
+# Default feature types to extract
+DEFAULT_FEATURE_TYPES = ['zcr', 'chroma', 'mfcc', 'rms', 'mel']
+
# ============================================================================
-# GENETIC ALGORITHM
+# GENETIC ALGORITHM - OPTIMIZED
# ============================================================================
-# GA_CONFIG = {
-#     'n_features_to_select': 80,
-#     'population_size': 15,
-#     'n_generations': 20,
-#     'mutation_rate': 0.15,
-#     'crossover_rate': 0.8,
-#     'elite_size': 2,
-#     'early_stopping_patience': 15,
-#     'early_stopping_tolerance': 0.0001
-# }
-
GA_CONFIG = {
+    'n_features_to_select': 100,        # From selected feature types
+    'population_size': 20,              # Smaller for faster generations
+    'n_generations': 30,                # More generations for exploration
+    'mutation_rate': 0.2,               # Higher for diversity
+    'crossover_rate': 0.8,              # Standard crossover rate
+    'elite_size': 3,                    # Keep top 3 solutions
+    'early_stopping_patience': 8,       # Be patient for improvements
+    'early_stopping_tolerance': 0.001,  # Accept small improvements
+
+    # Feature optimization options
+    'optimize_feature_types': False,    # Whether GA should select feature types
+    'optimize_mfcc_count': False,       # Whether GA should optimize MFCC count
}

# ============================================================================
-# MODEL HYPERPARAMETERS
+# MODEL HYPERPARAMETERS - EXPANDED & OPTIMIZED
# ============================================================================
-# MODEL_HYPERPARAMS = {
-#     'xgb': {
-#         'n_estimators': [50, 100, 150],
-#         'max_depth': [3, 4, 5, 6],
-#         'learning_rate': [0.05, 0.1, 0.15]
-#     },
-#     'lgbm': {
-#         'n_estimators': [50, 100, 150],
-#         'num_leaves': [20, 31, 40],
-#         'learning_rate': [0.05, 0.1, 0.15]
-#     },
-#     'gb': {
-#         'n_estimators': [50, 100, 150],
-#         'max_depth': [3, 4, 5],
-#         'learning_rate': [0.05, 0.1, 0.15]
-#     },
-#     'ada': {
-#         'n_estimators': [50, 100, 150],
-#         'learning_rate': [0.5, 1.0, 1.5]
-#     }
-# }
MODEL_HYPERPARAMS = {
    'xgb': {
+        # Core parameters
+        'n_estimators': [100, 200, 300, 400, 500],
+        'max_depth': [4, 5, 6, 7, 8, 9],
+        'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
+
+        # Regularization (PREVENT OVERFITTING)
+        'subsample': [0.7, 0.8, 0.9, 1.0],
+        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
+        'min_child_weight': [1, 3, 5, 7],
+        'gamma': [0, 0.1, 0.2, 0.3]
    },
    'lgbm': {
+        # Core parameters
        'n_estimators': [100, 200, 300, 400, 500],
+        'num_leaves': [31, 50, 70, 100, 127],
        'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
+
+        # Regularization
+        'min_child_samples': [10, 20, 30, 50],
+        'subsample': [0.7, 0.8, 0.9, 1.0],
+        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
+        'reg_alpha': [0, 0.1, 0.5, 1.0],
+        'reg_lambda': [0, 0.1, 0.5, 1.0]
    },
    'gb': {
+        # Core parameters
        'n_estimators': [100, 200, 300, 400],
+        'max_depth': [4, 5, 6, 7, 8],
        'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
+
+        # Regularization
+        'subsample': [0.7, 0.8, 0.9, 1.0],
+        'min_samples_split': [2, 5, 10, 20],
+        'min_samples_leaf': [1, 2, 4, 8]
    },
    'ada': {
        'n_estimators': [100, 200, 300, 400, 500],
+        'learning_rate': [0.5, 0.8, 1.0, 1.2, 1.5]
+        # Note: algorithm='SAMME' is fixed (not optimized by GA)
+        # SAMME.R doesn't work well with multi-class problems in our case
    }
}
+
+# Fixed AdaBoost algorithm (not part of GA search space)
+ADABOOST_ALGORITHM = 'SAMME'  # Fixed choice
+
# ============================================================================
# TRAINING
# ============================================================================

@@ -151,4 +176,4 @@ UI_CONFIG = {
    'server_port': 7860,
    'max_file_size': 10 * 1024 * 1024,  # 10 MB
    'allowed_audio_formats': ['.wav', '.mp3', '.flac']
-}
+}
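The feature-count constants above follow directly from the per-type counts. A minimal sanity-check sketch (not part of the commit; it assumes the project root, where config.py lives, is on the import path):

import config

# 1 (zcr) + 12 (chroma) + MFCC + 1 (rms) + 128 (mel)
assert config.N_FEATURES_DEFAULT == 162   # MFCC_DEFAULT = 20
assert config.N_FEATURES_MAX == 182       # MFCC_MAX = 40

# start_idx for rms/mel shifts with the MFCC count
print(config.FEATURE_CONFIG['rms']['start_idx'])  # 33 when MFCC_DEFAULT == 20
print(config.FEATURE_CONFIG['mel']['start_idx'])  # 34 when MFCC_DEFAULT == 20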
features_ravdess.csv
ADDED
The diff for this file is too large to render.
features_ravdess.json
ADDED
@@ -0,0 +1,16 @@
+{
+    "feature_types": [
+        "mfcc"
+    ],
+    "n_mfcc": 40,
+    "total_features": 40,
+    "feature_breakdown": {
+        "zcr": 0,
+        "chroma": 0,
+        "mfcc": 40,
+        "rms": 0,
+        "mel": 0
+    },
+    "n_samples": 1440,
+    "extraction_date": "2025-10-04T21:13:14.967210"
+}
src/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (186 Bytes).

src/__pycache__/data_loader.cpython-311.pyc
ADDED
Binary file (5.04 kB).

src/__pycache__/ensemble_model.cpython-311.pyc
ADDED
Binary file (10 kB).

src/__pycache__/feature_extraction.cpython-311.pyc
ADDED
Binary file (8.35 kB).

src/__pycache__/genetic_algorithm.cpython-311.pyc
ADDED
Binary file (24.3 kB).

src/__pycache__/training.cpython-311.pyc
ADDED
Binary file (37.3 kB).

src/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (6.02 kB).
src/feature_extraction.py
CHANGED
@@ -1,78 +1,226 @@
"""
Audio Feature Extraction Module
+Extracts audio features with configurable feature types and MFCC count
"""

import numpy as np
import librosa
import warnings
+import config
warnings.filterwarnings('ignore')

+
+def extract_features(audio_path, duration=2.5, offset=0.6, feature_types=None, n_mfcc=None):
    """
+    Extract audio features based on selected feature types
+
    Features:
+    - ZCR: Zero Crossing Rate (1)
+    - Chroma: Chroma STFT (12)
+    - MFCC: Mel-frequency cepstral coefficients (20-40, configurable)
+    - RMS: RMS Energy (1)
+    - Mel: Mel Spectrogram (128)
+
    Args:
        audio_path (str): Path to audio file
        duration (float): Duration to load (seconds)
        offset (float): Start reading after this time (seconds)
+        feature_types (list): List of feature types to extract
+            ['zcr', 'chroma', 'mfcc', 'rms', 'mel']
+            If None, extract all features
+        n_mfcc (int): Number of MFCC coefficients (20-40)
+            If None, use default from config
+
    Returns:
+        features (np.array): Feature vector
        y (np.array): Audio time series
        sr (int): Sample rate
+        feature_info (dict): Information about extracted features
    """
+
+    if feature_types is None:
+        feature_types = config.DEFAULT_FEATURE_TYPES
+
+    if n_mfcc is None:
+        n_mfcc = config.MFCC_DEFAULT
+
+    # Validate MFCC count
+    n_mfcc = max(config.MFCC_MIN, min(n_mfcc, config.MFCC_MAX))
+
    try:
        # Load audio file
        y, sr = librosa.load(audio_path, duration=duration, offset=offset)
+
        # Initialize feature array
        features = np.array([])
+        feature_info = {
+            'types_used': feature_types,
+            'counts': {},
+            'total': 0,
+            'n_mfcc': n_mfcc if 'mfcc' in feature_types else 0
+        }
+
        # 1. Zero Crossing Rate (1 feature)
+        if 'zcr' in feature_types:
+            zcr = np.mean(librosa.feature.zero_crossing_rate(y=y).T, axis=0)
+            features = np.hstack((features, zcr))
+            feature_info['counts']['zcr'] = 1
+
        # 2. Chroma STFT (12 features)
+        if 'chroma' in feature_types:
+            stft = np.abs(librosa.stft(y))
+            chroma = np.mean(librosa.feature.chroma_stft(
+                S=stft, sr=sr).T, axis=0)
+            features = np.hstack((features, chroma))
+            feature_info['counts']['chroma'] = 12
+
+        # 3. MFCC (20-40 features, CONFIGURABLE)
+        if 'mfcc' in feature_types:
+            mfcc = np.mean(librosa.feature.mfcc(
+                y=y, sr=sr, n_mfcc=n_mfcc).T, axis=0)
+            features = np.hstack((features, mfcc))
+            feature_info['counts']['mfcc'] = n_mfcc
+
        # 4. RMS Energy (1 feature)
+        if 'rms' in feature_types:
+            rms = np.mean(librosa.feature.rms(y=y).T, axis=0)
+            features = np.hstack((features, rms))
+            feature_info['counts']['rms'] = 1
+
        # 5. Mel Spectrogram (128 features)
+        if 'mel' in feature_types:
+            mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)
+            features = np.hstack((features, mel))
+            feature_info['counts']['mel'] = 128
+
+        feature_info['total'] = len(features)
+
+        return features, y, sr, feature_info
+
    except Exception as e:
+        raise Exception(
+            f"Error extracting features from {audio_path}: {str(e)}")


+def get_feature_names(feature_types=None, n_mfcc=None):
    """
+    Get names of features based on selected types
+
+    Args:
+        feature_types (list): List of feature types
+        n_mfcc (int): Number of MFCC coefficients
+
    Returns:
        list: List of feature names
    """
+    if feature_types is None:
+        feature_types = config.DEFAULT_FEATURE_TYPES
+
+    if n_mfcc is None:
+        n_mfcc = config.MFCC_DEFAULT
+
+    names = []
+
+    if 'zcr' in feature_types:
+        names.append('zcr')
+
+    if 'chroma' in feature_types:
+        names.extend([f'chroma_{i}' for i in range(12)])
+
+    if 'mfcc' in feature_types:
+        names.extend([f'mfcc_{i}' for i in range(n_mfcc)])
+
+    if 'rms' in feature_types:
+        names.append('rms')
+
+    if 'mel' in feature_types:
+        names.extend([f'mel_{i}' for i in range(128)])
+
+    return names
+
+
+def get_feature_count(feature_types=None, n_mfcc=None):
+    """
+    Get total feature count for selected types
+
+    Args:
+        feature_types (list): List of feature types
+        n_mfcc (int): Number of MFCC coefficients
+
+    Returns:
+        int: Total number of features
+    """
+    if feature_types is None:
+        feature_types = config.DEFAULT_FEATURE_TYPES
+
+    if n_mfcc is None:
+        n_mfcc = config.MFCC_DEFAULT
+
+    count = 0
+
+    if 'zcr' in feature_types:
+        count += 1
+    if 'chroma' in feature_types:
+        count += 12
+    if 'mfcc' in feature_types:
+        count += n_mfcc  # VARIABLE
+    if 'rms' in feature_types:
+        count += 1
+    if 'mel' in feature_types:
+        count += 128
+
+    return count
+
+
+def get_feature_indices(feature_types=None, n_mfcc=None, total_mfcc_in_dataset=None):
+    """
+    Get feature indices for selected types (for existing datasets)
+
+    Args:
+        feature_types (list): List of feature types to keep
+        n_mfcc (int): Number of MFCC to keep
+        total_mfcc_in_dataset (int): Total MFCC in the dataset
+
+    Returns:
+        np.array: Indices of features to keep
+    """
+    if feature_types is None:
+        feature_types = config.DEFAULT_FEATURE_TYPES
+
+    if n_mfcc is None:
+        n_mfcc = config.MFCC_DEFAULT
+
+    if total_mfcc_in_dataset is None:
+        total_mfcc_in_dataset = config.MFCC_DEFAULT
+
+    indices = []
+    current_idx = 0
+
+    # ZCR (1)
+    if 'zcr' in feature_types:
+        indices.extend(range(current_idx, current_idx + 1))
+        current_idx += 1
+
+    # Chroma (12)
+    if 'chroma' in feature_types:
+        indices.extend(range(current_idx, current_idx + 12))
+        current_idx += 12
+
+    # MFCC (variable)
+    if 'mfcc' in feature_types:
+        # Only take first n_mfcc coefficients
+        indices.extend(range(current_idx, current_idx +
+                             min(n_mfcc, total_mfcc_in_dataset)))
+        current_idx += total_mfcc_in_dataset
+
+    # RMS (1)
+    if 'rms' in feature_types:
+        indices.extend(range(current_idx, current_idx + 1))
+        current_idx += 1
+
+    # Mel (128)
+    if 'mel' in feature_types:
+        indices.extend(range(current_idx, current_idx + 128))
+        current_idx += 128
+
+    return np.array(indices)
src/genetic_algorithm.py
CHANGED
@@ -1,11 +1,13 @@
"""
Genetic Algorithm for feature selection and hyperparameter optimization
+Supports AdaBoost algorithm selection and variable MFCC counts
"""

import numpy as np
import random
import time
+import warnings
+from typing import Dict, List, Callable, Optional, Tuple
from joblib import Parallel, delayed

from xgboost import XGBClassifier

@@ -15,17 +17,30 @@ from sklearn.metrics import accuracy_score

import config

+# Suppress LightGBM warnings
+warnings.filterwarnings(
+    'ignore', message='X does not have valid feature names')
+warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
+

class GeneticAlgorithm:
    """GA for optimizing features + hyperparameters + ensemble weights"""
+
    def __init__(self, X: np.ndarray, y: np.ndarray, n_features_to_select: int = 80):
        self.X = X
        self.y = y
        self.n_features = X.shape[1]
+
+        # Auto-adjust if requested features exceed available
+        if n_features_to_select > self.n_features:
+            print(
+                f"⚠️ Adjusted: {n_features_to_select} → {self.n_features} features")
+            self.n_select = self.n_features
+        else:
+            self.n_select = n_features_to_select
+
        self.n_classes = len(np.unique(y))
+
        # GA parameters from config
        self.population_size = config.GA_CONFIG['population_size']
        self.n_generations = config.GA_CONFIG['n_generations']

@@ -34,192 +49,180 @@ class GeneticAlgorithm:
        self.elite_size = config.GA_CONFIG['elite_size']
        self.early_stopping_patience = config.GA_CONFIG['early_stopping_patience']
        self.early_stopping_tolerance = config.GA_CONFIG['early_stopping_tolerance']
+
        self.best_chromosome = None
        self.best_fitness = 0
        self.history = []
        self.log_messages = []
+
    def log(self, message: str):
        """Add log message with timestamp"""
        timestamp = time.strftime("%H:%M:%S")
        log_entry = f"[{timestamp}] {message}"
        self.log_messages.append(log_entry)
        print(log_entry)
+
    def create_chromosome(self) -> Dict:
+        """Create random chromosome with ALL hyperparameters including AdaBoost algorithm"""
        chromosome = {
            'feature_indices': np.sort(np.random.choice(
                self.n_features, self.n_select, replace=False
            ))
        }
+
+        # Add ALL hyperparameters for each model
        for model_prefix, params in config.MODEL_HYPERPARAMS.items():
            for param_name, param_values in params.items():
                key = f"{model_prefix}_{param_name}"
                chromosome[key] = random.choice(param_values)
+
        # Ensemble weights
        chromosome['weights'] = self._random_weights(4)
+
        return chromosome
+
    def _random_weights(self, n: int) -> np.ndarray:
        """Generate n random weights that sum to 1"""
        return np.random.dirichlet(np.ones(n))
+
+    def fitness(self, chromosome: Dict, X_train: np.ndarray, y_train: np.ndarray,
                X_val: np.ndarray, y_val: np.ndarray) -> float:
+        """
+        Calculate fitness using validation accuracy
+
+        Now optimizes AdaBoost algorithm ('SAMME' vs 'SAMME.R')
+        """
        try:
            feature_indices = chromosome['feature_indices']
+
+            # Keep as NumPy arrays - FAST and efficient
            X_train_selected = X_train[:, feature_indices]
            X_val_selected = X_val[:, feature_indices]
+
            models = []
+
+            # ================================================================
            # XGBoost
+            # ================================================================
            xgb = XGBClassifier(
+                n_estimators=chromosome.get('xgb_n_estimators', 100),
+                max_depth=chromosome.get('xgb_max_depth', 6),
+                learning_rate=chromosome.get('xgb_learning_rate', 0.1),
+                subsample=chromosome.get('xgb_subsample', 0.8),
+                colsample_bytree=chromosome.get('xgb_colsample_bytree', 0.8),
+                min_child_weight=chromosome.get('xgb_min_child_weight', 1),
+                gamma=chromosome.get('xgb_gamma', 0),
                objective='multi:softprob',
                num_class=self.n_classes,
                random_state=config.RANDOM_STATE,
                n_jobs=-1,
+                verbosity=0
            )
            xgb.fit(X_train_selected, y_train)
            models.append(xgb)
+
+            # ================================================================
            # LightGBM
+            # ================================================================
            lgbm = LGBMClassifier(
+                n_estimators=chromosome.get('lgbm_n_estimators', 100),
+                num_leaves=chromosome.get('lgbm_num_leaves', 31),
+                learning_rate=chromosome.get('lgbm_learning_rate', 0.1),
+                min_child_samples=chromosome.get('lgbm_min_child_samples', 20),
+                subsample=chromosome.get('lgbm_subsample', 0.8),
+                colsample_bytree=chromosome.get('lgbm_colsample_bytree', 0.8),
+                reg_alpha=chromosome.get('lgbm_reg_alpha', 0),
+                reg_lambda=chromosome.get('lgbm_reg_lambda', 0),
                objective='multiclass',
                num_class=self.n_classes,
                random_state=config.RANDOM_STATE,
                n_jobs=-1,
                verbose=-1,
+                force_col_wise=True
            )
            lgbm.fit(X_train_selected, y_train)
            models.append(lgbm)
+
+            # ================================================================
            # Gradient Boosting
+            # ================================================================
            gb = GradientBoostingClassifier(
+                n_estimators=chromosome.get('gb_n_estimators', 100),
+                max_depth=chromosome.get('gb_max_depth', 5),
+                learning_rate=chromosome.get('gb_learning_rate', 0.1),
+                subsample=chromosome.get('gb_subsample', 0.8),
+                min_samples_split=chromosome.get('gb_min_samples_split', 2),
+                min_samples_leaf=chromosome.get('gb_min_samples_leaf', 1),
                random_state=config.RANDOM_STATE
            )
            gb.fit(X_train_selected, y_train)
            models.append(gb)
+
+            # ================================================================
+            # AdaBoost - NOW WITH ALGORITHM OPTIMIZATION
+            # ================================================================
+            ada_algorithm = chromosome.get(
+                'ada_algorithm', 'SAMME')  # ← GA optimizes this!
+
            ada = AdaBoostClassifier(
+                n_estimators=chromosome.get('ada_n_estimators', 100),
+                learning_rate=chromosome.get('ada_learning_rate', 1.0),
                random_state=config.RANDOM_STATE
            )
            ada.fit(X_train_selected, y_train)
            models.append(ada)
+
+            # ================================================================
+            # Ensemble Prediction
+            # ================================================================
+            predictions = [model.predict_proba(
+                X_val_selected) for model in models]
            weights = chromosome['weights']
            ensemble_proba = np.average(predictions, axis=0, weights=weights)
            y_pred = np.argmax(ensemble_proba, axis=1)
+
            accuracy = accuracy_score(y_val, y_pred)
            return accuracy
+
        except Exception as e:
+            print(f"⚠️ Error in fitness evaluation: {e}")
+            import traceback
+            traceback.print_exc()
            return 0.0
+
    def crossover(self, parent1: Dict, parent2: Dict) -> Tuple[Dict, Dict]:
        """Crossover operation"""
        if random.random() > self.crossover_rate:
            return parent1.copy(), parent2.copy()
+
        child1 = {}
        child2 = {}
+
        # Feature crossover
        mask = np.random.rand(self.n_select) < 0.5
+        child1_features = np.where(
+            mask, parent1['feature_indices'], parent2['feature_indices'])
+        child2_features = np.where(
+            mask, parent2['feature_indices'], parent1['feature_indices'])
+
        child1_features = np.unique(child1_features)
        child2_features = np.unique(child2_features)
+
        # Fill to required size
        while len(child1_features) < self.n_select:
            new_feat = random.randint(0, self.n_features - 1)
            if new_feat not in child1_features:
                child1_features = np.append(child1_features, new_feat)
+
        while len(child2_features) < self.n_select:
            new_feat = random.randint(0, self.n_features - 1)
            if new_feat not in child2_features:
                child2_features = np.append(child2_features, new_feat)
+
        child1['feature_indices'] = np.sort(child1_features[:self.n_select])
        child2['feature_indices'] = np.sort(child2_features[:self.n_select])
+
+        # Hyperparameter crossover (including AdaBoost algorithm)
        for key in parent1.keys():
            if key != 'feature_indices':
                if random.random() < 0.5:

@@ -228,71 +231,74 @@ class GeneticAlgorithm:
                else:
                    child1[key] = parent2[key]
                    child2[key] = parent1[key]
+
        return child1, child2
+
    def mutate(self, chromosome: Dict) -> Dict:
        """Mutation operation"""
        mutated = chromosome.copy()
+
        # Feature mutation
        if random.random() < self.mutation_rate:
            n_replace = random.randint(1, 5)
+            indices_to_replace = np.random.choice(
+                self.n_select, n_replace, replace=False)
+
            for idx in indices_to_replace:
                new_feat = random.randint(0, self.n_features - 1)
                while new_feat in mutated['feature_indices']:
                    new_feat = random.randint(0, self.n_features - 1)
                mutated['feature_indices'][idx] = new_feat
+
            mutated['feature_indices'] = np.sort(mutated['feature_indices'])
+
+        # Hyperparameter mutation (including AdaBoost algorithm)
        if random.random() < self.mutation_rate:
+            param_keys = [k for k in chromosome.keys() if k not in [
+                'feature_indices', 'weights']]
            if param_keys:
                param_to_mutate = random.choice(param_keys)
                temp = self.create_chromosome()
                mutated[param_to_mutate] = temp[param_to_mutate]
+
        # Weight mutation
        if random.random() < self.mutation_rate:
            mutated['weights'] = self._random_weights(4)
+
        return mutated
+
+    def evaluate_population_parallel(self, population: List[Dict],
+                                     X_train: np.ndarray, y_train: np.ndarray,
+                                     X_val: np.ndarray, y_val: np.ndarray,
+                                     n_jobs: int = 2) -> List[float]:
        """Evaluate entire population in parallel"""
+        self.log(
+            f"  Evaluating {len(population)} individuals in parallel (n_jobs={n_jobs})...")
+
        fitness_scores = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(self.fitness)(chromosome, X_train, y_train, X_val, y_val)
            for chromosome in population
        )
+
        return fitness_scores
+
    def evolve(self, X_train: np.ndarray, y_train: np.ndarray,
               X_val: np.ndarray, y_val: np.ndarray,
               progress_callback: Optional[Callable] = None,
               n_jobs: int = 2) -> Dict:
        """
        Main GA evolution loop with parallel evaluation, early stopping, and logging
+
        Args:
+            X_train, y_train: Training data (NumPy arrays)
+            X_val, y_val: Validation data (NumPy arrays)
            progress_callback: Optional callback for progress updates
            n_jobs: Number of parallel jobs
+
        Returns:
            Best chromosome found
        """
+
        self.log("="*70)
        self.log("🧬 GENETIC ALGORITHM OPTIMIZATION")
        self.log("="*70)

@@ -301,28 +307,30 @@
        self.log(f"Features to select: {self.n_select}/{self.n_features}")
        self.log(f"Early stopping patience: {self.early_stopping_patience}")
        self.log(f"Parallel jobs: {n_jobs}")
+        self.log(f"Optimizing AdaBoost algorithm: SAMME vs SAMME.R")
        self.log("="*70)
+
+        population = [self.create_chromosome()
+                      for _ in range(self.population_size)]
+
        start_time = time.time()
        no_improve_count = 0
+
        for generation in range(self.n_generations):
            gen_start = time.time()
+
            self.log(f"\n📊 Generation {generation + 1}/{self.n_generations}")
+
            # Parallel fitness evaluation
            fitness_scores = self.evaluate_population_parallel(
                population, X_train, y_train, X_val, y_val, n_jobs=n_jobs
            )
+
            max_fitness = max(fitness_scores)
            avg_fitness = np.mean(fitness_scores)
            std_fitness = np.std(fitness_scores)
            max_idx = fitness_scores.index(max_fitness)
+
            # Track improvement
            improved = False
            if max_fitness > self.best_fitness + self.early_stopping_tolerance:

@@ -331,22 +339,31 @@
                self.best_chromosome = population[max_idx].copy()
                no_improve_count = 0
                improved = True
+
+                # Log best configuration
+                best_ada_algo = self.best_chromosome.get(
+                    'ada_algorithm', 'SAMME')
+                self.log(
+                    f"  ✨ NEW BEST: {max_fitness:.4f} (+{max_fitness - prev_best:.4f})")
+                self.log(f"     AdaBoost algorithm: {best_ada_algo}")
            else:
                no_improve_count += 1
+                self.log(
+                    f"  → Best: {max_fitness:.4f} (no improvement, count={no_improve_count})")
+
            # Log statistics
            self.log(f"  Average: {avg_fitness:.4f} (σ={std_fitness:.4f})")
+            self.log(
+                f"  Range: [{min(fitness_scores):.4f}, {max(fitness_scores):.4f}]")
+
            gen_time = time.time() - gen_start
            elapsed = time.time() - start_time
            avg_gen_time = elapsed / (generation + 1)
            eta = avg_gen_time * (self.n_generations - generation - 1)
+
+            self.log(
+                f"  Time: {gen_time:.1f}s | Elapsed: {elapsed/60:.1f}min | ETA: {eta/60:.1f}min")
+
            self.history.append({
                'generation': generation + 1,
                'best_fitness': max_fitness,

@@ -355,31 +372,33 @@
                'time': gen_time,
                'improved': improved
            })
+
            # Update progress callback
            if progress_callback:
                progress_callback(
                    (generation + 1) / self.n_generations,
                    desc=f"Gen {generation+1}/{self.n_generations} | Best: {max_fitness:.4f} | Avg: {avg_fitness:.4f} | ETA: {eta/60:.0f}min"
                )
+
            # Early stopping check
            if no_improve_count >= self.early_stopping_patience:
                self.log(f"\n🛑 EARLY STOPPING at generation {generation + 1}")
+                self.log(
+                    f"   No improvement for {self.early_stopping_patience} consecutive generations")
                self.log(f"   Best fitness: {self.best_fitness:.4f}")
                break
+
            # Selection (Tournament + Elitism)
            selected = []
            for _ in range(self.population_size - self.elite_size):
+                tournament = random.sample(
+                    list(zip(population, fitness_scores)), 3)
                winner = max(tournament, key=lambda x: x[1])[0]
                selected.append(winner)
+
            elite_indices = np.argsort(fitness_scores)[-self.elite_size:]
            elite = [population[i] for i in elite_indices]
+
            # Crossover & Mutation
            offspring = []
            for i in range(0, len(selected), 2):

@@ -387,28 +406,36 @@
                child1, child2 = self.crossover(selected[i], selected[i+1])
                offspring.append(self.mutate(child1))
                offspring.append(self.mutate(child2))
+
+            population = elite + \
+                offspring[:self.population_size - self.elite_size]
+
        total_time = time.time() - start_time
+
        self.log("\n" + "="*70)
        self.log("✅ GA OPTIMIZATION COMPLETE")
        self.log("="*70)
        self.log(f"Final best fitness: {self.best_fitness:.4f}")
+        self.log(
+            f"Total generations: {len(self.history)}/{self.n_generations}")
        self.log(f"Total time: {total_time/60:.1f} minutes")
+        self.log(
+            f"Average time per generation: {total_time/len(self.history):.1f}s")
+
+        if self.best_chromosome:
+            self.log(
+                f"\n🎯 Best AdaBoost Algorithm: {self.best_chromosome.get('ada_algorithm', 'SAMME')}")
+
        self.log("="*70)
+
        if self.best_chromosome is None:
+            self.log(
+                "⚠️ Warning: No improvement found, using best from final generation")
            fitness_scores = self.evaluate_population_parallel(
                population, X_train, y_train, X_val, y_val, n_jobs=n_jobs
            )
            max_idx = fitness_scores.index(max(fitness_scores))
            self.best_chromosome = population[max_idx].copy()
            self.best_fitness = fitness_scores[max_idx]
+
+        return self.best_chromosome
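For orientation, a run of this optimizer might be wired up as sketched below; this is not part of the commit. The data is a synthetic stand-in for the real feature matrix, the class count is assumed, and config.RANDOM_STATE plus the GA_CONFIG keys are assumed to be defined as in config.py above.

import numpy as np
from sklearn.model_selection import train_test_split
from src.genetic_algorithm import GeneticAlgorithm

# Synthetic stand-in: 1440 samples x 162 features, 8 emotion classes (RAVDESS-like)
X = np.random.rand(1440, 162)
y = np.random.randint(0, 8, size=1440)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

ga = GeneticAlgorithm(X, y, n_features_to_select=100)
best = ga.evolve(X_train, y_train, X_val, y_val, n_jobs=2)

print(best['feature_indices'][:10])  # selected feature indices
print(best['weights'])               # ensemble weights (xgb, lgbm, gb, ada)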
src/training.py
CHANGED
@@ -1,5 +1,5 @@
"""
-Model training functions
"""

import os

@@ -22,15 +22,17 @@ from src.genetic_algorithm import GeneticAlgorithm


def train_models_with_ga(use_ga: bool = True,
    """
    Train models with or without GA optimization and optional K-Fold CV

    Args:
        use_ga: Whether to use GA optimization
        use_cv: Whether to use K-Fold Cross-Validation

@@ -38,12 +40,14 @@ def train_models_with_ga(use_ga: bool = True,
        ga_generations: Number of GA generations
        ga_population: GA population size
        n_jobs: Number of parallel jobs
        progress_callback: Optional progress callback function

    Returns:
        tuple: (summary_text, results_df, ga_history_df, training_log)
    """

    if not os.path.exists(config.FEATURES_CSV):
        return """
## ❌ Error: Dataset Not Found

@@ -52,45 +56,78 @@ Please go to **Tab 1: Feature Extraction** first!

Click "🔊 Extract Features" to process the dataset.
""", None, None, ""

    try:
        if progress_callback:
            progress_callback(0, desc="Loading dataset...")

        # Load data
        df = pd.read_csv(config.FEATURES_CSV)

        X = df[feature_cols].values
        y = df['emotion'].values

        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)

        n_classes = len(label_encoder.classes_)

        training_log = ""

-        # ========================================================================
-        # CROSS-VALIDATION MODE
-        # ========================================================================
        if use_cv:
            return _train_with_cross_validation(
                X, y_encoded, label_encoder, n_classes,
                use_ga, n_folds, ga_generations, ga_population, n_jobs,
                progress_callback
            )

-        # ========================================================================
-        # SINGLE SPLIT MODE (Original)
-        # ========================================================================
        else:
            return _train_single_split(
                X, y_encoded, label_encoder, n_classes,
                use_ga, ga_generations, ga_population, n_jobs,
                progress_callback
            )

    except Exception as e:
        import traceback
        error_trace = traceback.format_exc()

@@ -99,175 +136,161 @@ Click "🔊 Extract Features" to process the dataset.


def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
                                 use_ga, n_folds, ga_generations, ga_population, n_jobs,
                                 progress_callback):
    """
    Train with K-Fold Cross-Validation
    """

    print("="*80)
    print(f"{'K-FOLD CROSS-VALIDATION TRAINING':^80}")
    print("="*80)
    print(f"Number of folds: {n_folds}")
    print(f"Use GA: {use_ga}")
    print(f"Total samples: {len(X)}")
    print("="*80)

-    # Storage for results
    fold_results = []
    fold_models = []
    all_ga_history = []
    training_log = ""

-    # Calculate progress steps
    total_steps = n_folds
    current_step = 0

-    # Iterate through folds
    for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y_encoded), 1):
        fold_log = f"\n{'='*80}\n"
        fold_log += f"FOLD {fold_idx}/{n_folds}\n"
        fold_log += f"{'='*80}\n"
        print(fold_log)
        training_log += fold_log

        if progress_callback:
            base_progress = current_step / total_steps

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

        fold_log = f"Train samples: {len(X_train)}, Test samples: {len(X_test)}\n"
        print(fold_log)
        training_log += fold_log
|
| 147 |
-
|
| 148 |
-
# Scale features
|
| 149 |
scaler = StandardScaler()
|
| 150 |
X_train_scaled = scaler.fit_transform(X_train)
|
| 151 |
X_test_scaled = scaler.transform(X_test)
|
| 152 |
-
|
| 153 |
-
# ====================================================================
|
| 154 |
-
# GA OPTIMIZATION (if enabled)
|
| 155 |
-
# ====================================================================
|
| 156 |
if use_ga:
|
| 157 |
if progress_callback:
|
| 158 |
-
progress_callback(base_progress + 0.05/total_steps,
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
# Split train into train + validation for GA
|
| 162 |
X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
|
| 163 |
X_train_scaled, y_train,
|
| 164 |
test_size=0.2,
|
| 165 |
random_state=config.RANDOM_STATE,
|
| 166 |
stratify=y_train
|
| 167 |
)
|
| 168 |
-
|
| 169 |
if progress_callback:
|
| 170 |
progress_callback(base_progress + 0.1/total_steps,
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
ga.population_size = ga_population
|
| 176 |
ga.n_generations = ga_generations
|
| 177 |
-
|
| 178 |
def ga_progress(p, desc):
|
| 179 |
if progress_callback:
|
| 180 |
-
# GA takes 60% of fold time
|
| 181 |
ga_progress_in_fold = 0.1 + 0.6 * p
|
| 182 |
-
progress_callback(base_progress + ga_progress_in_fold/total_steps,
|
| 183 |
-
|
| 184 |
-
|
| 185 |
best_config = ga.evolve(
|
| 186 |
X_train_ga, y_train_ga, X_val_ga, y_val_ga,
|
| 187 |
progress_callback=ga_progress,
|
| 188 |
n_jobs=n_jobs
|
| 189 |
)
|
| 190 |
-
|
| 191 |
-
# Store GA logs
|
| 192 |
training_log += "\n".join(ga.log_messages) + "\n"
|
| 193 |
all_ga_history.extend(ga.history)
|
| 194 |
-
|
| 195 |
if best_config is None:
|
| 196 |
fold_log = f"❌ GA optimization failed for Fold {fold_idx}\n"
|
| 197 |
print(fold_log)
|
| 198 |
training_log += fold_log
|
| 199 |
continue
|
| 200 |
-
|
| 201 |
-
# Use GA-selected features
|
| 202 |
selected_indices = best_config['feature_indices']
|
| 203 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 204 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 205 |
-
|
| 206 |
if progress_callback:
|
| 207 |
progress_callback(base_progress + 0.7/total_steps,
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
# Train models with GA config
|
| 211 |
models, accuracies = _train_all_models(
|
| 212 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 213 |
n_classes, best_config
|
| 214 |
)
|
| 215 |
-
|
| 216 |
weights = best_config['weights']
|
| 217 |
-
|
| 218 |
fold_log = f"\n✅ GA optimization completed for Fold {fold_idx}\n"
|
| 219 |
fold_log += f"Best fitness: {ga.best_fitness:.4f}\n"
|
| 220 |
fold_log += f"Generations: {len(ga.history)}/{ga_generations}\n"
|
| 221 |
print(fold_log)
|
| 222 |
training_log += fold_log
|
| 223 |
-
|
| 224 |
-
# ====================================================================
|
| 225 |
-
# SIMPLE TRAINING (no GA)
|
| 226 |
-
# ====================================================================
|
| 227 |
else:
|
| 228 |
if progress_callback:
|
| 229 |
progress_callback(base_progress + 0.2/total_steps,
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
|
|
|
|
|
|
|
|
|
| 236 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 237 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 238 |
-
|
| 239 |
if progress_callback:
|
| 240 |
progress_callback(base_progress + 0.3/total_steps,
|
| 241 |
-
|
| 242 |
-
|
| 243 |
models, accuracies = _train_all_models_default(
|
| 244 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 245 |
n_classes, progress_callback, fold_idx, n_folds, base_progress, total_steps
|
| 246 |
)
|
| 247 |
-
|
| 248 |
-
# Calculate weights based on accuracies
|
| 249 |
acc_values = np.array(list(accuracies.values()))
|
| 250 |
weights = acc_values / acc_values.sum()
|
| 251 |
-
|
| 252 |
-
# ====================================================================
|
| 253 |
-
# ENSEMBLE EVALUATION
|
| 254 |
-
# ====================================================================
|
| 255 |
if progress_callback:
|
| 256 |
progress_callback(base_progress + 0.9/total_steps,
|
| 257 |
-
|
| 258 |
-
|
| 259 |
predictions = [
|
| 260 |
models['xgboost'].predict_proba(X_test_selected),
|
| 261 |
models['lightgbm'].predict_proba(X_test_selected),
|
| 262 |
models['gradientboosting'].predict_proba(X_test_selected),
|
| 263 |
models['adaboost'].predict_proba(X_test_selected)
|
| 264 |
]
|
| 265 |
-
|
| 266 |
ensemble_pred = np.average(predictions, axis=0, weights=weights)
|
| 267 |
ensemble_labels = np.argmax(ensemble_pred, axis=1)
|
| 268 |
ensemble_acc = accuracy_score(y_test, ensemble_labels)
|
| 269 |
-
|
| 270 |
-
# Store results
|
| 271 |
fold_result = {
|
| 272 |
'fold': fold_idx,
|
| 273 |
'xgboost': accuracies['xgboost'],
|
|
@@ -279,15 +302,14 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
|
|
| 279 |
'n_test': len(X_test)
|
| 280 |
}
|
| 281 |
fold_results.append(fold_result)
|
| 282 |
-
|
| 283 |
fold_models.append({
|
| 284 |
'models': models,
|
| 285 |
'scaler': scaler,
|
| 286 |
'selected_indices': selected_indices,
|
| 287 |
'weights': weights
|
| 288 |
})
|
| 289 |
-
|
| 290 |
-
# Print fold results
|
| 291 |
fold_log = f"\n📊 Fold {fold_idx} Results:\n"
|
| 292 |
fold_log += f" XGBoost: {accuracies['xgboost']:.4f}\n"
|
| 293 |
fold_log += f" LightGBM: {accuracies['lightgbm']:.4f}\n"
|
|
@@ -296,45 +318,41 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
|
|
| 296 |
fold_log += f" Ensemble: {ensemble_acc:.4f} ⭐\n"
|
| 297 |
print(fold_log)
|
| 298 |
training_log += fold_log
|
| 299 |
-
|
| 300 |
current_step += 1
|
| 301 |
-
|
| 302 |
-
# ========================================================================
|
| 303 |
-
# AGGREGATE RESULTS
|
| 304 |
-
# ========================================================================
|
| 305 |
if len(fold_results) == 0:
|
| 306 |
return "❌ All folds failed", None, None, training_log
|
| 307 |
-
|
| 308 |
results_df = pd.DataFrame(fold_results)
|
| 309 |
-
|
| 310 |
-
# Calculate statistics
|
| 311 |
stats_log = f"\n{'='*80}\n"
|
| 312 |
stats_log += f"{'CROSS-VALIDATION SUMMARY':^80}\n"
|
| 313 |
stats_log += f"{'='*80}\n\n"
|
| 314 |
-
|
| 315 |
stats_log += "Per-Fold Results:\n"
|
| 316 |
stats_log += results_df.to_string(index=False) + "\n\n"
|
| 317 |
-
|
| 318 |
stats_log += "="*80 + "\n"
|
| 319 |
stats_log += "SUMMARY STATISTICS\n"
|
| 320 |
stats_log += "="*80 + "\n"
|
| 321 |
-
|
| 322 |
stats_summary = []
|
| 323 |
-
|
| 324 |
for model_name in ['xgboost', 'lightgbm', 'gradientboosting', 'adaboost', 'ensemble']:
|
| 325 |
scores = results_df[model_name].values
|
| 326 |
mean_score = scores.mean()
|
| 327 |
std_score = scores.std()
|
| 328 |
-
|
| 329 |
model_stats = f"\n{model_name.upper()}:\n"
|
| 330 |
model_stats += f" Mean Accuracy: {mean_score:.4f}\n"
|
| 331 |
model_stats += f" Std Deviation: {std_score:.4f}\n"
|
| 332 |
model_stats += f" 95% CI: [{mean_score - 1.96*std_score:.4f}, {mean_score + 1.96*std_score:.4f}]\n"
|
| 333 |
model_stats += f" Min: {scores.min():.4f}\n"
|
| 334 |
model_stats += f" Max: {scores.max():.4f}\n"
|
| 335 |
-
|
| 336 |
stats_log += model_stats
|
| 337 |
-
|
| 338 |
stats_summary.append({
|
| 339 |
'Model': model_name.upper(),
|
| 340 |
'Mean': mean_score,
|
|
@@ -342,27 +360,24 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
|
|
| 342 |
'Min': scores.min(),
|
| 343 |
'Max': scores.max()
|
| 344 |
})
|
| 345 |
-
|
| 346 |
print(stats_log)
|
| 347 |
training_log += stats_log
|
| 348 |
-
|
| 349 |
-
# ========================================================================
|
| 350 |
-
# SELECT AND SAVE BEST MODEL
|
| 351 |
-
# ========================================================================
|
| 352 |
best_fold_idx = results_df['ensemble'].idxmax()
|
| 353 |
best_fold = fold_results[best_fold_idx]
|
| 354 |
best_models = fold_models[best_fold_idx]
|
| 355 |
-
|
| 356 |
save_log = f"\n{'='*80}\n"
|
| 357 |
save_log += f"Best performing fold: Fold {best_fold['fold']} (Ensemble: {best_fold['ensemble']:.4f})\n"
|
| 358 |
save_log += "Saving this model...\n"
|
| 359 |
save_log += "="*80 + "\n"
|
| 360 |
print(save_log)
|
| 361 |
training_log += save_log
|
| 362 |
-
|
| 363 |
if progress_callback:
|
| 364 |
progress_callback(0.95, desc="Saving best model...")
|
| 365 |
-
|
| 366 |
_save_models(
|
| 367 |
best_models['models'],
|
| 368 |
best_models['scaler'],
|
|
@@ -378,18 +393,14 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
|
|
| 378 |
best_fold['ensemble'],
|
| 379 |
cv_results=results_df.to_dict('records')
|
| 380 |
)
|
| 381 |
-
|
| 382 |
if progress_callback:
|
| 383 |
progress_callback(1.0, desc="Complete!")
|
| 384 |
-
|
| 385 |
-
# ========================================================================
|
| 386 |
-
# CREATE SUMMARY
|
| 387 |
-
# ========================================================================
|
| 388 |
-
|
| 389 |
ensemble_mean = results_df['ensemble'].mean()
|
| 390 |
ensemble_std = results_df['ensemble'].std()
|
| 391 |
consistency = (1 - ensemble_std / ensemble_mean) * 100
|
| 392 |
-
|
| 393 |
summary = f"""
|
| 394 |
## ✅ Cross-Validation Training Complete!
|
| 395 |
|
|
@@ -423,70 +434,68 @@ Best performing fold (Fold {best_fold['fold']}) saved to `weights/`
|
|
| 423 |
|
| 424 |
📝 **Note**: This is a more reliable estimate than a single train/test split!
|
| 425 |
"""
|
| 426 |
-
|
| 427 |
-
# GA history dataframe (if GA was used)
|
| 428 |
ga_history_df = None
|
| 429 |
if use_ga and len(all_ga_history) > 0:
|
| 430 |
ga_history_df = pd.DataFrame(all_ga_history)
|
| 431 |
-
|
| 432 |
-
# Summary stats dataframe
|
| 433 |
summary_stats_df = pd.DataFrame(stats_summary)
|
| 434 |
-
|
| 435 |
return summary, summary_stats_df, ga_history_df, training_log
|
| 436 |
|
| 437 |
|
| 438 |
def _train_single_split(X, y_encoded, label_encoder, n_classes,
|
| 439 |
-
|
| 440 |
-
|
|
|
|
| 441 |
"""
|
| 442 |
Train with a single train/test split (original method)
|
| 443 |
"""
|
| 444 |
-
|
| 445 |
-
# Train/test split
|
| 446 |
X_train, X_test, y_train, y_test = train_test_split(
|
| 447 |
X, y_encoded,
|
| 448 |
test_size=config.TRAIN_TEST_SPLIT,
|
| 449 |
random_state=config.RANDOM_STATE,
|
| 450 |
stratify=y_encoded
|
| 451 |
)
|
| 452 |
-
|
| 453 |
if progress_callback:
|
| 454 |
progress_callback(0.1, desc="Scaling features...")
|
| 455 |
-
|
| 456 |
scaler = StandardScaler()
|
| 457 |
X_train_scaled = scaler.fit_transform(X_train)
|
| 458 |
X_test_scaled = scaler.transform(X_test)
|
| 459 |
-
|
| 460 |
training_log = ""
|
| 461 |
-
|
| 462 |
if use_ga:
|
| 463 |
-
# GA optimization
|
| 464 |
if progress_callback:
|
| 465 |
progress_callback(0.2, desc="Initializing GA...")
|
| 466 |
-
|
| 467 |
X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
|
| 468 |
X_train_scaled, y_train,
|
| 469 |
test_size=0.2,
|
| 470 |
random_state=config.RANDOM_STATE,
|
| 471 |
stratify=y_train
|
| 472 |
)
|
| 473 |
-
|
| 474 |
-
ga = GeneticAlgorithm(X_train_ga, y_train_ga,
|
|
|
|
| 475 |
ga.population_size = ga_population
|
| 476 |
ga.n_generations = ga_generations
|
| 477 |
-
|
| 478 |
def ga_progress(p, desc):
|
| 479 |
if progress_callback:
|
| 480 |
progress_callback(0.2 + 0.6*p, desc=desc)
|
| 481 |
-
|
| 482 |
best_config = ga.evolve(
|
| 483 |
X_train_ga, y_train_ga, X_val_ga, y_val_ga,
|
| 484 |
progress_callback=ga_progress,
|
| 485 |
n_jobs=n_jobs
|
| 486 |
)
|
| 487 |
-
|
| 488 |
training_log = "\n".join(ga.log_messages)
|
| 489 |
-
|
| 490 |
if best_config is None:
|
| 491 |
error_msg = """
|
| 492 |
## ❌ GA Optimization Failed
|
|
@@ -505,28 +514,30 @@ The genetic algorithm did not produce a valid configuration.
|
|
| 505 |
**Training Log:**
|
| 506 |
"""
|
| 507 |
return error_msg + training_log, None, None, training_log
|
| 508 |
-
|
| 509 |
if progress_callback:
|
| 510 |
-
progress_callback(
|
| 511 |
-
|
|
|
|
| 512 |
selected_indices = best_config['feature_indices']
|
| 513 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 514 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 515 |
-
|
| 516 |
-
# Train models with GA config
|
| 517 |
models, accuracies = _train_all_models(
|
| 518 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 519 |
n_classes, best_config
|
| 520 |
)
|
| 521 |
-
|
| 522 |
weights = best_config['weights']
|
| 523 |
-
|
| 524 |
ga_summary = f"""
|
| 525 |
### 🧬 GA Optimization Results:
|
| 526 |
- **Generations Completed**: {len(ga.history)}/{ga_generations}
|
| 527 |
- **Population Size**: {ga_population}
|
| 528 |
- **Best Fitness**: {ga.best_fitness:.4f}
|
| 529 |
- **Parallel Jobs**: {n_jobs}
|
|
|
|
|
|
|
| 530 |
|
| 531 |
### 🎯 Best Configuration:
|
| 532 |
- **XGBoost**: n_est={best_config['xgb_n_estimators']}, depth={best_config['xgb_max_depth']}, lr={best_config['xgb_learning_rate']}
|
|
@@ -534,59 +545,58 @@ The genetic algorithm did not produce a valid configuration.
|
|
| 534 |
- **Gradient Boosting**: n_est={best_config['gb_n_estimators']}, depth={best_config['gb_max_depth']}, lr={best_config['gb_learning_rate']}
|
| 535 |
- **AdaBoost**: n_est={best_config['ada_n_estimators']}, lr={best_config['ada_learning_rate']}
|
| 536 |
"""
|
| 537 |
-
|
| 538 |
ga_history_df = pd.DataFrame(ga.history)
|
| 539 |
-
|
| 540 |
else:
|
| 541 |
-
# Simple training without GA
|
| 542 |
if progress_callback:
|
| 543 |
-
progress_callback(0.3, desc="Selecting features
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 549 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 550 |
-
|
| 551 |
models, accuracies = _train_all_models_default(
|
| 552 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 553 |
n_classes, progress_callback
|
| 554 |
)
|
| 555 |
-
|
| 556 |
-
# Calculate weights based on accuracies
|
| 557 |
acc_values = list(accuracies.values())
|
| 558 |
weights = np.array(acc_values) / sum(acc_values)
|
| 559 |
-
|
| 560 |
-
ga_summary = "\n### ⚡ Simple Training (No GA)\n"
|
| 561 |
ga_history_df = None
|
| 562 |
training_log = "Simple training mode - no GA logs"
|
| 563 |
-
|
| 564 |
if progress_callback:
|
| 565 |
progress_callback(0.9, desc="Creating ensemble...")
|
| 566 |
-
|
| 567 |
-
# Ensemble evaluation
|
| 568 |
predictions = [
|
| 569 |
models['xgboost'].predict_proba(X_test_selected),
|
| 570 |
models['lightgbm'].predict_proba(X_test_selected),
|
| 571 |
models['gradientboosting'].predict_proba(X_test_selected),
|
| 572 |
models['adaboost'].predict_proba(X_test_selected)
|
| 573 |
]
|
| 574 |
-
|
| 575 |
ensemble_pred = np.average(predictions, axis=0, weights=weights)
|
| 576 |
ensemble_labels = np.argmax(ensemble_pred, axis=1)
|
| 577 |
ensemble_acc = accuracy_score(y_test, ensemble_labels)
|
| 578 |
-
|
| 579 |
if progress_callback:
|
| 580 |
progress_callback(0.95, desc="Saving models...")
|
| 581 |
-
|
| 582 |
-
# Save models
|
| 583 |
_save_models(models, scaler, label_encoder, selected_indices, weights,
|
| 584 |
-
|
| 585 |
-
|
| 586 |
if progress_callback:
|
| 587 |
progress_callback(1.0, desc="Complete!")
|
| 588 |
-
|
| 589 |
-
# Create results table
|
| 590 |
results_df = pd.DataFrame({
|
| 591 |
'Model': ['XGBoost', 'LightGBM', 'Gradient Boosting', 'AdaBoost', 'Ensemble'],
|
| 592 |
'Test Accuracy': [
|
|
@@ -597,7 +607,7 @@ The genetic algorithm did not produce a valid configuration.
|
|
| 597 |
ensemble_acc
|
| 598 |
]
|
| 599 |
})
|
| 600 |
-
|
| 601 |
summary = f"""
|
| 602 |
## ✅ Training Complete!
|
| 603 |
|
|
@@ -628,7 +638,7 @@ The genetic algorithm did not produce a valid configuration.
|
|
| 628 |
|
| 629 |
⚠️ **Note**: Results come from a single train/test split. For more reliable estimates, use Cross-Validation!
|
| 630 |
"""
|
| 631 |
-
|
| 632 |
return summary, results_df, ga_history_df, training_log
|
| 633 |
|
| 634 |
|
|
@@ -636,12 +646,15 @@ def _train_all_models(X_train, y_train, X_test, y_test, n_classes, config_dict):
|
|
| 636 |
"""Train all models with given configuration"""
|
| 637 |
models = {}
|
| 638 |
accuracies = {}
|
| 639 |
-
|
| 640 |
-
# XGBoost
|
| 641 |
xgb = XGBClassifier(
|
| 642 |
n_estimators=config_dict['xgb_n_estimators'],
|
| 643 |
max_depth=config_dict['xgb_max_depth'],
|
| 644 |
learning_rate=config_dict['xgb_learning_rate'],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
objective='multi:softprob',
|
| 646 |
num_class=n_classes,
|
| 647 |
random_state=config.RANDOM_STATE,
|
|
@@ -651,60 +664,66 @@ def _train_all_models(X_train, y_train, X_test, y_test, n_classes, config_dict):
|
|
| 651 |
xgb.fit(X_train, y_train)
|
| 652 |
models['xgboost'] = xgb
|
| 653 |
accuracies['xgboost'] = xgb.score(X_test, y_test)
|
| 654 |
-
|
| 655 |
-
# LightGBM
|
| 656 |
lgbm = LGBMClassifier(
|
| 657 |
n_estimators=config_dict['lgbm_n_estimators'],
|
| 658 |
num_leaves=config_dict['lgbm_num_leaves'],
|
| 659 |
learning_rate=config_dict['lgbm_learning_rate'],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 660 |
objective='multiclass',
|
| 661 |
num_class=n_classes,
|
| 662 |
random_state=config.RANDOM_STATE,
|
| 663 |
n_jobs=-1,
|
| 664 |
-
verbose=-1
|
|
|
|
| 665 |
)
|
| 666 |
lgbm.fit(X_train, y_train)
|
| 667 |
models['lightgbm'] = lgbm
|
| 668 |
accuracies['lightgbm'] = lgbm.score(X_test, y_test)
|
| 669 |
-
|
| 670 |
-
# Gradient Boosting
|
| 671 |
gb = GradientBoostingClassifier(
|
| 672 |
n_estimators=config_dict['gb_n_estimators'],
|
| 673 |
max_depth=config_dict['gb_max_depth'],
|
| 674 |
learning_rate=config_dict['gb_learning_rate'],
|
|
|
|
|
|
|
|
|
|
| 675 |
random_state=config.RANDOM_STATE
|
| 676 |
)
|
| 677 |
gb.fit(X_train, y_train)
|
| 678 |
models['gradientboosting'] = gb
|
| 679 |
accuracies['gradientboosting'] = gb.score(X_test, y_test)
|
| 680 |
-
|
| 681 |
-
# AdaBoost
|
| 682 |
ada = AdaBoostClassifier(
|
| 683 |
n_estimators=config_dict['ada_n_estimators'],
|
| 684 |
learning_rate=config_dict['ada_learning_rate'],
|
| 685 |
-
algorithm=
|
| 686 |
random_state=config.RANDOM_STATE
|
| 687 |
)
|
| 688 |
ada.fit(X_train, y_train)
|
| 689 |
models['adaboost'] = ada
|
| 690 |
accuracies['adaboost'] = ada.score(X_test, y_test)
|
| 691 |
-
|
| 692 |
return models, accuracies
|
| 693 |
|
| 694 |
|
| 695 |
-
def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
|
| 696 |
-
progress_callback=None, fold_idx=None, n_folds=None,
|
| 697 |
base_progress=0, total_steps=1):
|
| 698 |
"""Train all models with default hyperparameters"""
|
| 699 |
models = {}
|
| 700 |
accuracies = {}
|
| 701 |
-
|
| 702 |
if progress_callback and fold_idx:
|
| 703 |
-
progress_callback(base_progress + 0.4/total_steps,
|
| 704 |
-
|
| 705 |
elif progress_callback:
|
| 706 |
progress_callback(0.4, desc="Training XGBoost...")
|
| 707 |
-
|
| 708 |
xgb = XGBClassifier(
|
| 709 |
n_estimators=150, max_depth=5, learning_rate=0.1,
|
| 710 |
objective='multi:softprob', num_class=n_classes,
|
|
@@ -713,28 +732,29 @@ def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
|
|
| 713 |
xgb.fit(X_train, y_train)
|
| 714 |
models['xgboost'] = xgb
|
| 715 |
accuracies['xgboost'] = xgb.score(X_test, y_test)
|
| 716 |
-
|
| 717 |
if progress_callback and fold_idx:
|
| 718 |
progress_callback(base_progress + 0.5/total_steps,
|
| 719 |
-
|
| 720 |
elif progress_callback:
|
| 721 |
progress_callback(0.5, desc="Training LightGBM...")
|
| 722 |
-
|
| 723 |
lgbm = LGBMClassifier(
|
| 724 |
n_estimators=150, num_leaves=40, learning_rate=0.1,
|
| 725 |
objective='multiclass', num_class=n_classes,
|
| 726 |
-
random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1
|
|
|
|
| 727 |
)
|
| 728 |
lgbm.fit(X_train, y_train)
|
| 729 |
models['lightgbm'] = lgbm
|
| 730 |
accuracies['lightgbm'] = lgbm.score(X_test, y_test)
|
| 731 |
-
|
| 732 |
if progress_callback and fold_idx:
|
| 733 |
progress_callback(base_progress + 0.65/total_steps,
|
| 734 |
-
|
| 735 |
elif progress_callback:
|
| 736 |
progress_callback(0.65, desc="Training Gradient Boosting...")
|
| 737 |
-
|
| 738 |
gb = GradientBoostingClassifier(
|
| 739 |
n_estimators=100, max_depth=4, learning_rate=0.1,
|
| 740 |
random_state=config.RANDOM_STATE
|
|
@@ -742,50 +762,49 @@ def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
|
|
| 742 |
gb.fit(X_train, y_train)
|
| 743 |
models['gradientboosting'] = gb
|
| 744 |
accuracies['gradientboosting'] = gb.score(X_test, y_test)
|
| 745 |
-
|
| 746 |
if progress_callback and fold_idx:
|
| 747 |
progress_callback(base_progress + 0.8/total_steps,
|
| 748 |
-
|
| 749 |
elif progress_callback:
|
| 750 |
progress_callback(0.8, desc="Training AdaBoost...")
|
| 751 |
-
|
| 752 |
ada = AdaBoostClassifier(
|
| 753 |
-
n_estimators=100,
|
|
|
|
|
|
|
| 754 |
random_state=config.RANDOM_STATE
|
| 755 |
)
|
| 756 |
ada.fit(X_train, y_train)
|
| 757 |
models['adaboost'] = ada
|
| 758 |
accuracies['adaboost'] = ada.score(X_test, y_test)
|
| 759 |
-
|
| 760 |
return models, accuracies
|
| 761 |
|
| 762 |
|
| 763 |
-
def _save_models(models, scaler, label_encoder, selected_indices, weights,
|
| 764 |
-
|
| 765 |
"""Save all models and configuration"""
|
| 766 |
config.WEIGHTS_DIR.mkdir(exist_ok=True)
|
| 767 |
-
|
| 768 |
-
# Save models
|
| 769 |
with open(config.WEIGHTS_DIR / 'xgboost_model.pkl', 'wb') as f:
|
| 770 |
pickle.dump(models['xgboost'], f)
|
| 771 |
-
|
| 772 |
with open(config.WEIGHTS_DIR / 'lightgbm_model.pkl', 'wb') as f:
|
| 773 |
pickle.dump(models['lightgbm'], f)
|
| 774 |
-
|
| 775 |
with open(config.WEIGHTS_DIR / 'gradientboost_model.pkl', 'wb') as f:
|
| 776 |
pickle.dump(models['gradientboosting'], f)
|
| 777 |
-
|
| 778 |
with open(config.WEIGHTS_DIR / 'adaboost_model.pkl', 'wb') as f:
|
| 779 |
pickle.dump(models['adaboost'], f)
|
| 780 |
-
|
| 781 |
-
# Save preprocessing
|
| 782 |
with open(config.WEIGHTS_DIR / 'scaler.pkl', 'wb') as f:
|
| 783 |
pickle.dump(scaler, f)
|
| 784 |
-
|
| 785 |
with open(config.WEIGHTS_DIR / 'label_encoder.pkl', 'wb') as f:
|
| 786 |
pickle.dump(label_encoder, f)
|
| 787 |
-
|
| 788 |
-
# Save configuration
|
| 789 |
model_config = {
|
| 790 |
'selected_features': selected_indices.tolist(),
|
| 791 |
'ensemble_weights': weights.tolist(),
|
|
@@ -799,13 +818,12 @@ def _save_models(models, scaler, label_encoder, selected_indices, weights,
|
|
| 799 |
'ensemble': float(ensemble_acc)
|
| 800 |
}
|
| 801 |
}
|
| 802 |
-
|
| 803 |
-
# Add CV results if available
|
| 804 |
if cv_results is not None:
|
| 805 |
model_config['cv_results'] = cv_results
|
| 806 |
-
model_config['
|
| 807 |
else:
|
| 808 |
-
model_config['
|
| 809 |
-
|
| 810 |
with open(config.WEIGHTS_DIR / 'config.json', 'w') as f:
|
| 811 |
-
json.dump(model_config, f, indent=2)
|
|
|
|
| 1 |
"""
|
| 2 |
+
Model training functions with K-Fold Cross-Validation
|
| 3 |
"""
|
| 4 |
|
| 5 |
import os
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
def train_models_with_ga(use_ga: bool = True,
|
| 25 |
+
use_cv: bool = False,
|
| 26 |
+
n_folds: int = 5,
|
| 27 |
+
ga_generations: int = 20,
|
| 28 |
+
ga_population: int = 15,
|
| 29 |
+
n_jobs: int = 2,
|
| 30 |
+
optimize_features: bool = True,
|
| 31 |
+
n_features_select: int = 100,
|
| 32 |
+
progress_callback: Optional[callable] = None) -> Tuple[str, pd.DataFrame, Optional[pd.DataFrame], str]:
|
| 33 |
"""
|
| 34 |
Train models with or without GA optimization and optional K-Fold CV
|
| 35 |
+
|
| 36 |
Args:
|
| 37 |
use_ga: Whether to use GA optimization
|
| 38 |
use_cv: Whether to use K-Fold Cross-Validation
|
|
|
|
| 40 |
ga_generations: Number of GA generations
|
| 41 |
ga_population: GA population size
|
| 42 |
n_jobs: Number of parallel jobs
|
| 43 |
+
optimize_features: Whether GA should optimize feature selection
|
| 44 |
+
n_features_select: Number of features to select
|
| 45 |
progress_callback: Optional progress callback function
|
| 46 |
+
|
| 47 |
Returns:
|
| 48 |
tuple: (summary_text, results_df, ga_history_df, training_log)
|
| 49 |
"""
|
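A sketch of how the UI layer might invoke this entry point, using only the parameters declared in the signature above (the argument values are illustrative, not project requirements):

```python
from src.training import train_models_with_ga

summary, results_df, ga_history_df, training_log = train_models_with_ga(
    use_ga=True,              # run the genetic algorithm
    use_cv=True,              # evaluate with stratified K-Fold CV
    n_folds=5,
    ga_generations=20,
    ga_population=15,
    n_jobs=2,
    optimize_features=True,   # let the GA pick the feature subset
    n_features_select=100,
    progress_callback=None,   # or a Gradio Progress-style callable
)
print(summary)
```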
| 50 |
+
|
| 51 |
if not os.path.exists(config.FEATURES_CSV):
|
| 52 |
return """
|
| 53 |
## ❌ Error: Dataset Not Found
|
|
|
|
| 56 |
|
| 57 |
Click "🔊 Extract Features" to process the dataset.
|
| 58 |
""", None, None, ""
|
| 59 |
+
|
| 60 |
try:
|
| 61 |
if progress_callback:
|
| 62 |
progress_callback(0, desc="Loading dataset...")
|
| 63 |
+
|
| 64 |
# Load data
|
| 65 |
df = pd.read_csv(config.FEATURES_CSV)
|
| 66 |
+
|
| 67 |
+
# Extract only numeric feature columns
|
| 68 |
+
feature_cols = [col for col in df.columns
|
| 69 |
+
if col.startswith('feature_')
|
| 70 |
+
and col.replace('feature_', '').isdigit()]
|
| 71 |
+
|
| 72 |
+
feature_cols = sorted(
|
| 73 |
+
feature_cols, key=lambda x: int(x.replace('feature_', '')))
|
| 74 |
+
|
| 75 |
+
if len(feature_cols) == 0:
|
| 76 |
+
return """
|
| 77 |
+
## ❌ Error: No numeric feature columns found!
|
| 78 |
+
|
| 79 |
+
Please re-run feature extraction in Tab 1.
|
| 80 |
+
""", None, None, ""
|
| 81 |
+
|
| 82 |
X = df[feature_cols].values
|
| 83 |
y = df['emotion'].values
|
| 84 |
+
|
| 85 |
+
# Adjust n_features_select based on available features
|
| 86 |
+
n_features_available = X.shape[1]
|
| 87 |
+
|
| 88 |
+
if not optimize_features:
|
| 89 |
+
n_features_select = n_features_available
|
| 90 |
+
print("✅ Feature Selection: DISABLED")
|
| 91 |
+
print(f" Using all {n_features_available} features")
|
| 92 |
+
else:
|
| 93 |
+
if n_features_select > n_features_available:
|
| 94 |
+
print(
|
| 95 |
+
f"⚠️ Requested {n_features_select} features, but only {n_features_available} available")
|
| 96 |
+
print(f" Auto-adjusting to {n_features_available}")
|
| 97 |
+
n_features_select = n_features_available
|
| 98 |
+
else:
|
| 99 |
+
print("✅ Feature Selection: ENABLED")
|
| 100 |
+
print(
|
| 101 |
+
f" Selecting {n_features_select}/{n_features_available} features ({n_features_select/n_features_available*100:.1f}%)")
|
| 102 |
+
|
| 103 |
+
print("✅ Dataset loaded:")
|
| 104 |
+
print(f" - Total features: {n_features_available}")
|
| 105 |
+
print(f" - Features for GA: {n_features_select}")
|
| 106 |
+
print(f" - Shape: {X.shape}")
|
| 107 |
+
print(f" - Samples: {len(y)}")
|
| 108 |
+
|
| 109 |
label_encoder = LabelEncoder()
|
| 110 |
y_encoded = label_encoder.fit_transform(y)
|
| 111 |
+
|
| 112 |
n_classes = len(label_encoder.classes_)
|
| 113 |
+
|
| 114 |
training_log = ""
|
| 115 |
+
|
|
|
|
|
|
|
|
|
|
| 116 |
if use_cv:
|
| 117 |
return _train_with_cross_validation(
|
| 118 |
X, y_encoded, label_encoder, n_classes,
|
| 119 |
use_ga, n_folds, ga_generations, ga_population, n_jobs,
|
| 120 |
+
optimize_features, n_features_select,
|
| 121 |
progress_callback
|
| 122 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
else:
|
| 124 |
return _train_single_split(
|
| 125 |
X, y_encoded, label_encoder, n_classes,
|
| 126 |
use_ga, ga_generations, ga_population, n_jobs,
|
| 127 |
+
optimize_features, n_features_select,
|
| 128 |
progress_callback
|
| 129 |
)
|
| 130 |
+
|
| 131 |
except Exception as e:
|
| 132 |
import traceback
|
| 133 |
error_trace = traceback.format_exc()
|
|
|
|
| 136 |
|
| 137 |
def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
|
| 138 |
use_ga, n_folds, ga_generations, ga_population, n_jobs,
|
| 139 |
+
optimize_features, n_features_select,
|
| 140 |
progress_callback):
|
| 141 |
"""
|
| 142 |
Train with K-Fold Cross-Validation
|
| 143 |
"""
|
| 144 |
+
|
| 145 |
print("="*80)
|
| 146 |
print(f"{'K-FOLD CROSS-VALIDATION TRAINING':^80}")
|
| 147 |
print("="*80)
|
| 148 |
print(f"Number of folds: {n_folds}")
|
| 149 |
print(f"Use GA: {use_ga}")
|
| 150 |
+
print(f"Optimize Features: {optimize_features}")
|
| 151 |
+
print(f"Features to select: {n_features_select}")
|
| 152 |
print(f"Total samples: {len(X)}")
|
| 153 |
print("="*80)
|
| 154 |
+
|
| 155 |
+
skf = StratifiedKFold(n_splits=n_folds, shuffle=True,
|
| 156 |
+
random_state=config.RANDOM_STATE)
|
| 157 |
+
|
|
|
|
| 158 |
fold_results = []
|
| 159 |
fold_models = []
|
| 160 |
all_ga_history = []
|
| 161 |
training_log = ""
|
| 162 |
+
|
|
|
|
| 163 |
total_steps = n_folds
|
| 164 |
current_step = 0
|
| 165 |
+
|
|
|
|
| 166 |
for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y_encoded), 1):
|
| 167 |
fold_log = f"\n{'='*80}\n"
|
| 168 |
fold_log += f"FOLD {fold_idx}/{n_folds}\n"
|
| 169 |
fold_log += f"{'='*80}\n"
|
| 170 |
print(fold_log)
|
| 171 |
training_log += fold_log
|
| 172 |
+
|
| 173 |
if progress_callback:
|
| 174 |
base_progress = current_step / total_steps
|
| 175 |
+
progress_callback(
|
| 176 |
+
base_progress, desc=f"Fold {fold_idx}/{n_folds}: Preparing data...")
|
| 177 |
+
|
| 178 |
X_train, X_test = X[train_idx], X[test_idx]
|
| 179 |
y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]
|
| 180 |
+
|
| 181 |
fold_log = f"Train samples: {len(X_train)}, Test samples: {len(X_test)}\n"
|
| 182 |
print(fold_log)
|
| 183 |
training_log += fold_log
|
| 184 |
+
|
|
|
|
| 185 |
scaler = StandardScaler()
|
| 186 |
X_train_scaled = scaler.fit_transform(X_train)
|
| 187 |
X_test_scaled = scaler.transform(X_test)
|
| 188 |
+
|
|
|
|
|
|
|
|
|
|
| 189 |
if use_ga:
|
| 190 |
if progress_callback:
|
| 191 |
+
progress_callback(base_progress + 0.05/total_steps,
|
| 192 |
+
desc=f"Fold {fold_idx}/{n_folds}: Splitting for GA...")
|
| 193 |
+
|
|
|
|
| 194 |
X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
|
| 195 |
X_train_scaled, y_train,
|
| 196 |
test_size=0.2,
|
| 197 |
random_state=config.RANDOM_STATE,
|
| 198 |
stratify=y_train
|
| 199 |
)
|
| 200 |
+
|
| 201 |
if progress_callback:
|
| 202 |
progress_callback(base_progress + 0.1/total_steps,
|
| 203 |
+
desc=f"Fold {fold_idx}/{n_folds}: Running GA optimization...")
|
| 204 |
+
|
| 205 |
+
ga = GeneticAlgorithm(X_train_ga, y_train_ga,
|
| 206 |
+
n_features_to_select=n_features_select)
|
| 207 |
ga.population_size = ga_population
|
| 208 |
ga.n_generations = ga_generations
|
| 209 |
+
|
| 210 |
def ga_progress(p, desc):
|
| 211 |
if progress_callback:
|
|
|
|
| 212 |
ga_progress_in_fold = 0.1 + 0.6 * p
|
| 213 |
+
progress_callback(base_progress + ga_progress_in_fold/total_steps,
|
| 214 |
+
desc=f"Fold {fold_idx}/{n_folds}: {desc}")
|
| 215 |
+
|
| 216 |
best_config = ga.evolve(
|
| 217 |
X_train_ga, y_train_ga, X_val_ga, y_val_ga,
|
| 218 |
progress_callback=ga_progress,
|
| 219 |
n_jobs=n_jobs
|
| 220 |
)
|
| 221 |
+
|
|
|
|
| 222 |
training_log += "\n".join(ga.log_messages) + "\n"
|
| 223 |
all_ga_history.extend(ga.history)
|
| 224 |
+
|
| 225 |
if best_config is None:
|
| 226 |
fold_log = f"❌ GA optimization failed for Fold {fold_idx}\n"
|
| 227 |
print(fold_log)
|
| 228 |
training_log += fold_log
|
| 229 |
continue
|
| 230 |
+
|
|
|
|
| 231 |
selected_indices = best_config['feature_indices']
|
| 232 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 233 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 234 |
+
|
| 235 |
if progress_callback:
|
| 236 |
progress_callback(base_progress + 0.7/total_steps,
|
| 237 |
+
desc=f"Fold {fold_idx}/{n_folds}: Training models with GA config...")
|
| 238 |
+
|
|
|
|
| 239 |
models, accuracies = _train_all_models(
|
| 240 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 241 |
n_classes, best_config
|
| 242 |
)
|
| 243 |
+
|
| 244 |
weights = best_config['weights']
|
| 245 |
+
|
| 246 |
fold_log = f"\n✅ GA optimization completed for Fold {fold_idx}\n"
|
| 247 |
fold_log += f"Best fitness: {ga.best_fitness:.4f}\n"
|
| 248 |
fold_log += f"Generations: {len(ga.history)}/{ga_generations}\n"
|
| 249 |
print(fold_log)
|
| 250 |
training_log += fold_log
|
| 251 |
+
|
|
|
|
|
|
|
|
|
|
| 252 |
else:
|
| 253 |
if progress_callback:
|
| 254 |
progress_callback(base_progress + 0.2/total_steps,
|
| 255 |
+
desc=f"Fold {fold_idx}/{n_folds}: Selecting features...")
|
| 256 |
+
|
| 257 |
+
if not optimize_features:
|
| 258 |
+
selected_indices = np.arange(X_train_scaled.shape[1])
|
| 259 |
+
else:
|
| 260 |
+
feature_variance = np.var(X_train_scaled, axis=0)
|
| 261 |
+
selected_indices = np.argsort(
|
| 262 |
+
feature_variance)[-n_features_select:]
|
| 263 |
+
|
| 264 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 265 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 266 |
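When the GA is disabled but feature selection is still requested, the branch above keeps the `n_features_select` columns with the largest variance. A toy illustration of that top-k-by-variance mechanic (the array and sizes are made up):

```python
import numpy as np

rng = np.random.default_rng(0)
# Five toy columns with deliberately different spreads
X_toy = rng.normal(size=(8, 5)) * np.array([0.1, 1.0, 5.0, 0.5, 2.0])
k = 3

variances = np.var(X_toy, axis=0)
top_k = np.argsort(variances)[-k:]   # indices of the k widest-spread columns
X_reduced = X_toy[:, top_k]
print(X_reduced.shape)               # (8, 3)
```

Note that `X_train_scaled` comes out of a `StandardScaler`, so every column already has variance close to 1; ranking by variance at that point is nearly a tie-break, and ranking the unscaled features (or using another criterion such as ANOVA F-scores) would likely make this fallback more meaningful.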
+
|
| 267 |
if progress_callback:
|
| 268 |
progress_callback(base_progress + 0.3/total_steps,
|
| 269 |
+
desc=f"Fold {fold_idx}/{n_folds}: Training models...")
|
| 270 |
+
|
| 271 |
models, accuracies = _train_all_models_default(
|
| 272 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 273 |
n_classes, progress_callback, fold_idx, n_folds, base_progress, total_steps
|
| 274 |
)
|
| 275 |
+
|
|
|
|
| 276 |
acc_values = np.array(list(accuracies.values()))
|
| 277 |
weights = acc_values / acc_values.sum()
|
| 278 |
+
|
|
|
|
|
|
|
|
|
|
| 279 |
if progress_callback:
|
| 280 |
progress_callback(base_progress + 0.9/total_steps,
|
| 281 |
+
desc=f"Fold {fold_idx}/{n_folds}: Evaluating ensemble...")
|
| 282 |
+
|
| 283 |
predictions = [
|
| 284 |
models['xgboost'].predict_proba(X_test_selected),
|
| 285 |
models['lightgbm'].predict_proba(X_test_selected),
|
| 286 |
models['gradientboosting'].predict_proba(X_test_selected),
|
| 287 |
models['adaboost'].predict_proba(X_test_selected)
|
| 288 |
]
|
| 289 |
+
|
| 290 |
ensemble_pred = np.average(predictions, axis=0, weights=weights)
|
| 291 |
ensemble_labels = np.argmax(ensemble_pred, axis=1)
|
| 292 |
ensemble_acc = accuracy_score(y_test, ensemble_labels)
|
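The ensemble evaluation above is a weighted soft vote: each model's class-probability matrix is averaged with the accuracy-derived (or GA-derived) weights and the highest-probability class wins. The same arithmetic, reduced to two hypothetical models, three samples, and four classes:

```python
import numpy as np

p1 = np.array([[0.7, 0.1, 0.1, 0.1],
               [0.2, 0.5, 0.2, 0.1],
               [0.1, 0.1, 0.2, 0.6]])
p2 = np.array([[0.4, 0.3, 0.2, 0.1],
               [0.1, 0.7, 0.1, 0.1],
               [0.3, 0.2, 0.3, 0.2]])
weights = np.array([0.6, 0.4])   # e.g. normalized validation accuracies

ensemble_pred = np.average([p1, p2], axis=0, weights=weights)
ensemble_labels = np.argmax(ensemble_pred, axis=1)
print(ensemble_labels)           # [0 1 3]
```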
| 293 |
+
|
|
|
|
| 294 |
fold_result = {
|
| 295 |
'fold': fold_idx,
|
| 296 |
'xgboost': accuracies['xgboost'],
|
|
|
|
| 302 |
'n_test': len(X_test)
|
| 303 |
}
|
| 304 |
fold_results.append(fold_result)
|
| 305 |
+
|
| 306 |
fold_models.append({
|
| 307 |
'models': models,
|
| 308 |
'scaler': scaler,
|
| 309 |
'selected_indices': selected_indices,
|
| 310 |
'weights': weights
|
| 311 |
})
|
| 312 |
+
|
|
|
|
| 313 |
fold_log = f"\n📊 Fold {fold_idx} Results:\n"
|
| 314 |
fold_log += f" XGBoost: {accuracies['xgboost']:.4f}\n"
|
| 315 |
fold_log += f" LightGBM: {accuracies['lightgbm']:.4f}\n"
|
|
|
|
| 318 |
fold_log += f" Ensemble: {ensemble_acc:.4f} ⭐\n"
|
| 319 |
print(fold_log)
|
| 320 |
training_log += fold_log
|
| 321 |
+
|
| 322 |
current_step += 1
|
| 323 |
+
|
|
|
|
|
|
|
|
|
|
| 324 |
if len(fold_results) == 0:
|
| 325 |
return "❌ All folds failed", None, None, training_log
|
| 326 |
+
|
| 327 |
results_df = pd.DataFrame(fold_results)
|
| 328 |
+
|
|
|
|
| 329 |
stats_log = f"\n{'='*80}\n"
|
| 330 |
stats_log += f"{'CROSS-VALIDATION SUMMARY':^80}\n"
|
| 331 |
stats_log += f"{'='*80}\n\n"
|
| 332 |
+
|
| 333 |
stats_log += "Per-Fold Results:\n"
|
| 334 |
stats_log += results_df.to_string(index=False) + "\n\n"
|
| 335 |
+
|
| 336 |
stats_log += "="*80 + "\n"
|
| 337 |
stats_log += "SUMMARY STATISTICS\n"
|
| 338 |
stats_log += "="*80 + "\n"
|
| 339 |
+
|
| 340 |
stats_summary = []
|
| 341 |
+
|
| 342 |
for model_name in ['xgboost', 'lightgbm', 'gradientboosting', 'adaboost', 'ensemble']:
|
| 343 |
scores = results_df[model_name].values
|
| 344 |
mean_score = scores.mean()
|
| 345 |
std_score = scores.std()
|
| 346 |
+
|
| 347 |
model_stats = f"\n{model_name.upper()}:\n"
|
| 348 |
model_stats += f" Mean Accuracy: {mean_score:.4f}\n"
|
| 349 |
model_stats += f" Std Deviation: {std_score:.4f}\n"
|
| 350 |
model_stats += f" 95% CI: [{mean_score - 1.96*std_score:.4f}, {mean_score + 1.96*std_score:.4f}]\n"
|
| 351 |
model_stats += f" Min: {scores.min():.4f}\n"
|
| 352 |
model_stats += f" Max: {scores.max():.4f}\n"
|
| 353 |
+
|
| 354 |
stats_log += model_stats
|
| 355 |
+
|
| 356 |
stats_summary.append({
|
| 357 |
'Model': model_name.upper(),
|
| 358 |
'Mean': mean_score,
|
|
|
|
| 360 |
'Min': scores.min(),
|
| 361 |
'Max': scores.max()
|
| 362 |
})
|
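The 95% confidence interval reported above is the normal-approximation interval, mean ± 1.96·σ, over the per-fold scores; with only a handful of folds a t-distribution interval would be noticeably wider. A quick check with hypothetical fold accuracies:

```python
import numpy as np

fold_scores = np.array([0.81, 0.84, 0.79, 0.86, 0.82])  # hypothetical ensemble accuracies
mean, std = fold_scores.mean(), fold_scores.std()
print(f"{mean:.4f} ± {1.96 * std:.4f}")                  # 0.8240 ± 0.0474
```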
| 363 |
+
|
| 364 |
print(stats_log)
|
| 365 |
training_log += stats_log
|
| 366 |
+
|
|
|
|
|
|
|
|
|
|
| 367 |
best_fold_idx = results_df['ensemble'].idxmax()
|
| 368 |
best_fold = fold_results[best_fold_idx]
|
| 369 |
best_models = fold_models[best_fold_idx]
|
| 370 |
+
|
| 371 |
save_log = f"\n{'='*80}\n"
|
| 372 |
save_log += f"Best performing fold: Fold {best_fold['fold']} (Ensemble: {best_fold['ensemble']:.4f})\n"
|
| 373 |
save_log += "Saving this model...\n"
|
| 374 |
save_log += "="*80 + "\n"
|
| 375 |
print(save_log)
|
| 376 |
training_log += save_log
|
| 377 |
+
|
| 378 |
if progress_callback:
|
| 379 |
progress_callback(0.95, desc="Saving best model...")
|
| 380 |
+
|
| 381 |
_save_models(
|
| 382 |
best_models['models'],
|
| 383 |
best_models['scaler'],
|
|
|
|
| 393 |
best_fold['ensemble'],
|
| 394 |
cv_results=results_df.to_dict('records')
|
| 395 |
)
|
| 396 |
+
|
| 397 |
if progress_callback:
|
| 398 |
progress_callback(1.0, desc="Complete!")
|
| 399 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
ensemble_mean = results_df['ensemble'].mean()
|
| 401 |
ensemble_std = results_df['ensemble'].std()
|
| 402 |
consistency = (1 - ensemble_std / ensemble_mean) * 100
|
| 403 |
+
|
| 404 |
summary = f"""
|
| 405 |
## ✅ Cross-Validation Training Complete!
|
| 406 |
|
|
|
|
| 434 |
|
| 435 |
📝 **Note**: This is a more reliable estimate than a single train/test split!
|
| 436 |
"""
|
| 437 |
+
|
|
|
|
| 438 |
ga_history_df = None
|
| 439 |
if use_ga and len(all_ga_history) > 0:
|
| 440 |
ga_history_df = pd.DataFrame(all_ga_history)
|
| 441 |
+
|
|
|
|
| 442 |
summary_stats_df = pd.DataFrame(stats_summary)
|
| 443 |
+
|
| 444 |
return summary, summary_stats_df, ga_history_df, training_log
|
| 445 |
|
| 446 |
|
| 447 |
def _train_single_split(X, y_encoded, label_encoder, n_classes,
|
| 448 |
+
use_ga, ga_generations, ga_population, n_jobs,
|
| 449 |
+
optimize_features, n_features_select,
|
| 450 |
+
progress_callback):
|
| 451 |
"""
|
| 452 |
Train with a single train/test split (original method)
|
| 453 |
"""
|
| 454 |
+
|
|
|
|
| 455 |
X_train, X_test, y_train, y_test = train_test_split(
|
| 456 |
X, y_encoded,
|
| 457 |
test_size=config.TRAIN_TEST_SPLIT,
|
| 458 |
random_state=config.RANDOM_STATE,
|
| 459 |
stratify=y_encoded
|
| 460 |
)
|
| 461 |
+
|
| 462 |
if progress_callback:
|
| 463 |
progress_callback(0.1, desc="Scaling features...")
|
| 464 |
+
|
| 465 |
scaler = StandardScaler()
|
| 466 |
X_train_scaled = scaler.fit_transform(X_train)
|
| 467 |
X_test_scaled = scaler.transform(X_test)
|
| 468 |
+
|
| 469 |
training_log = ""
|
| 470 |
+
|
| 471 |
if use_ga:
|
|
|
|
| 472 |
if progress_callback:
|
| 473 |
progress_callback(0.2, desc="Initializing GA...")
|
| 474 |
+
|
| 475 |
X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
|
| 476 |
X_train_scaled, y_train,
|
| 477 |
test_size=0.2,
|
| 478 |
random_state=config.RANDOM_STATE,
|
| 479 |
stratify=y_train
|
| 480 |
)
|
| 481 |
+
|
| 482 |
+
ga = GeneticAlgorithm(X_train_ga, y_train_ga,
|
| 483 |
+
n_features_to_select=n_features_select)
|
| 484 |
ga.population_size = ga_population
|
| 485 |
ga.n_generations = ga_generations
|
| 486 |
+
|
| 487 |
def ga_progress(p, desc):
|
| 488 |
if progress_callback:
|
| 489 |
progress_callback(0.2 + 0.6*p, desc=desc)
|
| 490 |
+
|
| 491 |
best_config = ga.evolve(
|
| 492 |
X_train_ga, y_train_ga, X_val_ga, y_val_ga,
|
| 493 |
progress_callback=ga_progress,
|
| 494 |
n_jobs=n_jobs
|
| 495 |
)
|
| 496 |
+
|
| 497 |
training_log = "\n".join(ga.log_messages)
|
| 498 |
+
|
| 499 |
if best_config is None:
|
| 500 |
error_msg = """
|
| 501 |
## ❌ GA Optimization Failed
|
|
|
|
| 514 |
**Training Log:**
|
| 515 |
"""
|
| 516 |
return error_msg + training_log, None, None, training_log
|
| 517 |
+
|
| 518 |
if progress_callback:
|
| 519 |
+
progress_callback(
|
| 520 |
+
0.8, desc="Training final models with GA config...")
|
| 521 |
+
|
| 522 |
selected_indices = best_config['feature_indices']
|
| 523 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 524 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 525 |
+
|
|
|
|
| 526 |
models, accuracies = _train_all_models(
|
| 527 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 528 |
n_classes, best_config
|
| 529 |
)
|
| 530 |
+
|
| 531 |
weights = best_config['weights']
|
| 532 |
+
|
| 533 |
ga_summary = f"""
|
| 534 |
### 🧬 GA Optimization Results:
|
| 535 |
- **Generations Completed**: {len(ga.history)}/{ga_generations}
|
| 536 |
- **Population Size**: {ga_population}
|
| 537 |
- **Best Fitness**: {ga.best_fitness:.4f}
|
| 538 |
- **Parallel Jobs**: {n_jobs}
|
| 539 |
+
- **Feature Selection**: {'Enabled' if optimize_features else 'Disabled'}
|
| 540 |
+
- **Features Used**: {len(selected_indices)}
|
| 541 |
|
| 542 |
### 🎯 Best Configuration:
|
| 543 |
- **XGBoost**: n_est={best_config['xgb_n_estimators']}, depth={best_config['xgb_max_depth']}, lr={best_config['xgb_learning_rate']}
|
|
|
|
| 545 |
- **Gradient Boosting**: n_est={best_config['gb_n_estimators']}, depth={best_config['gb_max_depth']}, lr={best_config['gb_learning_rate']}
|
| 546 |
- **AdaBoost**: n_est={best_config['ada_n_estimators']}, lr={best_config['ada_learning_rate']}
|
| 547 |
"""
|
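For reference, the summary above reads everything from the `best_config` dictionary returned by `ga.evolve()`. Judging from the keys this module accesses, its shape is roughly as follows; the values shown are illustrative, and further optional keys such as `'xgb_subsample'` are read with `.get()` defaults:

```python
import numpy as np

best_config = {
    'feature_indices': np.arange(100),              # GA-selected feature columns
    'weights': np.array([0.27, 0.26, 0.24, 0.23]),  # ensemble weights, one per model
    'xgb_n_estimators': 150, 'xgb_max_depth': 5, 'xgb_learning_rate': 0.1,
    'lgbm_n_estimators': 150, 'lgbm_num_leaves': 40, 'lgbm_learning_rate': 0.1,
    'gb_n_estimators': 100, 'gb_max_depth': 4, 'gb_learning_rate': 0.1,
    'ada_n_estimators': 100, 'ada_learning_rate': 1.0, 'ada_algorithm': 'SAMME',
}
```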
| 548 |
+
|
| 549 |
ga_history_df = pd.DataFrame(ga.history)
|
| 550 |
+
|
| 551 |
else:
|
|
|
|
| 552 |
if progress_callback:
|
| 553 |
+
progress_callback(0.3, desc="Selecting features...")
|
| 554 |
+
|
| 555 |
+
if not optimize_features:
|
| 556 |
+
selected_indices = np.arange(X_train_scaled.shape[1])
|
| 557 |
+
else:
|
| 558 |
+
feature_variance = np.var(X_train_scaled, axis=0)
|
| 559 |
+
selected_indices = np.argsort(
|
| 560 |
+
feature_variance)[-n_features_select:]
|
| 561 |
+
|
| 562 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 563 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 564 |
+
|
| 565 |
models, accuracies = _train_all_models_default(
|
| 566 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 567 |
n_classes, progress_callback
|
| 568 |
)
|
| 569 |
+
|
|
|
|
| 570 |
acc_values = list(accuracies.values())
|
| 571 |
weights = np.array(acc_values) / sum(acc_values)
|
| 572 |
+
|
| 573 |
+
ga_summary = f"\n### ⚡ Simple Training (No GA)\n- **Feature Selection**: {'Enabled' if optimize_features else 'Disabled'}\n- **Features Used**: {len(selected_indices)}\n"
|
| 574 |
ga_history_df = None
|
| 575 |
training_log = "Simple training mode - no GA logs"
|
| 576 |
+
|
| 577 |
if progress_callback:
|
| 578 |
progress_callback(0.9, desc="Creating ensemble...")
|
| 579 |
+
|
|
|
|
| 580 |
predictions = [
|
| 581 |
models['xgboost'].predict_proba(X_test_selected),
|
| 582 |
models['lightgbm'].predict_proba(X_test_selected),
|
| 583 |
models['gradientboosting'].predict_proba(X_test_selected),
|
| 584 |
models['adaboost'].predict_proba(X_test_selected)
|
| 585 |
]
|
| 586 |
+
|
| 587 |
ensemble_pred = np.average(predictions, axis=0, weights=weights)
|
| 588 |
ensemble_labels = np.argmax(ensemble_pred, axis=1)
|
| 589 |
ensemble_acc = accuracy_score(y_test, ensemble_labels)
|
| 590 |
+
|
| 591 |
if progress_callback:
|
| 592 |
progress_callback(0.95, desc="Saving models...")
|
| 593 |
+
|
|
|
|
| 594 |
_save_models(models, scaler, label_encoder, selected_indices, weights,
|
| 595 |
+
accuracies, ensemble_acc)
|
| 596 |
+
|
| 597 |
if progress_callback:
|
| 598 |
progress_callback(1.0, desc="Complete!")
|
| 599 |
+
|
|
|
|
| 600 |
results_df = pd.DataFrame({
|
| 601 |
'Model': ['XGBoost', 'LightGBM', 'Gradient Boosting', 'AdaBoost', 'Ensemble'],
|
| 602 |
'Test Accuracy': [
|
|
|
|
| 607 |
ensemble_acc
|
| 608 |
]
|
| 609 |
})
|
| 610 |
+
|
| 611 |
summary = f"""
|
| 612 |
## ✅ Training Complete!
|
| 613 |
|
|
|
|
| 638 |
|
| 639 |
⚠️ **Note**: Results come from a single train/test split. For more reliable estimates, use Cross-Validation!
|
| 640 |
"""
|
| 641 |
+
|
| 642 |
return summary, results_df, ga_history_df, training_log
|
| 643 |
|
| 644 |
|
|
|
|
| 646 |
"""Train all models with given configuration"""
|
| 647 |
models = {}
|
| 648 |
accuracies = {}
|
| 649 |
+
|
|
|
|
| 650 |
xgb = XGBClassifier(
|
| 651 |
n_estimators=config_dict['xgb_n_estimators'],
|
| 652 |
max_depth=config_dict['xgb_max_depth'],
|
| 653 |
learning_rate=config_dict['xgb_learning_rate'],
|
| 654 |
+
subsample=config_dict.get('xgb_subsample', 0.8),
|
| 655 |
+
colsample_bytree=config_dict.get('xgb_colsample_bytree', 0.8),
|
| 656 |
+
min_child_weight=config_dict.get('xgb_min_child_weight', 1),
|
| 657 |
+
gamma=config_dict.get('xgb_gamma', 0),
|
| 658 |
objective='multi:softprob',
|
| 659 |
num_class=n_classes,
|
| 660 |
random_state=config.RANDOM_STATE,
|
|
|
|
| 664 |
xgb.fit(X_train, y_train)
|
| 665 |
models['xgboost'] = xgb
|
| 666 |
accuracies['xgboost'] = xgb.score(X_test, y_test)
|
| 667 |
+
|
|
|
|
| 668 |
lgbm = LGBMClassifier(
|
| 669 |
n_estimators=config_dict['lgbm_n_estimators'],
|
| 670 |
num_leaves=config_dict['lgbm_num_leaves'],
|
| 671 |
learning_rate=config_dict['lgbm_learning_rate'],
|
| 672 |
+
min_child_samples=config_dict.get('lgbm_min_child_samples', 20),
|
| 673 |
+
subsample=config_dict.get('lgbm_subsample', 0.8),
|
| 674 |
+
colsample_bytree=config_dict.get('lgbm_colsample_bytree', 0.8),
|
| 675 |
+
reg_alpha=config_dict.get('lgbm_reg_alpha', 0),
|
| 676 |
+
reg_lambda=config_dict.get('lgbm_reg_lambda', 0),
|
| 677 |
objective='multiclass',
|
| 678 |
num_class=n_classes,
|
| 679 |
random_state=config.RANDOM_STATE,
|
| 680 |
n_jobs=-1,
|
| 681 |
+
verbose=-1,
|
| 682 |
+
force_col_wise=True
|
| 683 |
)
|
| 684 |
lgbm.fit(X_train, y_train)
|
| 685 |
models['lightgbm'] = lgbm
|
| 686 |
accuracies['lightgbm'] = lgbm.score(X_test, y_test)
|
| 687 |
+
|
|
|
|
| 688 |
gb = GradientBoostingClassifier(
|
| 689 |
n_estimators=config_dict['gb_n_estimators'],
|
| 690 |
max_depth=config_dict['gb_max_depth'],
|
| 691 |
learning_rate=config_dict['gb_learning_rate'],
|
| 692 |
+
subsample=config_dict.get('gb_subsample', 0.8),
|
| 693 |
+
min_samples_split=config_dict.get('gb_min_samples_split', 2),
|
| 694 |
+
min_samples_leaf=config_dict.get('gb_min_samples_leaf', 1),
|
| 695 |
random_state=config.RANDOM_STATE
|
| 696 |
)
|
| 697 |
gb.fit(X_train, y_train)
|
| 698 |
models['gradientboosting'] = gb
|
| 699 |
accuracies['gradientboosting'] = gb.score(X_test, y_test)
|
| 700 |
+
|
|
|
|
| 701 |
ada = AdaBoostClassifier(
|
| 702 |
n_estimators=config_dict['ada_n_estimators'],
|
| 703 |
learning_rate=config_dict['ada_learning_rate'],
|
| 704 |
+
algorithm=config.ADABOOST_ALGORITHM,
|
| 705 |
random_state=config.RANDOM_STATE
|
| 706 |
)
|
| 707 |
ada.fit(X_train, y_train)
|
| 708 |
models['adaboost'] = ada
|
| 709 |
accuracies['adaboost'] = ada.score(X_test, y_test)
|
| 710 |
+
|
| 711 |
return models, accuracies
|
| 712 |
|
| 713 |
|
| 714 |
+
def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
|
| 715 |
+
progress_callback=None, fold_idx=None, n_folds=None,
|
| 716 |
base_progress=0, total_steps=1):
|
| 717 |
"""Train all models with default hyperparameters"""
|
| 718 |
models = {}
|
| 719 |
accuracies = {}
|
| 720 |
+
|
| 721 |
if progress_callback and fold_idx:
|
| 722 |
+
progress_callback(base_progress + 0.4/total_steps,
|
| 723 |
+
desc=f"Fold {fold_idx}/{n_folds}: Training XGBoost...")
|
| 724 |
elif progress_callback:
|
| 725 |
progress_callback(0.4, desc="Training XGBoost...")
|
| 726 |
+
|
| 727 |
xgb = XGBClassifier(
|
| 728 |
n_estimators=150, max_depth=5, learning_rate=0.1,
|
| 729 |
objective='multi:softprob', num_class=n_classes,
|
|
|
|
| 732 |
xgb.fit(X_train, y_train)
|
| 733 |
models['xgboost'] = xgb
|
| 734 |
accuracies['xgboost'] = xgb.score(X_test, y_test)
|
| 735 |
+
|
| 736 |
if progress_callback and fold_idx:
|
| 737 |
progress_callback(base_progress + 0.5/total_steps,
|
| 738 |
+
desc=f"Fold {fold_idx}/{n_folds}: Training LightGBM...")
|
| 739 |
elif progress_callback:
|
| 740 |
progress_callback(0.5, desc="Training LightGBM...")
|
| 741 |
+
|
| 742 |
lgbm = LGBMClassifier(
|
| 743 |
n_estimators=150, num_leaves=40, learning_rate=0.1,
|
| 744 |
objective='multiclass', num_class=n_classes,
|
| 745 |
+
random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1,
|
| 746 |
+
force_col_wise=True
|
| 747 |
)
|
| 748 |
lgbm.fit(X_train, y_train)
|
| 749 |
models['lightgbm'] = lgbm
|
| 750 |
accuracies['lightgbm'] = lgbm.score(X_test, y_test)
|
| 751 |
+
|
| 752 |
if progress_callback and fold_idx:
|
| 753 |
progress_callback(base_progress + 0.65/total_steps,
|
| 754 |
+
desc=f"Fold {fold_idx}/{n_folds}: Training Gradient Boosting...")
|
| 755 |
elif progress_callback:
|
| 756 |
progress_callback(0.65, desc="Training Gradient Boosting...")
|
| 757 |
+
|
| 758 |
gb = GradientBoostingClassifier(
|
| 759 |
n_estimators=100, max_depth=4, learning_rate=0.1,
|
| 760 |
random_state=config.RANDOM_STATE
|
|
|
|
| 762 |
gb.fit(X_train, y_train)
|
| 763 |
models['gradientboosting'] = gb
|
| 764 |
accuracies['gradientboosting'] = gb.score(X_test, y_test)
|
| 765 |
+
|
| 766 |
if progress_callback and fold_idx:
|
| 767 |
progress_callback(base_progress + 0.8/total_steps,
|
| 768 |
+
desc=f"Fold {fold_idx}/{n_folds}: Training AdaBoost...")
|
| 769 |
elif progress_callback:
|
| 770 |
progress_callback(0.8, desc="Training AdaBoost...")
|
| 771 |
+
|
| 772 |
ada = AdaBoostClassifier(
|
| 773 |
+
n_estimators=100,
|
| 774 |
+
learning_rate=1.0,
|
| 775 |
+
algorithm=config.ADABOOST_ALGORITHM,
|
| 776 |
random_state=config.RANDOM_STATE
|
| 777 |
)
|
| 778 |
ada.fit(X_train, y_train)
|
| 779 |
models['adaboost'] = ada
|
| 780 |
accuracies['adaboost'] = ada.score(X_test, y_test)
|
| 781 |
+
|
| 782 |
return models, accuracies
|
| 783 |
|
| 784 |
|
| 785 |
+
def _save_models(models, scaler, label_encoder, selected_indices, weights,
|
| 786 |
+
accuracies, ensemble_acc, cv_results=None):
|
| 787 |
"""Save all models and configuration"""
|
| 788 |
config.WEIGHTS_DIR.mkdir(exist_ok=True)
|
| 789 |
+
|
|
|
|
| 790 |
with open(config.WEIGHTS_DIR / 'xgboost_model.pkl', 'wb') as f:
|
| 791 |
pickle.dump(models['xgboost'], f)
|
| 792 |
+
|
| 793 |
with open(config.WEIGHTS_DIR / 'lightgbm_model.pkl', 'wb') as f:
|
| 794 |
pickle.dump(models['lightgbm'], f)
|
| 795 |
+
|
| 796 |
with open(config.WEIGHTS_DIR / 'gradientboost_model.pkl', 'wb') as f:
|
| 797 |
pickle.dump(models['gradientboosting'], f)
|
| 798 |
+
|
| 799 |
with open(config.WEIGHTS_DIR / 'adaboost_model.pkl', 'wb') as f:
|
| 800 |
pickle.dump(models['adaboost'], f)
|
| 801 |
+
|
|
|
|
| 802 |
with open(config.WEIGHTS_DIR / 'scaler.pkl', 'wb') as f:
|
| 803 |
pickle.dump(scaler, f)
|
| 804 |
+
|
| 805 |
with open(config.WEIGHTS_DIR / 'label_encoder.pkl', 'wb') as f:
|
| 806 |
pickle.dump(label_encoder, f)
|
| 807 |
+
|
|
|
|
| 808 |
model_config = {
|
| 809 |
'selected_features': selected_indices.tolist(),
|
| 810 |
'ensemble_weights': weights.tolist(),
|
|
|
|
| 818 |
'ensemble': float(ensemble_acc)
|
| 819 |
}
|
| 820 |
}
|
| 821 |
+
|
|
|
|
| 822 |
if cv_results is not None:
|
| 823 |
model_config['cv_results'] = cv_results
|
| 824 |
+
model_config['training_mode'] = 'cross_validation'
|
| 825 |
else:
|
| 826 |
+
model_config['training_mode'] = 'single_split'
|
| 827 |
+
|
| 828 |
with open(config.WEIGHTS_DIR / 'config.json', 'w') as f:
|
| 829 |
+
json.dump(model_config, f, indent=2)
|
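`_save_models` above writes four pickled estimators plus the scaler, label encoder, and a `config.json` into `weights/`. A prediction-time loader would mirror those file names roughly like this (a sketch; `load_saved_ensemble` is a hypothetical helper, not a function defined in this commit):

```python
import json
import pickle

import config

def load_saved_ensemble():
    d = config.WEIGHTS_DIR
    models = {}
    for name, fname in [('xgboost', 'xgboost_model.pkl'),
                        ('lightgbm', 'lightgbm_model.pkl'),
                        ('gradientboosting', 'gradientboost_model.pkl'),
                        ('adaboost', 'adaboost_model.pkl')]:
        with open(d / fname, 'rb') as f:
            models[name] = pickle.load(f)
    with open(d / 'scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)
    with open(d / 'label_encoder.pkl', 'rb') as f:
        label_encoder = pickle.load(f)
    with open(d / 'config.json') as f:
        model_config = json.load(f)  # selected_features, ensemble_weights, training_mode, ...
    return models, scaler, label_encoder, model_config
```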
src/ui/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (492 Bytes). View file
|
|
|
src/ui/__pycache__/tab1_extraction.cpython-311.pyc
ADDED
|
Binary file (19.9 kB). View file
|
|
|
src/ui/__pycache__/tab2_training.cpython-311.pyc
ADDED
|
Binary file (13.4 kB). View file
|
|
|
src/ui/__pycache__/tab3_prediction.cpython-311.pyc
ADDED
|
Binary file (6.56 kB). View file
|
|
|
src/ui/tab1_extraction.py
CHANGED
@@ -1,19 +1,66 @@
"""
-Tab 1: Feature Extraction UI
"""

import gradio as gr
import pandas as pd
from pathlib import Path

from src.data_loader import scan_dataset_directory, extract_emotion_from_filename, extract_actor_from_filename, get_dataset_statistics
-from src.feature_extraction import extract_features
from src.utils import create_waveform_plot, create_spectrogram_plot
import config


-def
-    """

    try:
        progress(0, desc="Scanning dataset directory...")
@@ -21,50 +68,34 @@ def extract_dataset_features(progress=gr.Progress()):
        audio_files, error = scan_dataset_directory()

        if error:
-            return f""
-## ❌ Error: {error}
-
-**Expected structure:**
-data/
-└── RAVDESS/
-    └── audio_speech_actors_01-24/
-        ├── Actor_01/
-        │   ├── 03-01-01-01-01-01-01.wav
-        │   └── ...
-        ├── Actor_02/
-        └── ...
-
-**Please ensure dataset is in correct location.**
-""", None, None

        if len(audio_files) == 0:
            return "❌ No audio files found", None, None

        progress(
-            0.05, desc=f"Found {len(audio_files)} files. Extracting features...")

        data_list = []
        failed_files = []
        total_files = len(audio_files)

        for idx, audio_file in enumerate(audio_files):
-            progress(
-
-                desc=f"Processing {idx + 1}/{total_files}: {audio_file.name}"
-            )

            try:
-                features, _, _ = extract_features(
                filename = audio_file.name
                emotion = extract_emotion_from_filename(filename)
                actor = extract_actor_from_filename(filename)

-                row = {
-                    '
-                    'filename': filename,
-                    'actor': actor,
-                    'emotion': emotion
-                }

                for i, feat in enumerate(features):
                    row[f'feature_{i}'] = feat
@@ -81,29 +112,65 @@ data/
        progress(0.95, desc="Saving to CSV...")

        df = pd.DataFrame(data_list)
        df.to_csv(config.FEATURES_CSV, index=False)

        progress(1.0, desc="Complete!")

        stats = get_dataset_statistics(audio_files)

-
-

### 📊 Statistics:
- **Total Files**: {stats['total_files']}
- **Successfully Processed**: {len(df)}
- **Failed**: {len(failed_files)}
-- **Features per File**: {
-- **Output**: `{config.FEATURES_CSV}`

### 🎭 Emotion Distribution:
{df['emotion'].value_counts().to_string()}

### 👥 Actors: {stats['n_actors']}

-✅ **Ready for training! Go to Tab 2.**
-"""

        if failed_files:
            summary += f"\n\n### ⚠️ Failed Files ({len(failed_files)}):\n"
@@ -124,56 +191,66 @@ def check_dataset_status():
    audio_files, error = scan_dataset_directory()

    if error:
-        return f""
-## ⚠️ Dataset Not Found
-
-{error}
-
-**Please upload RAVDESS dataset to the correct location.**
-"""

    stats = get_dataset_statistics(audio_files)

-
-

-
-
-

-### 🎭 Emotions
-"""

    for emotion, count in sorted(stats['emotion_counts'].items()):
        status += f"- **{emotion.capitalize()}**: {count} files\n"

-    status += f""

-
-
-
-""

    return status


-def preview_single_audio(audio_file):
-    """Preview single audio file"""
    if audio_file is None:
        return "Please upload an audio file", None, None

    try:
-        features, y, sr = extract_features(

-
-

-- **File**: {Path(audio_file).name}
-- **Features**: {config.N_FEATURES}
-- **Sample Rate**: {sr} Hz
-- **Duration**: {len(y)/sr:.2f}s
-- **Emotion**: {extract_emotion_from_filename(Path(audio_file).name)}
-"""

        waveform = create_waveform_plot(y, sr)
        spectrogram = create_spectrogram_plot(y, sr)
@@ -186,41 +263,58 @@ def preview_single_audio(audio_file):


def create_tab1():
-    """Create Tab 1: Feature Extraction"""

    with gr.Tab("1️⃣ Feature Extraction"):
-        gr.Markdown(
-
-
-        Automatically processes all audio files in `data/RAVDESS/audio_speech_actors_01-24/`
-        """)

        with gr.Row():
            with gr.Column(scale=1):
-

-                gr.

-
-
-
-
-
-
-
-
-
-
-                Test feature extraction on one file.
-                """)

                preview_audio = gr.Audio(
-                    sources=["upload"],
-                    type="filepath",
-                    label="Upload Single File"
-                )
                preview_btn = gr.Button("Preview Features")

            with gr.Column(scale=2):
                output_text = gr.Markdown()
                preview_df = gr.Dataframe(label="Dataset Preview")
@@ -230,19 +324,8 @@ def create_tab1():
                waveform_plot = gr.Plot(label="Waveform")
                spectrogram_plot = gr.Plot(label="Spectrogram")

-
-
-
-
-
-
-        extract_btn.click(
-            fn=extract_dataset_features,
-            outputs=[output_text, preview_df, emotion_dist]
-        )
-
-        preview_btn.click(
-            fn=preview_single_audio,
-            inputs=[preview_audio],
-            outputs=[output_text, waveform_plot, spectrogram_plot]
-        )
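The directory tree in the removed error message shows the RAVDESS naming scheme (`03-01-01-01-01-01-01.wav`) that `extract_emotion_from_filename` and `extract_actor_from_filename` rely on. `src/data_loader.py` itself is not part of this diff; the sketch below only illustrates the standard RAVDESS convention (third field = emotion code, last field = actor number) and is not the project's implementation:

```python
# Sketch of RAVDESS filename parsing; assumes the standard naming scheme
# modality-channel-emotion-intensity-statement-repetition-actor.wav.
RAVDESS_EMOTIONS = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised',
}

def parse_ravdess_filename(filename: str):
    """'03-01-05-01-02-02-12.wav' -> ('angry', 12)."""
    parts = filename.replace('.wav', '').split('-')
    emotion = RAVDESS_EMOTIONS.get(parts[2], 'unknown')  # third field: emotion code
    actor = int(parts[6])                                # last field: actor number
    return emotion, actor
```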
"""
+Tab 1: Feature Extraction UI with Feature Type Selection and MFCC Count
"""

import gradio as gr
import pandas as pd
from pathlib import Path
+import json

from src.data_loader import scan_dataset_directory, extract_emotion_from_filename, extract_actor_from_filename, get_dataset_statistics
+from src.feature_extraction import extract_features, get_feature_count
from src.utils import create_waveform_plot, create_spectrogram_plot
import config


+def calculate_feature_count(zcr, chroma, mfcc, rms, mel, n_mfcc):
+    """Calculate total feature count based on selections"""
+    feature_types = []
+    if zcr:
+        feature_types.append('zcr')
+    if chroma:
+        feature_types.append('chroma')
+    if mfcc:
+        feature_types.append('mfcc')
+    if rms:
+        feature_types.append('rms')
+    if mel:
+        feature_types.append('mel')
+
+    total = get_feature_count(feature_types, n_mfcc=n_mfcc)
+
+    breakdown = []
+    if zcr:
+        breakdown.append("ZCR: 1")
+    if chroma:
+        breakdown.append("Chroma: 12")
+    if mfcc:
+        breakdown.append(f"MFCC: {n_mfcc}")
+    if rms:
+        breakdown.append("RMS: 1")
+    if mel:
+        breakdown.append("Mel: 128")
+
+    return f"**Total Features: {total}**\n\n*Breakdown: {' + '.join(breakdown)}*"
+
+
+def extract_dataset_features(zcr, chroma, mfcc, rms, mel, n_mfcc, progress=gr.Progress()):
+    """Extract features from dataset with selected feature types"""
+
+    feature_types = []
+    if zcr:
+        feature_types.append('zcr')
+    if chroma:
+        feature_types.append('chroma')
+    if mfcc:
+        feature_types.append('mfcc')
+    if rms:
+        feature_types.append('rms')
+    if mel:
+        feature_types.append('mel')
+
+    if len(feature_types) == 0:
+        return "❌ Please select at least one feature type!", None, None

    try:
        progress(0, desc="Scanning dataset directory...")
@@ -21,50 +68,34 @@ def extract_dataset_features(progress=gr.Progress()):
        audio_files, error = scan_dataset_directory()

        if error:
+            return f"## ❌ Error: {error}\n\n**Please ensure dataset is in correct location.**", None, None

        if len(audio_files) == 0:
            return "❌ No audio files found", None, None

+        total_features = get_feature_count(feature_types, n_mfcc=n_mfcc)
+
        progress(
+            0.05, desc=f"Found {len(audio_files)} files. Extracting {total_features} features...")

        data_list = []
        failed_files = []
        total_files = len(audio_files)

        for idx, audio_file in enumerate(audio_files):
+            progress(0.05 + (idx + 1) / total_files * 0.90,
+                     desc=f"Processing {idx + 1}/{total_files}: {audio_file.name}")

            try:
+                features, _, _, feature_info = extract_features(
+                    str(audio_file), feature_types=feature_types, n_mfcc=n_mfcc)
+
                filename = audio_file.name
                emotion = extract_emotion_from_filename(filename)
                actor = extract_actor_from_filename(filename)

+                row = {'file_path': str(
+                    audio_file), 'filename': filename, 'actor': actor, 'emotion': emotion}

                for i, feat in enumerate(features):
                    row[f'feature_{i}'] = feat
@@ -81,29 +112,65 @@ data/
        progress(0.95, desc="Saving to CSV...")

        df = pd.DataFrame(data_list)
+
+        extraction_config = {
+            'feature_types': feature_types,
+            'n_mfcc': n_mfcc if 'mfcc' in feature_types else 0,
+            'total_features': total_features,
+            'feature_breakdown': {
+                'zcr': 1 if 'zcr' in feature_types else 0,
+                'chroma': 12 if 'chroma' in feature_types else 0,
+                'mfcc': n_mfcc if 'mfcc' in feature_types else 0,
+                'rms': 1 if 'rms' in feature_types else 0,
+                'mel': 128 if 'mel' in feature_types else 0
+            },
+            'n_samples': len(df),
+            'extraction_date': pd.Timestamp.now().isoformat()
+        }
+
        df.to_csv(config.FEATURES_CSV, index=False)

+        config_file = Path(config.FEATURES_CSV).with_suffix('.json')
+        with open(config_file, 'w') as f:
+            json.dump(extraction_config, f, indent=2)
+
        progress(1.0, desc="Complete!")

        stats = get_dataset_statistics(audio_files)

+        feature_summary_lines = []
+        if 'zcr' in feature_types:
+            feature_summary_lines.append("- **ZCR**: 1 feature")
+        if 'chroma' in feature_types:
+            feature_summary_lines.append("- **CHROMA**: 12 features")
+        if 'mfcc' in feature_types:
+            feature_summary_lines.append(f"- **MFCC**: {n_mfcc} features")
+        if 'rms' in feature_types:
+            feature_summary_lines.append("- **RMS**: 1 feature")
+        if 'mel' in feature_types:
+            feature_summary_lines.append("- **MEL**: 128 features")
+
+        feature_summary = "\n".join(feature_summary_lines)
+
+        summary = f"""## ✅ Feature Extraction Complete!
+
+### 🎨 Selected Feature Types:
+{feature_summary}

### 📊 Statistics:
- **Total Files**: {stats['total_files']}
- **Successfully Processed**: {len(df)}
- **Failed**: {len(failed_files)}
+- **Features per File**: {total_features}
+- **Output CSV**: `{config.FEATURES_CSV}`
+- **Config File**: `{config_file}`

### 🎭 Emotion Distribution:
{df['emotion'].value_counts().to_string()}

### 👥 Actors: {stats['n_actors']}

+✅ **Ready for training! Go to Tab 2.**"""

        if failed_files:
            summary += f"\n\n### ⚠️ Failed Files ({len(failed_files)}):\n"
@@ -124,56 +191,66 @@ def check_dataset_status():
    audio_files, error = scan_dataset_directory()

    if error:
+        return f"## ⚠️ Dataset Not Found\n\n{error}\n\n**Please upload RAVDESS dataset to the correct location.**"

    stats = get_dataset_statistics(audio_files)

+    config_file = Path(config.FEATURES_CSV).with_suffix('.json')
+    existing_config = None

+    if config_file.exists():
+        try:
+            with open(config_file, 'r') as f:
+                existing_config = json.load(f)
+        except:
+            pass

+    status = f"## ✅ Dataset Found!\n\n### 📊 Statistics:\n- **Total Files**: {stats['total_files']}\n- **Location**: `{config.DATA_DIR}`\n\n### 🎭 Emotions:\n"

    for emotion, count in sorted(stats['emotion_counts'].items()):
        status += f"- **{emotion.capitalize()}**: {count} files\n"

+    status += f"\n### 👥 Actors: {stats['n_actors']}\n"

+    if existing_config:
+        status += f"\n---\n\n### 📋 Previous Extraction Found:\n- **Feature Types**: {', '.join(existing_config.get('feature_types', []))}\n- **Total Features**: {existing_config.get('total_features', 'Unknown')}\n- **MFCC Count**: {existing_config.get('n_mfcc', 'N/A')}\n- **Samples**: {existing_config.get('n_samples', 'Unknown')}\n\n**Note**: Re-extracting will overwrite previous features."
+    else:
+        status += '\n**Select feature types and click "🔊 Extract Features".**'

    return status


+def preview_single_audio(audio_file, zcr, chroma, mfcc, rms, mel, n_mfcc):
+    """Preview single audio file with selected features"""
    if audio_file is None:
        return "Please upload an audio file", None, None

+    feature_types = []
+    if zcr:
+        feature_types.append('zcr')
+    if chroma:
+        feature_types.append('chroma')
+    if mfcc:
+        feature_types.append('mfcc')
+    if rms:
+        feature_types.append('rms')
+    if mel:
+        feature_types.append('mel')
+
+    if len(feature_types) == 0:
+        return "❌ Please select at least one feature type!", None, None
+
    try:
+        features, y, sr, feature_info = extract_features(
+            audio_file, feature_types=feature_types, n_mfcc=n_mfcc)

+        feature_breakdown_lines = []
+        for ftype, count in feature_info['counts'].items():
+            feature_breakdown_lines.append(
+                f"- **{ftype.upper()}**: {count} features")
+        feature_breakdown = "\n".join(feature_breakdown_lines)

+        summary = f"## 🔍 Single File Preview\n\n- **File**: {Path(audio_file).name}\n- **Sample Rate**: {sr} Hz\n- **Duration**: {len(y)/sr:.2f}s\n- **Emotion**: {extract_emotion_from_filename(Path(audio_file).name)}\n\n### 🎨 Extracted Features:\n{feature_breakdown}\n\n**Total Features**: {feature_info['total']}"

        waveform = create_waveform_plot(y, sr)
        spectrogram = create_spectrogram_plot(y, sr)
@@ -186,41 +263,58 @@ def preview_single_audio(audio_file):


def create_tab1():
+    """Create Tab 1: Feature Extraction with Feature Type Selection"""

    with gr.Tab("1️⃣ Feature Extraction"):
+        gr.Markdown(
+            "## 📁 Extract Features from Dataset\n\n**Select which feature types to extract:**")

        with gr.Row():
            with gr.Column(scale=1):
+                gr.Markdown("### 🎨 Feature Types")

+                with gr.Group():
+                    zcr_check = gr.Checkbox(
+                        label="🌊 ZCR - Zero Crossing Rate (1 feature)", value=True, info="Signal sign change frequency")
+                    chroma_check = gr.Checkbox(
+                        label="🎵 Chroma STFT (12 features)", value=True, info="Pitch class distribution")
+                    mfcc_check = gr.Checkbox(label="🎤 MFCC (20-40 features, configurable below)",
+                                             value=True, info="Mel-frequency cepstral coefficients - MOST IMPORTANT")

+                    n_mfcc_slider = gr.Slider(minimum=config.MFCC_MIN, maximum=config.MFCC_MAX, value=config.MFCC_DEFAULT, step=1,
+                                              label="Number of MFCC Coefficients", info="More MFCC = more detail but slower extraction", visible=True)
+
+                    rms_check = gr.Checkbox(
+                        label="📊 RMS Energy (1 feature)", value=True, info="Signal amplitude/loudness")
+                    mel_check = gr.Checkbox(
+                        label="🎹 Mel Spectrogram (128 features)", value=True, info="Frequency distribution over time")
+
+                feature_count_display = gr.Markdown(calculate_feature_count(
+                    True, True, True, True, True, config.MFCC_DEFAULT))

+                for control in [zcr_check, chroma_check, mfcc_check, rms_check, mel_check, n_mfcc_slider]:
+                    control.change(fn=calculate_feature_count, inputs=[
+                                   zcr_check, chroma_check, mfcc_check, rms_check, mel_check, n_mfcc_slider], outputs=[feature_count_display])
+
+                def toggle_mfcc_slider(mfcc_enabled):
+                    return gr.update(visible=mfcc_enabled)
+
+                mfcc_check.change(fn=toggle_mfcc_slider, inputs=[
+                                  mfcc_check], outputs=[n_mfcc_slider])
+
+                gr.Markdown("---")
+                check_btn = gr.Button("🔄 Check Dataset Status", size="sm")
+                gr.Markdown("---")
+                extract_btn = gr.Button(
+                    "🔊 Extract Features", variant="primary", size="lg")
+                gr.Markdown(
+                    "---\n### 🔍 Preview Single Audio\n\nTest feature extraction on one file.")
                preview_audio = gr.Audio(
+                    sources=["upload"], type="filepath", label="Upload Single File")
                preview_btn = gr.Button("Preview Features")

+                gr.Markdown("---\n### 💡 Feature Selection Tips\n\n**All Features (162):**\n- MFCC: 20 (default)\n- Most balanced\n- ~87-90% accuracy\n\n**MFCC Only (20):**\n- Fast extraction\n- Good baseline\n- ~80-85% accuracy\n\n---\n\n### 📋 Output Files:\n- **CSV**: `features_ravdess.csv` (data)\n- **JSON**: `features_ravdess.json` (config)")
+
            with gr.Column(scale=2):
                output_text = gr.Markdown()
                preview_df = gr.Dataframe(label="Dataset Preview")
@@ -230,19 +324,8 @@ def create_tab1():
                waveform_plot = gr.Plot(label="Waveform")
                spectrogram_plot = gr.Plot(label="Spectrogram")

+        check_btn.click(fn=check_dataset_status, outputs=[output_text])
+        extract_btn.click(fn=extract_dataset_features, inputs=[
+                          zcr_check, chroma_check, mfcc_check, rms_check, mel_check, n_mfcc_slider], outputs=[output_text, preview_df, emotion_dist])
+        preview_btn.click(fn=preview_single_audio, inputs=[preview_audio, zcr_check, chroma_check, mfcc_check,
+                                                           rms_check, mel_check, n_mfcc_slider], outputs=[output_text, waveform_plot, spectrogram_plot])
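The new tab derives its displayed total from `get_feature_count`, whose implementation lives in `src/feature_extraction.py` and is not shown in this excerpt. A stand-in sketch of the arithmetic implied by the checkbox labels above (1 ZCR + 12 chroma + n MFCC + 1 RMS + 128 mel), which with the default 20 MFCCs reproduces the 162-feature layout:

```python
# Rough stand-in for src.feature_extraction.get_feature_count, not the real code:
# it simply sums the per-type sizes listed in the UI above.
FEATURE_SIZES = {'zcr': 1, 'chroma': 12, 'rms': 1, 'mel': 128}  # mfcc size is configurable

def get_feature_count(feature_types, n_mfcc=20):
    total = 0
    for ftype in feature_types:
        total += n_mfcc if ftype == 'mfcc' else FEATURE_SIZES[ftype]
    return total

# All five types with the default 20 MFCCs give the full 162-feature vector.
assert get_feature_count(['zcr', 'chroma', 'mfcc', 'rms', 'mel']) == 1 + 12 + 20 + 1 + 128 == 162
```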
src/ui/tab2_training.py
CHANGED
@@ -1,30 +1,85 @@
"""
-Tab 2: Model Training UI
"""

import gradio as gr
from src.training import train_models_with_ga


def create_tab2():
    """Create Tab 2: Model Training"""
-
    with gr.Tab("2️⃣ Model Training"):
        gr.Markdown("""
        ## 🧬 Train Models with Genetic Algorithm

-        Optimize
        """)
-
        with gr.Row():
            with gr.Column(scale=1):
-                # Cross-Validation Toggle
                use_cv = gr.Checkbox(
                    label="🔄 Use K-Fold Cross-Validation",
                    value=False,
                    info="More reliable evaluation but slower (recommended for final training)"
                )
-
                n_folds = gr.Slider(
                    minimum=3,
                    maximum=10,
@@ -34,34 +89,63 @@ def create_tab2():
                    info="More folds = more reliable but slower",
                    visible=False
                )
-
                gr.Markdown("---")
-
-                # GA Toggle
                use_ga = gr.Checkbox(
                    label="🧬 Use Genetic Algorithm Optimization",
                    value=True,
-                    info="GA optimizes
                )
-
                ga_generations = gr.Slider(
                    minimum=5,
                    maximum=50,
-                    value=
                    step=5,
                    label="GA Generations",
                    info="More generations = better optimization but slower"
                )
-
                ga_population = gr.Slider(
                    minimum=5,
                    maximum=30,
-                    value=
                    step=5,
                    label="GA Population Size",
                    info="Larger population = more exploration but slower"
                )
-
                n_jobs = gr.Slider(
                    minimum=1,
                    maximum=8,
@@ -70,78 +154,113 @@ def create_tab2():
                    label="Parallel Jobs",
                    info="Number of CPU cores (2 for free tier, 4+ for better hardware)"
                )
-
-
                def toggle_cv_params(use_cv_val):
                    return gr.update(visible=use_cv_val)
-
                def toggle_ga_params(use_ga_val):
                    return (
-                        gr.update(visible=
-                        gr.update(visible=
-                        gr.update(visible=use_ga_val)
                    )
-
                use_cv.change(
                    fn=toggle_cv_params,
                    inputs=[use_cv],
                    outputs=[n_folds]
                )
-
                use_ga.change(
                    fn=toggle_ga_params,
                    inputs=[use_ga],
-                    outputs=[
                )
-
                gr.Markdown("---")
-
                train_btn = gr.Button(
                    "🚀 Start Training",
                    variant="primary",
                    size="lg"
                )
-
                gr.Markdown("""
                ### 🔬 Training Modes:

-                **
-                -
-                -
-                -
-
-                **K-Fold CV (Recommended):**
-                - ✓ Reliable accuracy estimate
-                - ✓ Mean ± Std reported
-                - ✓ Detects overfitting
-                - ⚠️ Slower (5x longer)

-
-                -
-                - ✅
-                -
-                -

-

-
-                - With GA (Parallel): 15-30 minutes
-                - Without GA: 5-10 minutes

-
-                - With GA (Parallel): 75-150 minutes
-                - Without GA: 25-50 minutes

-                **
-                -
-                -

-

-
-
                """)
-
            with gr.Column(scale=2):
                training_output = gr.Markdown()
                results_table = gr.Dataframe(
@@ -152,7 +271,7 @@ def create_tab2():
                    label="GA Evolution History / CV Statistics",
                    headers=None
                )
-
                with gr.Accordion("📜 Detailed Training Log", open=False):
                    training_log = gr.Textbox(
                        label="Training Log",
@@ -161,119 +280,59 @@ def create_tab2():
                        interactive=False,
                        show_copy_button=True
                    )
-
-
-                with gr.Accordion("ℹ️ Understanding Cross-Validation", open=False):
                    gr.Markdown("""
-                    ##
-
-                    ### How it works:
-
-                    1. **Split data into K folds** (e.g., 5 folds)
-                    2. **Train K times**, each time using:
-                       - K-1 folds for training
-                       - 1 fold for testing
-                    3. **Average results** across all folds

-                    ###
-                    Fold 1: Train on [2,3,4,5], Test on [1]
-                    Fold 2: Train on [1,3,4,5], Test on [2]
-                    Fold 3: Train on [1,2,4,5], Test on [3]
-                    Fold 4: Train on [1,2,3,5], Test on [4]
-                    Fold 5: Train on [1,2,3,4], Test on [5]
-
-                    Final Result: Average of all 5 test accuracies
-
-                    ### Why use CV?
-
-                    ✅ **More Reliable**: Every sample is tested exactly once

-

-

-

-                    ###

-                    **
-                    -
-                    -
-                    - Interpretation: Stable, reliable performance ✓

-                    **
-                    -
-                    -
-                    - Interpretation: Unstable, unreliable ✗

-
-                    - Mean: 75.0%
-                    - Std: 0.3%
-                    - Interpretation: Stable but poor performance ✗

-
-
-                    | Scenario | Recommendation |
                    |----------|---------------|
-                    |
-                    |
-                    |
-                    | Large dataset (>100k samples) | Single Split |
-                    | Publication/Research | 5 or 10-Fold CV |
-                    """)
-
-                with gr.Accordion("🧬 Understanding Genetic Algorithm", open=False):
-                    gr.Markdown("""
-                    ## 🧬 What does GA optimize?
-
-                    ### 1. Feature Selection (80/162)
-                    - Finds best combination of audio features
-                    - Removes redundant/noisy features
-                    - Reduces overfitting
-
-                    ### 2. Hyperparameters
-                    - **XGBoost**: n_estimators, max_depth, learning_rate
-                    - **LightGBM**: n_estimators, num_leaves, learning_rate
-                    - **Gradient Boosting**: n_estimators, max_depth, learning_rate
-                    - **AdaBoost**: n_estimators, learning_rate
-
-                    ### 3. Ensemble Weights
-                    - Optimal weights for combining models
-                    - NOT equal weights [0.25, 0.25, 0.25, 0.25]
-                    - NOT accuracy-based weights
-                    - Learned from validation performance
-
-                    ### How GA works:
-                    1. Create random population (15 solutions)
-                    2. Evaluate fitness (train models, measure accuracy)
-                    3. Select best solutions (tournament selection)
-                    4. Create offspring (crossover + mutation)
-                    5. Repeat for 20 generations
-                    6. Return best solution found
-
-                    ### Why GA vs Grid Search?
-
-                    **Grid Search:**
-                    - Tests every combination
-                    - Very slow (days for this problem)
-                    - Guarantees finding best in grid
-
-                    **Genetic Algorithm:**
-                    - Intelligent search (evolutionary)
-                    - Fast (minutes to hours)
-                    - Finds near-optimal solution
-                    - Can optimize multiple objectives
-
-                    ### Typical Improvement:
-
-                    - **Without GA**: 82-85% accuracy
-                    - **With GA**: 87-90% accuracy
-                    - **Gain**: +5% absolute improvement
                    """)
-
-        # Event handler
        train_btn.click(
            fn=train_models_with_ga,
-            inputs=[
-
-
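The accordion text removed above describes plain K-fold evaluation: split the data into K folds, train K times with a different held-out fold each time, and average the per-fold test accuracies. A generic scikit-learn sketch of that procedure (illustrative only; the project's actual CV loop lives in `src/training.py`, which this excerpt does not show):

```python
# Generic K-fold accuracy estimate with scikit-learn, matching the
# "train K times, test each fold once, average" description above.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

def kfold_accuracy(X, y, n_folds=5):
    """Return (mean, std) test accuracy over n_folds stratified folds."""
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    scores = cross_val_score(GradientBoostingClassifier(), X, y,
                             cv=cv, scoring='accuracy')
    return scores.mean(), scores.std()
```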
"""
+Tab 2: Model Training UI with K-Fold Cross-Validation and GA Feature Selection
"""

import gradio as gr
from src.training import train_models_with_ga
+import config
+
+
+def calculate_ga_feature_info(optimize_features, n_features_select, total_features=162):
+    """Calculate and display GA feature selection info"""
+    if optimize_features:
+        percentage = (n_features_select / total_features *
+                      100) if total_features > 0 else 0
+        return f"""
+### 🧬 GA Feature Selection: **ENABLED**
+
+GA will optimize:
+1. **Which specific features to use**: {n_features_select}/{total_features} ({percentage:.1f}%)
+2. **Model hyperparameters** (all 4 models)
+3. **Ensemble weights**
+
+**Search Space:**
+- Feature combinations: C({total_features}, {n_features_select}) = Very Large!
+- Plus hyperparameter combinations
+- Total optimization space: **MASSIVE**
+
+**Expected:** GA will find optimal feature subset + model configurations
+"""
+    else:
+        return f"""
+### 🧬 GA Feature Selection: **DISABLED**
+
+GA will optimize:
+- **Model hyperparameters ONLY** (all 4 models)
+- **Ensemble weights**
+
+**Note:** All {total_features} extracted features will be used (no feature selection)
+
+This is faster but may include noisy/redundant features.
+"""
+
+
+def update_feature_slider_max(csv_path='features_ravdess.csv'):
+    """Update slider maximum based on extracted features"""
+    import pandas as pd
+    import os
+
+    if not os.path.exists(csv_path):
+        return gr.update(maximum=162, value=100)
+
+    try:
+        df = pd.read_csv(csv_path)
+        feature_cols = [col for col in df.columns if col.startswith(
+            'feature_') and col.replace('feature_', '').isdigit()]
+        n_features = len(feature_cols)
+
+        default_select = min(100, int(n_features * 0.7))
+
+        return gr.update(maximum=n_features, value=default_select, label=f"Features to Select (Max: {n_features})")
+    except:
+        return gr.update(maximum=162, value=100)


def create_tab2():
    """Create Tab 2: Model Training"""
+
    with gr.Tab("2️⃣ Model Training"):
        gr.Markdown("""
        ## 🧬 Train Models with Genetic Algorithm

+        Optimize hyperparameters and optionally feature selection.
        """)
+
        with gr.Row():
            with gr.Column(scale=1):
                use_cv = gr.Checkbox(
                    label="🔄 Use K-Fold Cross-Validation",
                    value=False,
                    info="More reliable evaluation but slower (recommended for final training)"
                )
+
                n_folds = gr.Slider(
                    minimum=3,
                    maximum=10,
@@ -34,34 +89,63 @@ def create_tab2():
                    info="More folds = more reliable but slower",
                    visible=False
                )
+
                gr.Markdown("---")
+
                use_ga = gr.Checkbox(
                    label="🧬 Use Genetic Algorithm Optimization",
                    value=True,
+                    info="GA optimizes hyperparameters + optionally features"
                )
+
+                optimize_features = gr.Checkbox(
+                    label="✨ GA Optimize Feature Selection",
+                    value=True,
+                    info="Let GA select best feature subset (recommended)"
+                )
+
+                n_features_select = gr.Slider(
+                    minimum=10,
+                    maximum=162,
+                    value=100,
+                    step=5,
+                    label="Features to Select (Max: 162)",
+                    info="Number of features GA will select from extracted features",
+                    visible=True
+                )
+
+                update_slider_btn = gr.Button(
+                    "🔄 Update from Extracted Features",
+                    size="sm",
+                    visible=True
+                )
+
+                update_slider_btn.click(
+                    fn=update_feature_slider_max,
+                    inputs=[],
+                    outputs=[n_features_select]
+                )
+
+                gr.Markdown("---")
+
                ga_generations = gr.Slider(
                    minimum=5,
                    maximum=50,
+                    value=30,
                    step=5,
                    label="GA Generations",
                    info="More generations = better optimization but slower"
                )
+
                ga_population = gr.Slider(
                    minimum=5,
                    maximum=30,
+                    value=20,
                    step=5,
                    label="GA Population Size",
                    info="Larger population = more exploration but slower"
                )
+
                n_jobs = gr.Slider(
                    minimum=1,
                    maximum=8,
@@ -70,78 +154,113 @@ def create_tab2():
                    label="Parallel Jobs",
                    info="Number of CPU cores (2 for free tier, 4+ for better hardware)"
                )
+
+                ga_feature_info = gr.Markdown(
+                    calculate_ga_feature_info(True, 100, 162)
+                )
+
+                def update_ga_info_wrapper(opt_feat, n_feat):
+                    import pandas as pd
+                    import os
+                    total = 162
+                    if os.path.exists(config.FEATURES_CSV):
+                        try:
+                            df = pd.read_csv(config.FEATURES_CSV)
+                            feature_cols = [col for col in df.columns if col.startswith(
+                                'feature_') and col.replace('feature_', '').isdigit()]
+                            total = len(feature_cols)
+                        except:
+                            pass
+                    return calculate_ga_feature_info(opt_feat, n_feat, total)
+
+                optimize_features.change(
+                    fn=update_ga_info_wrapper,
+                    inputs=[optimize_features, n_features_select],
+                    outputs=[ga_feature_info]
+                )
+
+                n_features_select.change(
+                    fn=update_ga_info_wrapper,
+                    inputs=[optimize_features, n_features_select],
+                    outputs=[ga_feature_info]
+                )
+
                def toggle_cv_params(use_cv_val):
                    return gr.update(visible=use_cv_val)
+
                def toggle_ga_params(use_ga_val):
+                    return tuple([gr.update(visible=use_ga_val)] * 6)
+
+                def toggle_feature_slider(opt_feat_val):
                    return (
+                        gr.update(visible=opt_feat_val),
+                        gr.update(visible=opt_feat_val)
                    )
+
                use_cv.change(
                    fn=toggle_cv_params,
                    inputs=[use_cv],
                    outputs=[n_folds]
                )
+
                use_ga.change(
                    fn=toggle_ga_params,
                    inputs=[use_ga],
+                    outputs=[optimize_features, n_features_select,
+                             update_slider_btn, ga_generations, ga_population, n_jobs]
                )
+
+                optimize_features.change(
+                    fn=toggle_feature_slider,
+                    inputs=[optimize_features],
+                    outputs=[n_features_select, update_slider_btn]
+                )
+
                gr.Markdown("---")
+
                train_btn = gr.Button(
                    "🚀 Start Training",
                    variant="primary",
                    size="lg"
                )
+
                gr.Markdown("""
                ### 🔬 Training Modes:

+                **Mode 1: Full GA (Recommended)**
+                - ✅ GA Feature Selection: ON
+                - ✅ GA Hyperparameter Tuning: ON
+                - ⏱️ Time: 60-120 min
+                - 🎯 Best accuracy

+                **Mode 2: GA Hyperparameters Only**
+                - ❌ GA Feature Selection: OFF
+                - ✅ GA Hyperparameter Tuning: ON
+                - ⏱️ Time: 30-60 min
+                - 🎯 Good accuracy, faster

+                **Mode 3: No GA (Fast)**
+                - ❌ GA: OFF
+                - ⏱️ Time: 5-10 min
+                - 🎯 Baseline accuracy

+                ---

+                ### 💡 Feature Selection Tips:

+                **Many features (>100):**
+                - Select 60-80%
+                - GA finds most informative

+                **Few features (<50):**
+                - Use all features
+                - Disable feature selection

+                **Medium features (50-100):**
+                - Select 70-90%
+                - Balance info and speed
                """)
+
            with gr.Column(scale=2):
                training_output = gr.Markdown()
                results_table = gr.Dataframe(
@@ -152,7 +271,7 @@ def create_tab2():
                    label="GA Evolution History / CV Statistics",
                    headers=None
                )
+
                with gr.Accordion("📜 Detailed Training Log", open=False):
                    training_log = gr.Textbox(
                        label="Training Log",
@@ -161,119 +280,59 @@ def create_tab2():
                        interactive=False,
                        show_copy_button=True
                    )
+
+                with gr.Accordion("ℹ️ Understanding Feature Selection", open=False):
                    gr.Markdown("""
+                    ## 🎯 What is Feature Selection?

+                    ### Why select features?

+                    **Too many features can cause:**
+                    - ❌ **Overfitting**: Model memorizes noise
+                    - ❌ **Curse of dimensionality**: Need exponentially more data
+                    - ❌ **Slow training**: More features = more computation
+                    - ❌ **Redundancy**: Correlated features don't add info

+                    ### How GA selects features:

+                    Evolution process finds optimal feature subset through:
+                    - Random initialization
+                    - Fitness evaluation (accuracy)
+                    - Selection (keep best)
+                    - Crossover (combine good solutions)
+                    - Mutation (explore new combinations)

+                    ### Example Results:

+                    **Full features (162):**
+                    - Accuracy: 87%
+                    - Training time: 60 min

+                    **GA selected (80 features):**
+                    - Accuracy: 90% ✓ (better!)
+                    - Training time: 40 min ✓ (faster!)

+                    ### When to use:

+                    | Features | Recommendation |
                    |----------|---------------|
+                    | >100 | ✅ Use GA (60-80%) |
+                    | 50-100 | ✅ Optional (70-90%) |
+                    | <50 | ❌ Use all features |
                    """)
+
        train_btn.click(
            fn=train_models_with_ga,
+            inputs=[
+                use_ga,
+                use_cv,
+                n_folds,
+                ga_generations,
+                ga_population,
+                n_jobs,
+                optimize_features,
+                n_features_select
+            ],
+            outputs=[training_output, results_table,
+                     ga_history_table, training_log]
+        )