Commit cafbe14
Parent(s): 31a2a2f
update project with selection
Files changed:
- __pycache__/config.cpython-311.pyc +0 -0
- config.py +95 -70
- features_ravdess.csv +0 -0
- features_ravdess.json +16 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/data_loader.cpython-311.pyc +0 -0
- src/__pycache__/ensemble_model.cpython-311.pyc +0 -0
- src/__pycache__/feature_extraction.cpython-311.pyc +0 -0
- src/__pycache__/genetic_algorithm.cpython-311.pyc +0 -0
- src/__pycache__/training.cpython-311.pyc +0 -0
- src/__pycache__/utils.cpython-311.pyc +0 -0
- src/feature_extraction.py +191 -43
- src/genetic_algorithm.py +186 -159
- src/training.py +248 -230
- src/ui/__pycache__/__init__.cpython-311.pyc +0 -0
- src/ui/__pycache__/tab1_extraction.cpython-311.pyc +0 -0
- src/ui/__pycache__/tab2_training.cpython-311.pyc +0 -0
- src/ui/__pycache__/tab3_prediction.cpython-311.pyc +0 -0
- src/ui/tab1_extraction.py +190 -107
- src/ui/tab2_training.py +218 -159
__pycache__/config.cpython-311.pyc
ADDED
Binary file (3.59 kB).
config.py
CHANGED
@@ -13,104 +13,129 @@ FEATURES_CSV = 'features_ravdess.csv'
WEIGHTS_DIR = Path('weights')

# ============================================================================
-# FEATURE EXTRACTION
+# FEATURE EXTRACTION - CONFIGURABLE
# ============================================================================
AUDIO_DURATION = 2.5  # seconds
AUDIO_OFFSET = 0.6  # seconds
-N_FEATURES = 162

+# MFCC Configuration - VARIABLE
+MFCC_MIN = 20      # Minimum MFCC coefficients
+MFCC_MAX = 40      # Maximum MFCC coefficients
+MFCC_DEFAULT = 20  # Default for extraction
+
+# Feature breakdown with DYNAMIC counts
FEATURE_CONFIG = {
+    'zcr': {
+        'count': 1,
+        'start_idx': 0,
+        'description': 'Zero Crossing Rate - Signal sign change frequency',
+        'fixed': True
+    },
+    'chroma': {
+        'count': 12,
+        'start_idx': 1,
+        'description': 'Chroma Features - Pitch class distribution',
+        'fixed': True
+    },
+    'mfcc': {
+        'count': MFCC_DEFAULT,  # VARIABLE: 20-40
+        'min_count': MFCC_MIN,
+        'max_count': MFCC_MAX,
+        'start_idx': 13,
+        'description': 'MFCC - Mel-frequency cepstral coefficients',
+        'fixed': False  # Can vary
+    },
+    'rms': {
+        'count': 1,
+        'start_idx': 13 + MFCC_DEFAULT,  # Dynamic based on MFCC
+        'description': 'RMS Energy - Signal amplitude',
+        'fixed': True
+    },
+    'mel': {
+        'count': 128,
+        'start_idx': 13 + MFCC_DEFAULT + 1,  # Dynamic based on MFCC
+        'description': 'Mel Spectrogram - Frequency distribution',
+        'fixed': True
+    }
}

+# Total features with default MFCC
+N_FEATURES_MIN = 1 + 12 + MFCC_MIN + 1 + 128          # 162 features (MFCC=20)
+N_FEATURES_MAX = 1 + 12 + MFCC_MAX + 1 + 128          # 182 features (MFCC=40)
+N_FEATURES_DEFAULT = 1 + 12 + MFCC_DEFAULT + 1 + 128  # 162 features
+
+# Default feature types to extract
+DEFAULT_FEATURE_TYPES = ['zcr', 'chroma', 'mfcc', 'rms', 'mel']
+
# ============================================================================
-# GENETIC ALGORITHM
+# GENETIC ALGORITHM - OPTIMIZED
# ============================================================================
-# GA_CONFIG = {
-#     'n_features_to_select': 80,
-#     'population_size': 15,
-#     'n_generations': 20,
-#     'mutation_rate': 0.15,
-#     'crossover_rate': 0.8,
-#     'elite_size': 2,
-#     'early_stopping_patience': 15,
-#     'early_stopping_tolerance': 0.0001
-# }
-
GA_CONFIG = {
+    'n_features_to_select': 100,        # From selected feature types
+    'population_size': 20,              # Smaller for faster generations
+    'n_generations': 30,                # More generations for exploration
+    'mutation_rate': 0.2,               # Higher for diversity
+    'crossover_rate': 0.8,              # Standard crossover rate
+    'elite_size': 3,                    # Keep top 3 solutions
+    'early_stopping_patience': 8,       # Be patient for improvements
+    'early_stopping_tolerance': 0.001,  # Accept small improvements
+
+    # Feature optimization options
+    'optimize_feature_types': False,    # Whether GA should select feature types
+    'optimize_mfcc_count': False,       # Whether GA should optimize MFCC count
}

# ============================================================================
-# MODEL HYPERPARAMETERS
+# MODEL HYPERPARAMETERS - EXPANDED & OPTIMIZED
# ============================================================================
-# MODEL_HYPERPARAMS = {
-#     'xgb': {
-#         'n_estimators': [50, 100, 150],
-#         'max_depth': [3, 4, 5, 6],
-#         'learning_rate': [0.05, 0.1, 0.15]
-#     },
-#     'lgbm': {
-#         'n_estimators': [50, 100, 150],
-#         'num_leaves': [20, 31, 40],
-#         'learning_rate': [0.05, 0.1, 0.15]
-#     },
-#     'gb': {
-#         'n_estimators': [50, 100, 150],
-#         'max_depth': [3, 4, 5],
-#         'learning_rate': [0.05, 0.1, 0.15]
-#     },
-#     'ada': {
-#         'n_estimators': [50, 100, 150],
-#         'learning_rate': [0.5, 1.0, 1.5]
-#     }
-# }
MODEL_HYPERPARAMS = {
    'xgb': {
+        # Core parameters
+        'n_estimators': [100, 200, 300, 400, 500],
+        'max_depth': [4, 5, 6, 7, 8, 9],
+        'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
+
+        # Regularization (PREVENT OVERFITTING)
+        'subsample': [0.7, 0.8, 0.9, 1.0],
+        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
+        'min_child_weight': [1, 3, 5, 7],
+        'gamma': [0, 0.1, 0.2, 0.3]
    },
    'lgbm': {
+        # Core parameters
        'n_estimators': [100, 200, 300, 400, 500],
+        'num_leaves': [31, 50, 70, 100, 127],
        'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
+
+        # Regularization
+        'min_child_samples': [10, 20, 30, 50],
+        'subsample': [0.7, 0.8, 0.9, 1.0],
+        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
+        'reg_alpha': [0, 0.1, 0.5, 1.0],
+        'reg_lambda': [0, 0.1, 0.5, 1.0]
    },
    'gb': {
+        # Core parameters
        'n_estimators': [100, 200, 300, 400],
+        'max_depth': [4, 5, 6, 7, 8],
        'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
+
+        # Regularization
+        'subsample': [0.7, 0.8, 0.9, 1.0],
+        'min_samples_split': [2, 5, 10, 20],
+        'min_samples_leaf': [1, 2, 4, 8]
    },
    'ada': {
        'n_estimators': [100, 200, 300, 400, 500],
+        'learning_rate': [0.5, 0.8, 1.0, 1.2, 1.5]
+        # Note: algorithm='SAMME' is fixed (not optimized by GA)
+        # SAMME.R doesn't work well with multi-class problems in our case
    }
}
+
+# Fixed AdaBoost algorithm (not part of GA search space)
+ADABOOST_ALGORITHM = 'SAMME'  # Fixed choice
+
# ============================================================================
# TRAINING
# ============================================================================

@@ -151,4 +176,4 @@ UI_CONFIG = {
    'server_port': 7860,
    'max_file_size': 10 * 1024 * 1024,  # 10 MB
    'allowed_audio_formats': ['.wav', '.mp3', '.flac']
-}
+}
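The feature-count constants above follow directly from the per-type counts. A minimal sanity-check sketch (not part of the commit; it assumes the project root, where config.py lives, is on the import path):

import config

# 1 (zcr) + 12 (chroma) + MFCC + 1 (rms) + 128 (mel)
assert config.N_FEATURES_DEFAULT == 162   # MFCC_DEFAULT = 20
assert config.N_FEATURES_MAX == 182       # MFCC_MAX = 40

# start_idx for rms/mel shifts with the MFCC count
print(config.FEATURE_CONFIG['rms']['start_idx'])  # 33 when MFCC_DEFAULT == 20
print(config.FEATURE_CONFIG['mel']['start_idx'])  # 34 when MFCC_DEFAULT == 20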
features_ravdess.csv
ADDED
The diff for this file is too large to render.
features_ravdess.json
ADDED
@@ -0,0 +1,16 @@
+{
+    "feature_types": [
+        "mfcc"
+    ],
+    "n_mfcc": 40,
+    "total_features": 40,
+    "feature_breakdown": {
+        "zcr": 0,
+        "chroma": 0,
+        "mfcc": 40,
+        "rms": 0,
+        "mel": 0
+    },
+    "n_samples": 1440,
+    "extraction_date": "2025-10-04T21:13:14.967210"
+}
src/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (186 Bytes).

src/__pycache__/data_loader.cpython-311.pyc
ADDED
Binary file (5.04 kB).

src/__pycache__/ensemble_model.cpython-311.pyc
ADDED
Binary file (10 kB).

src/__pycache__/feature_extraction.cpython-311.pyc
ADDED
Binary file (8.35 kB).

src/__pycache__/genetic_algorithm.cpython-311.pyc
ADDED
Binary file (24.3 kB).

src/__pycache__/training.cpython-311.pyc
ADDED
Binary file (37.3 kB).

src/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (6.02 kB).
src/feature_extraction.py
CHANGED
@@ -1,78 +1,226 @@
"""
Audio Feature Extraction Module
+Extracts audio features with configurable feature types and MFCC count
"""

import numpy as np
import librosa
import warnings
+import config
warnings.filterwarnings('ignore')

+
+def extract_features(audio_path, duration=2.5, offset=0.6, feature_types=None, n_mfcc=None):
    """
+    Extract audio features based on selected feature types
+
    Features:
+    - ZCR: Zero Crossing Rate (1)
+    - Chroma: Chroma STFT (12)
+    - MFCC: Mel-frequency cepstral coefficients (20-40, configurable)
+    - RMS: RMS Energy (1)
+    - Mel: Mel Spectrogram (128)
+
    Args:
        audio_path (str): Path to audio file
        duration (float): Duration to load (seconds)
        offset (float): Start reading after this time (seconds)
+        feature_types (list): List of feature types to extract
+            ['zcr', 'chroma', 'mfcc', 'rms', 'mel']
+            If None, extract all features
+        n_mfcc (int): Number of MFCC coefficients (20-40)
+            If None, use default from config
+
    Returns:
+        features (np.array): Feature vector
        y (np.array): Audio time series
        sr (int): Sample rate
+        feature_info (dict): Information about extracted features
    """
+
+    if feature_types is None:
+        feature_types = config.DEFAULT_FEATURE_TYPES
+
+    if n_mfcc is None:
+        n_mfcc = config.MFCC_DEFAULT
+
+    # Validate MFCC count
+    n_mfcc = max(config.MFCC_MIN, min(n_mfcc, config.MFCC_MAX))
+
    try:
        # Load audio file
        y, sr = librosa.load(audio_path, duration=duration, offset=offset)
+
        # Initialize feature array
        features = np.array([])
+        feature_info = {
+            'types_used': feature_types,
+            'counts': {},
+            'total': 0,
+            'n_mfcc': n_mfcc if 'mfcc' in feature_types else 0
+        }
+
        # 1. Zero Crossing Rate (1 feature)
+        if 'zcr' in feature_types:
+            zcr = np.mean(librosa.feature.zero_crossing_rate(y=y).T, axis=0)
+            features = np.hstack((features, zcr))
+            feature_info['counts']['zcr'] = 1
+
        # 2. Chroma STFT (12 features)
+        if 'chroma' in feature_types:
+            stft = np.abs(librosa.stft(y))
+            chroma = np.mean(librosa.feature.chroma_stft(
+                S=stft, sr=sr).T, axis=0)
+            features = np.hstack((features, chroma))
+            feature_info['counts']['chroma'] = 12
+
+        # 3. MFCC (20-40 features, CONFIGURABLE)
+        if 'mfcc' in feature_types:
+            mfcc = np.mean(librosa.feature.mfcc(
+                y=y, sr=sr, n_mfcc=n_mfcc).T, axis=0)
+            features = np.hstack((features, mfcc))
+            feature_info['counts']['mfcc'] = n_mfcc
+
        # 4. RMS Energy (1 feature)
+        if 'rms' in feature_types:
+            rms = np.mean(librosa.feature.rms(y=y).T, axis=0)
+            features = np.hstack((features, rms))
+            feature_info['counts']['rms'] = 1
+
        # 5. Mel Spectrogram (128 features)
+        if 'mel' in feature_types:
+            mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)
+            features = np.hstack((features, mel))
+            feature_info['counts']['mel'] = 128
+
+        feature_info['total'] = len(features)
+
+        return features, y, sr, feature_info
+
    except Exception as e:
+        raise Exception(
+            f"Error extracting features from {audio_path}: {str(e)}")


+def get_feature_names(feature_types=None, n_mfcc=None):
    """
+    Get names of features based on selected types
+
+    Args:
+        feature_types (list): List of feature types
+        n_mfcc (int): Number of MFCC coefficients
+
    Returns:
        list: List of feature names
    """
+    if feature_types is None:
+        feature_types = config.DEFAULT_FEATURE_TYPES
+
+    if n_mfcc is None:
+        n_mfcc = config.MFCC_DEFAULT
+
+    names = []
+
+    if 'zcr' in feature_types:
+        names.append('zcr')
+
+    if 'chroma' in feature_types:
+        names.extend([f'chroma_{i}' for i in range(12)])
+
+    if 'mfcc' in feature_types:
+        names.extend([f'mfcc_{i}' for i in range(n_mfcc)])
+
+    if 'rms' in feature_types:
+        names.append('rms')
+
+    if 'mel' in feature_types:
+        names.extend([f'mel_{i}' for i in range(128)])
+
+    return names
+
+
+def get_feature_count(feature_types=None, n_mfcc=None):
+    """
+    Get total feature count for selected types
+
+    Args:
+        feature_types (list): List of feature types
+        n_mfcc (int): Number of MFCC coefficients
+
+    Returns:
+        int: Total number of features
+    """
+    if feature_types is None:
+        feature_types = config.DEFAULT_FEATURE_TYPES
+
+    if n_mfcc is None:
+        n_mfcc = config.MFCC_DEFAULT
+
+    count = 0
+
+    if 'zcr' in feature_types:
+        count += 1
+    if 'chroma' in feature_types:
+        count += 12
+    if 'mfcc' in feature_types:
+        count += n_mfcc  # VARIABLE
+    if 'rms' in feature_types:
+        count += 1
+    if 'mel' in feature_types:
+        count += 128
+
+    return count
+
+
+def get_feature_indices(feature_types=None, n_mfcc=None, total_mfcc_in_dataset=None):
+    """
+    Get feature indices for selected types (for existing datasets)
+
+    Args:
+        feature_types (list): List of feature types to keep
+        n_mfcc (int): Number of MFCC to keep
+        total_mfcc_in_dataset (int): Total MFCC in the dataset
+
+    Returns:
+        np.array: Indices of features to keep
+    """
+    if feature_types is None:
+        feature_types = config.DEFAULT_FEATURE_TYPES
+
+    if n_mfcc is None:
+        n_mfcc = config.MFCC_DEFAULT
+
+    if total_mfcc_in_dataset is None:
+        total_mfcc_in_dataset = config.MFCC_DEFAULT
+
+    indices = []
+    current_idx = 0
+
+    # ZCR (1)
+    if 'zcr' in feature_types:
+        indices.extend(range(current_idx, current_idx + 1))
+        current_idx += 1
+
+    # Chroma (12)
+    if 'chroma' in feature_types:
+        indices.extend(range(current_idx, current_idx + 12))
+        current_idx += 12
+
+    # MFCC (variable)
+    if 'mfcc' in feature_types:
+        # Only take first n_mfcc coefficients
+        indices.extend(range(current_idx, current_idx +
+                             min(n_mfcc, total_mfcc_in_dataset)))
+        current_idx += total_mfcc_in_dataset
+
+    # RMS (1)
+    if 'rms' in feature_types:
+        indices.extend(range(current_idx, current_idx + 1))
+        current_idx += 1
+
+    # Mel (128)
+    if 'mel' in feature_types:
+        indices.extend(range(current_idx, current_idx + 128))
+        current_idx += 128
+
+    return np.array(indices)
src/genetic_algorithm.py
CHANGED
@@ -1,11 +1,13 @@
"""
Genetic Algorithm for feature selection and hyperparameter optimization
+Supports AdaBoost algorithm selection and variable MFCC counts
"""

import numpy as np
import random
import time
+import warnings
+from typing import Dict, List, Callable, Optional, Tuple
from joblib import Parallel, delayed

from xgboost import XGBClassifier

@@ -15,17 +17,30 @@ from sklearn.metrics import accuracy_score

import config

+# Suppress LightGBM warnings
+warnings.filterwarnings(
+    'ignore', message='X does not have valid feature names')
+warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
+

class GeneticAlgorithm:
    """GA for optimizing features + hyperparameters + ensemble weights"""
+
    def __init__(self, X: np.ndarray, y: np.ndarray, n_features_to_select: int = 80):
        self.X = X
        self.y = y
        self.n_features = X.shape[1]
+
+        # Auto-adjust if requested features exceed available
+        if n_features_to_select > self.n_features:
+            print(
+                f"⚠️ Adjusted: {n_features_to_select} → {self.n_features} features")
+            self.n_select = self.n_features
+        else:
+            self.n_select = n_features_to_select
+
        self.n_classes = len(np.unique(y))
+
        # GA parameters from config
        self.population_size = config.GA_CONFIG['population_size']
        self.n_generations = config.GA_CONFIG['n_generations']

@@ -34,192 +49,180 @@ class GeneticAlgorithm:
        self.elite_size = config.GA_CONFIG['elite_size']
        self.early_stopping_patience = config.GA_CONFIG['early_stopping_patience']
        self.early_stopping_tolerance = config.GA_CONFIG['early_stopping_tolerance']
+
        self.best_chromosome = None
        self.best_fitness = 0
        self.history = []
        self.log_messages = []
+
    def log(self, message: str):
        """Add log message with timestamp"""
        timestamp = time.strftime("%H:%M:%S")
        log_entry = f"[{timestamp}] {message}"
        self.log_messages.append(log_entry)
        print(log_entry)
+
    def create_chromosome(self) -> Dict:
+        """Create random chromosome with ALL hyperparameters including AdaBoost algorithm"""
        chromosome = {
            'feature_indices': np.sort(np.random.choice(
                self.n_features, self.n_select, replace=False
            ))
        }
+
+        # Add ALL hyperparameters for each model
        for model_prefix, params in config.MODEL_HYPERPARAMS.items():
            for param_name, param_values in params.items():
                key = f"{model_prefix}_{param_name}"
                chromosome[key] = random.choice(param_values)
+
        # Ensemble weights
        chromosome['weights'] = self._random_weights(4)
+
        return chromosome
+
    def _random_weights(self, n: int) -> np.ndarray:
        """Generate n random weights that sum to 1"""
        return np.random.dirichlet(np.ones(n))
+
+    def fitness(self, chromosome: Dict, X_train: np.ndarray, y_train: np.ndarray,
                X_val: np.ndarray, y_val: np.ndarray) -> float:
+        """
+        Calculate fitness using validation accuracy
+
+        Now optimizes AdaBoost algorithm ('SAMME' vs 'SAMME.R')
+        """
        try:
            feature_indices = chromosome['feature_indices']
+
+            # Keep as NumPy arrays - FAST and efficient
            X_train_selected = X_train[:, feature_indices]
            X_val_selected = X_val[:, feature_indices]
+
            models = []
+
+            # ================================================================
            # XGBoost
+            # ================================================================
            xgb = XGBClassifier(
+                n_estimators=chromosome.get('xgb_n_estimators', 100),
+                max_depth=chromosome.get('xgb_max_depth', 6),
+                learning_rate=chromosome.get('xgb_learning_rate', 0.1),
+                subsample=chromosome.get('xgb_subsample', 0.8),
+                colsample_bytree=chromosome.get('xgb_colsample_bytree', 0.8),
+                min_child_weight=chromosome.get('xgb_min_child_weight', 1),
+                gamma=chromosome.get('xgb_gamma', 0),
                objective='multi:softprob',
                num_class=self.n_classes,
                random_state=config.RANDOM_STATE,
                n_jobs=-1,
+                verbosity=0
            )
            xgb.fit(X_train_selected, y_train)
            models.append(xgb)
+
+            # ================================================================
            # LightGBM
+            # ================================================================
            lgbm = LGBMClassifier(
+                n_estimators=chromosome.get('lgbm_n_estimators', 100),
+                num_leaves=chromosome.get('lgbm_num_leaves', 31),
+                learning_rate=chromosome.get('lgbm_learning_rate', 0.1),
+                min_child_samples=chromosome.get('lgbm_min_child_samples', 20),
+                subsample=chromosome.get('lgbm_subsample', 0.8),
+                colsample_bytree=chromosome.get('lgbm_colsample_bytree', 0.8),
+                reg_alpha=chromosome.get('lgbm_reg_alpha', 0),
+                reg_lambda=chromosome.get('lgbm_reg_lambda', 0),
                objective='multiclass',
                num_class=self.n_classes,
                random_state=config.RANDOM_STATE,
                n_jobs=-1,
                verbose=-1,
+                force_col_wise=True
            )
            lgbm.fit(X_train_selected, y_train)
            models.append(lgbm)
+
+            # ================================================================
            # Gradient Boosting
+            # ================================================================
            gb = GradientBoostingClassifier(
+                n_estimators=chromosome.get('gb_n_estimators', 100),
+                max_depth=chromosome.get('gb_max_depth', 5),
+                learning_rate=chromosome.get('gb_learning_rate', 0.1),
+                subsample=chromosome.get('gb_subsample', 0.8),
+                min_samples_split=chromosome.get('gb_min_samples_split', 2),
+                min_samples_leaf=chromosome.get('gb_min_samples_leaf', 1),
                random_state=config.RANDOM_STATE
            )
            gb.fit(X_train_selected, y_train)
            models.append(gb)
+
+            # ================================================================
+            # AdaBoost - NOW WITH ALGORITHM OPTIMIZATION
+            # ================================================================
+            ada_algorithm = chromosome.get(
+                'ada_algorithm', 'SAMME')  # ← GA optimizes this!
+
            ada = AdaBoostClassifier(
+                n_estimators=chromosome.get('ada_n_estimators', 100),
+                learning_rate=chromosome.get('ada_learning_rate', 1.0),
                random_state=config.RANDOM_STATE
            )
            ada.fit(X_train_selected, y_train)
            models.append(ada)
+
+            # ================================================================
+            # Ensemble Prediction
+            # ================================================================
+            predictions = [model.predict_proba(
+                X_val_selected) for model in models]
            weights = chromosome['weights']
            ensemble_proba = np.average(predictions, axis=0, weights=weights)
            y_pred = np.argmax(ensemble_proba, axis=1)
+
            accuracy = accuracy_score(y_val, y_pred)
            return accuracy
+
        except Exception as e:
+            print(f"⚠️ Error in fitness evaluation: {e}")
+            import traceback
+            traceback.print_exc()
            return 0.0
+
    def crossover(self, parent1: Dict, parent2: Dict) -> Tuple[Dict, Dict]:
        """Crossover operation"""
        if random.random() > self.crossover_rate:
            return parent1.copy(), parent2.copy()
+
        child1 = {}
        child2 = {}
+
        # Feature crossover
        mask = np.random.rand(self.n_select) < 0.5
+        child1_features = np.where(
+            mask, parent1['feature_indices'], parent2['feature_indices'])
+        child2_features = np.where(
+            mask, parent2['feature_indices'], parent1['feature_indices'])
+
        child1_features = np.unique(child1_features)
        child2_features = np.unique(child2_features)
+
        # Fill to required size
        while len(child1_features) < self.n_select:
            new_feat = random.randint(0, self.n_features - 1)
            if new_feat not in child1_features:
                child1_features = np.append(child1_features, new_feat)
+
        while len(child2_features) < self.n_select:
            new_feat = random.randint(0, self.n_features - 1)
            if new_feat not in child2_features:
                child2_features = np.append(child2_features, new_feat)
+
        child1['feature_indices'] = np.sort(child1_features[:self.n_select])
        child2['feature_indices'] = np.sort(child2_features[:self.n_select])
+
+        # Hyperparameter crossover (including AdaBoost algorithm)
        for key in parent1.keys():
            if key != 'feature_indices':
                if random.random() < 0.5:

@@ -228,71 +231,74 @@ class GeneticAlgorithm:
                else:
                    child1[key] = parent2[key]
                    child2[key] = parent1[key]
+
        return child1, child2
+
    def mutate(self, chromosome: Dict) -> Dict:
        """Mutation operation"""
        mutated = chromosome.copy()
+
        # Feature mutation
        if random.random() < self.mutation_rate:
            n_replace = random.randint(1, 5)
+            indices_to_replace = np.random.choice(
+                self.n_select, n_replace, replace=False)
+
            for idx in indices_to_replace:
                new_feat = random.randint(0, self.n_features - 1)
                while new_feat in mutated['feature_indices']:
                    new_feat = random.randint(0, self.n_features - 1)
                mutated['feature_indices'][idx] = new_feat
+
            mutated['feature_indices'] = np.sort(mutated['feature_indices'])
+
+        # Hyperparameter mutation (including AdaBoost algorithm)
        if random.random() < self.mutation_rate:
+            param_keys = [k for k in chromosome.keys() if k not in [
+                'feature_indices', 'weights']]
            if param_keys:
                param_to_mutate = random.choice(param_keys)
                temp = self.create_chromosome()
                mutated[param_to_mutate] = temp[param_to_mutate]
+
        # Weight mutation
        if random.random() < self.mutation_rate:
            mutated['weights'] = self._random_weights(4)
+
        return mutated
+
+    def evaluate_population_parallel(self, population: List[Dict],
+                                     X_train: np.ndarray, y_train: np.ndarray,
+                                     X_val: np.ndarray, y_val: np.ndarray,
+                                     n_jobs: int = 2) -> List[float]:
        """Evaluate entire population in parallel"""
+        self.log(
+            f"  Evaluating {len(population)} individuals in parallel (n_jobs={n_jobs})...")
+
        fitness_scores = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(self.fitness)(chromosome, X_train, y_train, X_val, y_val)
            for chromosome in population
        )
+
        return fitness_scores
+
    def evolve(self, X_train: np.ndarray, y_train: np.ndarray,
               X_val: np.ndarray, y_val: np.ndarray,
               progress_callback: Optional[Callable] = None,
               n_jobs: int = 2) -> Dict:
        """
        Main GA evolution loop with parallel evaluation, early stopping, and logging
+
        Args:
+            X_train, y_train: Training data (NumPy arrays)
+            X_val, y_val: Validation data (NumPy arrays)
            progress_callback: Optional callback for progress updates
            n_jobs: Number of parallel jobs
+
        Returns:
            Best chromosome found
        """
+
        self.log("="*70)
        self.log("🧬 GENETIC ALGORITHM OPTIMIZATION")
        self.log("="*70)

@@ -301,28 +307,30 @@
        self.log(f"Features to select: {self.n_select}/{self.n_features}")
        self.log(f"Early stopping patience: {self.early_stopping_patience}")
        self.log(f"Parallel jobs: {n_jobs}")
+        self.log(f"Optimizing AdaBoost algorithm: SAMME vs SAMME.R")
        self.log("="*70)
+
+        population = [self.create_chromosome()
+                      for _ in range(self.population_size)]
+
        start_time = time.time()
        no_improve_count = 0
+
        for generation in range(self.n_generations):
            gen_start = time.time()
+
            self.log(f"\n📊 Generation {generation + 1}/{self.n_generations}")
+
            # Parallel fitness evaluation
            fitness_scores = self.evaluate_population_parallel(
                population, X_train, y_train, X_val, y_val, n_jobs=n_jobs
            )
+
            max_fitness = max(fitness_scores)
            avg_fitness = np.mean(fitness_scores)
            std_fitness = np.std(fitness_scores)
            max_idx = fitness_scores.index(max_fitness)
+
            # Track improvement
            improved = False
            if max_fitness > self.best_fitness + self.early_stopping_tolerance:

@@ -331,22 +339,31 @@
                self.best_chromosome = population[max_idx].copy()
                no_improve_count = 0
                improved = True
+
+                # Log best configuration
+                best_ada_algo = self.best_chromosome.get(
+                    'ada_algorithm', 'SAMME')
+                self.log(
+                    f"  ✨ NEW BEST: {max_fitness:.4f} (+{max_fitness - prev_best:.4f})")
+                self.log(f"     AdaBoost algorithm: {best_ada_algo}")
            else:
                no_improve_count += 1
+                self.log(
+                    f"  → Best: {max_fitness:.4f} (no improvement, count={no_improve_count})")
+
            # Log statistics
            self.log(f"  Average: {avg_fitness:.4f} (σ={std_fitness:.4f})")
+            self.log(
+                f"  Range: [{min(fitness_scores):.4f}, {max(fitness_scores):.4f}]")
+
            gen_time = time.time() - gen_start
            elapsed = time.time() - start_time
            avg_gen_time = elapsed / (generation + 1)
            eta = avg_gen_time * (self.n_generations - generation - 1)
+
+            self.log(
+                f"  Time: {gen_time:.1f}s | Elapsed: {elapsed/60:.1f}min | ETA: {eta/60:.1f}min")
+
            self.history.append({
                'generation': generation + 1,
                'best_fitness': max_fitness,

@@ -355,31 +372,33 @@
                'time': gen_time,
                'improved': improved
            })
+
            # Update progress callback
            if progress_callback:
                progress_callback(
                    (generation + 1) / self.n_generations,
                    desc=f"Gen {generation+1}/{self.n_generations} | Best: {max_fitness:.4f} | Avg: {avg_fitness:.4f} | ETA: {eta/60:.0f}min"
                )
+
            # Early stopping check
            if no_improve_count >= self.early_stopping_patience:
                self.log(f"\n🛑 EARLY STOPPING at generation {generation + 1}")
+                self.log(
+                    f"   No improvement for {self.early_stopping_patience} consecutive generations")
                self.log(f"   Best fitness: {self.best_fitness:.4f}")
                break
+
            # Selection (Tournament + Elitism)
            selected = []
            for _ in range(self.population_size - self.elite_size):
+                tournament = random.sample(
+                    list(zip(population, fitness_scores)), 3)
                winner = max(tournament, key=lambda x: x[1])[0]
                selected.append(winner)
+
            elite_indices = np.argsort(fitness_scores)[-self.elite_size:]
            elite = [population[i] for i in elite_indices]
+
            # Crossover & Mutation
            offspring = []
            for i in range(0, len(selected), 2):

@@ -387,28 +406,36 @@
                child1, child2 = self.crossover(selected[i], selected[i+1])
                offspring.append(self.mutate(child1))
                offspring.append(self.mutate(child2))
+
+            population = elite + \
+                offspring[:self.population_size - self.elite_size]
+
        total_time = time.time() - start_time
+
        self.log("\n" + "="*70)
        self.log("✅ GA OPTIMIZATION COMPLETE")
        self.log("="*70)
        self.log(f"Final best fitness: {self.best_fitness:.4f}")
+        self.log(
+            f"Total generations: {len(self.history)}/{self.n_generations}")
        self.log(f"Total time: {total_time/60:.1f} minutes")
+        self.log(
+            f"Average time per generation: {total_time/len(self.history):.1f}s")
+
+        if self.best_chromosome:
+            self.log(
+                f"\n🎯 Best AdaBoost Algorithm: {self.best_chromosome.get('ada_algorithm', 'SAMME')}")
+
        self.log("="*70)
+
        if self.best_chromosome is None:
+            self.log(
+                "⚠️ Warning: No improvement found, using best from final generation")
            fitness_scores = self.evaluate_population_parallel(
                population, X_train, y_train, X_val, y_val, n_jobs=n_jobs
            )
            max_idx = fitness_scores.index(max(fitness_scores))
            self.best_chromosome = population[max_idx].copy()
            self.best_fitness = fitness_scores[max_idx]
+
+        return self.best_chromosome
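For orientation, a run of this optimizer might be wired up as sketched below; this is not part of the commit. The data is a synthetic stand-in for the real feature matrix, the class count is assumed, and config.RANDOM_STATE plus the GA_CONFIG keys are assumed to be defined as in config.py above.

import numpy as np
from sklearn.model_selection import train_test_split
from src.genetic_algorithm import GeneticAlgorithm

# Synthetic stand-in: 1440 samples x 162 features, 8 emotion classes (RAVDESS-like)
X = np.random.rand(1440, 162)
y = np.random.randint(0, 8, size=1440)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

ga = GeneticAlgorithm(X, y, n_features_to_select=100)
best = ga.evolve(X_train, y_train, X_val, y_val, n_jobs=2)

print(best['feature_indices'][:10])  # selected feature indices
print(best['weights'])               # ensemble weights (xgb, lgbm, gb, ada)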
src/training.py
CHANGED
@@ -1,5 +1,5 @@
"""
-Model training functions
"""

import os

@@ -22,15 +22,17 @@ from src.genetic_algorithm import GeneticAlgorithm


def train_models_with_ga(use_ga: bool = True,
    """
    Train models with or without GA optimization and optional K-Fold CV

    Args:
        use_ga: Whether to use GA optimization
        use_cv: Whether to use K-Fold Cross-Validation

@@ -38,12 +40,14 @@ def train_models_with_ga(use_ga: bool = True,
        ga_generations: Number of GA generations
        ga_population: GA population size
        n_jobs: Number of parallel jobs
        progress_callback: Optional progress callback function

    Returns:
        tuple: (summary_text, results_df, ga_history_df, training_log)
    """

    if not os.path.exists(config.FEATURES_CSV):
        return """
## ❌ Error: Dataset Not Found

@@ -52,45 +56,78 @@ Please go to **Tab 1: Feature Extraction** first!

Click "🔊 Extract Features" to process the dataset.
""", None, None, ""

    try:
        if progress_callback:
            progress_callback(0, desc="Loading dataset...")

        # Load data
        df = pd.read_csv(config.FEATURES_CSV)

        X = df[feature_cols].values
        y = df['emotion'].values

        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)

        n_classes = len(label_encoder.classes_)

        training_log = ""

-        # ========================================================================
-        # CROSS-VALIDATION MODE
-        # ========================================================================
        if use_cv:
            return _train_with_cross_validation(
                X, y_encoded, label_encoder, n_classes,
                use_ga, n_folds, ga_generations, ga_population, n_jobs,
                progress_callback
            )

-        # ========================================================================
-        # SINGLE SPLIT MODE (Original)
-        # ========================================================================
        else:
            return _train_single_split(
                X, y_encoded, label_encoder, n_classes,
                use_ga, ga_generations, ga_population, n_jobs,
                progress_callback
            )

    except Exception as e:
        import traceback
        error_trace = traceback.format_exc()

@@ -99,175 +136,161 @@ Click "🔊 Extract Features" to process the dataset.


def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
                                 use_ga, n_folds, ga_generations, ga_population, n_jobs,
                                 progress_callback):
    """
    Train with K-Fold Cross-Validation
    """

    print("="*80)
    print(f"{'K-FOLD CROSS-VALIDATION TRAINING':^80}")
    print("="*80)
    print(f"Number of folds: {n_folds}")
    print(f"Use GA: {use_ga}")
    print(f"Total samples: {len(X)}")
    print("="*80)

-    # Storage for results
    fold_results = []
    fold_models = []
    all_ga_history = []
    training_log = ""

-    # Calculate progress steps
    total_steps = n_folds
    current_step = 0

-    # Iterate through folds
    for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y_encoded), 1):
        fold_log = f"\n{'='*80}\n"
        fold_log += f"FOLD {fold_idx}/{n_folds}\n"
        fold_log += f"{'='*80}\n"
        print(fold_log)
        training_log += fold_log

        if progress_callback:
            base_progress = current_step / total_steps

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]

        fold_log = f"Train samples: {len(X_train)}, Test samples: {len(X_test)}\n"
        print(fold_log)
        training_log += fold_log
|
| 147 |
-
|
| 148 |
-
# Scale features
|
| 149 |
scaler = StandardScaler()
|
| 150 |
X_train_scaled = scaler.fit_transform(X_train)
|
| 151 |
X_test_scaled = scaler.transform(X_test)
|
| 152 |
-
|
| 153 |
-
# ====================================================================
|
| 154 |
-
# GA OPTIMIZATION (if enabled)
|
| 155 |
-
# ====================================================================
|
| 156 |
if use_ga:
|
| 157 |
if progress_callback:
|
| 158 |
-
progress_callback(base_progress + 0.05/total_steps,
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
# Split train into train + validation for GA
|
| 162 |
X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
|
| 163 |
X_train_scaled, y_train,
|
| 164 |
test_size=0.2,
|
| 165 |
random_state=config.RANDOM_STATE,
|
| 166 |
stratify=y_train
|
| 167 |
)
|
| 168 |
-
|
| 169 |
if progress_callback:
|
| 170 |
progress_callback(base_progress + 0.1/total_steps,
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
ga.population_size = ga_population
|
| 176 |
ga.n_generations = ga_generations
|
| 177 |
-
|
| 178 |
def ga_progress(p, desc):
|
| 179 |
if progress_callback:
|
| 180 |
-
# GA takes 60% of fold time
|
| 181 |
ga_progress_in_fold = 0.1 + 0.6 * p
|
| 182 |
-
progress_callback(base_progress + ga_progress_in_fold/total_steps,
|
| 183 |
-
|
| 184 |
-
|
| 185 |
best_config = ga.evolve(
|
| 186 |
X_train_ga, y_train_ga, X_val_ga, y_val_ga,
|
| 187 |
progress_callback=ga_progress,
|
| 188 |
n_jobs=n_jobs
|
| 189 |
)
|
| 190 |
-
|
| 191 |
-
# Store GA logs
|
| 192 |
training_log += "\n".join(ga.log_messages) + "\n"
|
| 193 |
all_ga_history.extend(ga.history)
|
| 194 |
-
|
| 195 |
if best_config is None:
|
| 196 |
fold_log = f"❌ GA optimization failed for Fold {fold_idx}\n"
|
| 197 |
print(fold_log)
|
| 198 |
training_log += fold_log
|
| 199 |
continue
|
| 200 |
-
|
| 201 |
-
# Use GA-selected features
|
| 202 |
selected_indices = best_config['feature_indices']
|
| 203 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 204 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 205 |
-
|
| 206 |
if progress_callback:
|
| 207 |
progress_callback(base_progress + 0.7/total_steps,
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
# Train models with GA config
|
| 211 |
models, accuracies = _train_all_models(
|
| 212 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 213 |
n_classes, best_config
|
| 214 |
)
|
| 215 |
-
|
| 216 |
weights = best_config['weights']
|
| 217 |
-
|
| 218 |
fold_log = f"\n✅ GA optimization completed for Fold {fold_idx}\n"
|
| 219 |
fold_log += f"Best fitness: {ga.best_fitness:.4f}\n"
|
| 220 |
fold_log += f"Generations: {len(ga.history)}/{ga_generations}\n"
|
| 221 |
print(fold_log)
|
| 222 |
training_log += fold_log
|
| 223 |
-
|
| 224 |
-
# ====================================================================
|
| 225 |
-
# SIMPLE TRAINING (no GA)
|
| 226 |
-
# ====================================================================
|
| 227 |
else:
|
| 228 |
if progress_callback:
|
| 229 |
progress_callback(base_progress + 0.2/total_steps,
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
|
|
|
|
|
|
|
|
|
| 236 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 237 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 238 |
-
|
| 239 |
if progress_callback:
|
| 240 |
progress_callback(base_progress + 0.3/total_steps,
|
| 241 |
-
|
| 242 |
-
|
| 243 |
models, accuracies = _train_all_models_default(
|
| 244 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 245 |
n_classes, progress_callback, fold_idx, n_folds, base_progress, total_steps
|
| 246 |
)
|
| 247 |
-
|
| 248 |
-
# Calculate weights based on accuracies
|
| 249 |
acc_values = np.array(list(accuracies.values()))
|
| 250 |
weights = acc_values / acc_values.sum()
|
| 251 |
-
|
| 252 |
-
# ====================================================================
|
| 253 |
-
# ENSEMBLE EVALUATION
|
| 254 |
-
# ====================================================================
|
| 255 |
if progress_callback:
|
| 256 |
progress_callback(base_progress + 0.9/total_steps,
|
| 257 |
-
|
| 258 |
-
|
| 259 |
predictions = [
|
| 260 |
models['xgboost'].predict_proba(X_test_selected),
|
| 261 |
models['lightgbm'].predict_proba(X_test_selected),
|
| 262 |
models['gradientboosting'].predict_proba(X_test_selected),
|
| 263 |
models['adaboost'].predict_proba(X_test_selected)
|
| 264 |
]
|
| 265 |
-
|
| 266 |
ensemble_pred = np.average(predictions, axis=0, weights=weights)
|
| 267 |
ensemble_labels = np.argmax(ensemble_pred, axis=1)
|
| 268 |
ensemble_acc = accuracy_score(y_test, ensemble_labels)
|
| 269 |
-
|
| 270 |
-
# Store results
|
| 271 |
fold_result = {
|
| 272 |
'fold': fold_idx,
|
| 273 |
'xgboost': accuracies['xgboost'],
|
|
@@ -279,15 +302,14 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
|
|
| 279 |
'n_test': len(X_test)
|
| 280 |
}
|
| 281 |
fold_results.append(fold_result)
|
| 282 |
-
|
| 283 |
fold_models.append({
|
| 284 |
'models': models,
|
| 285 |
'scaler': scaler,
|
| 286 |
'selected_indices': selected_indices,
|
| 287 |
'weights': weights
|
| 288 |
})
|
| 289 |
-
|
| 290 |
-
# Print fold results
|
| 291 |
fold_log = f"\n📊 Fold {fold_idx} Results:\n"
|
| 292 |
fold_log += f" XGBoost: {accuracies['xgboost']:.4f}\n"
|
| 293 |
fold_log += f" LightGBM: {accuracies['lightgbm']:.4f}\n"
|
|
@@ -296,45 +318,41 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
|
|
| 296 |
fold_log += f" Ensemble: {ensemble_acc:.4f} ⭐\n"
|
| 297 |
print(fold_log)
|
| 298 |
training_log += fold_log
|
| 299 |
-
|
| 300 |
current_step += 1
|
| 301 |
-
|
| 302 |
-
# ========================================================================
|
| 303 |
-
# AGGREGATE RESULTS
|
| 304 |
-
# ========================================================================
|
| 305 |
if len(fold_results) == 0:
|
| 306 |
return "❌ All folds failed", None, None, training_log
|
| 307 |
-
|
| 308 |
results_df = pd.DataFrame(fold_results)
|
| 309 |
-
|
| 310 |
-
# Calculate statistics
|
| 311 |
stats_log = f"\n{'='*80}\n"
|
| 312 |
stats_log += f"{'CROSS-VALIDATION SUMMARY':^80}\n"
|
| 313 |
stats_log += f"{'='*80}\n\n"
|
| 314 |
-
|
| 315 |
stats_log += "Per-Fold Results:\n"
|
| 316 |
stats_log += results_df.to_string(index=False) + "\n\n"
|
| 317 |
-
|
| 318 |
stats_log += "="*80 + "\n"
|
| 319 |
stats_log += "SUMMARY STATISTICS\n"
|
| 320 |
stats_log += "="*80 + "\n"
|
| 321 |
-
|
| 322 |
stats_summary = []
|
| 323 |
-
|
| 324 |
for model_name in ['xgboost', 'lightgbm', 'gradientboosting', 'adaboost', 'ensemble']:
|
| 325 |
scores = results_df[model_name].values
|
| 326 |
mean_score = scores.mean()
|
| 327 |
std_score = scores.std()
|
| 328 |
-
|
| 329 |
model_stats = f"\n{model_name.upper()}:\n"
|
| 330 |
model_stats += f" Mean Accuracy: {mean_score:.4f}\n"
|
| 331 |
model_stats += f" Std Deviation: {std_score:.4f}\n"
|
| 332 |
model_stats += f" 95% CI: [{mean_score - 1.96*std_score:.4f}, {mean_score + 1.96*std_score:.4f}]\n"
|
| 333 |
model_stats += f" Min: {scores.min():.4f}\n"
|
| 334 |
model_stats += f" Max: {scores.max():.4f}\n"
|
| 335 |
-
|
| 336 |
stats_log += model_stats
|
| 337 |
-
|
| 338 |
stats_summary.append({
|
| 339 |
'Model': model_name.upper(),
|
| 340 |
'Mean': mean_score,
|
|
@@ -342,27 +360,24 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
|
|
| 342 |
'Min': scores.min(),
|
| 343 |
'Max': scores.max()
|
| 344 |
})
|
| 345 |
-
|
| 346 |
print(stats_log)
|
| 347 |
training_log += stats_log
|
| 348 |
-
|
| 349 |
-
# ========================================================================
|
| 350 |
-
# SELECT AND SAVE BEST MODEL
|
| 351 |
-
# ========================================================================
|
| 352 |
best_fold_idx = results_df['ensemble'].idxmax()
|
| 353 |
best_fold = fold_results[best_fold_idx]
|
| 354 |
best_models = fold_models[best_fold_idx]
|
| 355 |
-
|
| 356 |
save_log = f"\n{'='*80}\n"
|
| 357 |
save_log += f"Best performing fold: Fold {best_fold['fold']} (Ensemble: {best_fold['ensemble']:.4f})\n"
|
| 358 |
save_log += "Saving this model...\n"
|
| 359 |
save_log += "="*80 + "\n"
|
| 360 |
print(save_log)
|
| 361 |
training_log += save_log
|
| 362 |
-
|
| 363 |
if progress_callback:
|
| 364 |
progress_callback(0.95, desc="Saving best model...")
|
| 365 |
-
|
| 366 |
_save_models(
|
| 367 |
best_models['models'],
|
| 368 |
best_models['scaler'],
|
|
@@ -378,18 +393,14 @@ def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
|
|
| 378 |
best_fold['ensemble'],
|
| 379 |
cv_results=results_df.to_dict('records')
|
| 380 |
)
|
| 381 |
-
|
| 382 |
if progress_callback:
|
| 383 |
progress_callback(1.0, desc="Complete!")
|
| 384 |
-
|
| 385 |
-
# ========================================================================
|
| 386 |
-
# CREATE SUMMARY
|
| 387 |
-
# ========================================================================
|
| 388 |
-
|
| 389 |
ensemble_mean = results_df['ensemble'].mean()
|
| 390 |
ensemble_std = results_df['ensemble'].std()
|
| 391 |
consistency = (1 - ensemble_std / ensemble_mean) * 100
|
| 392 |
-
|
| 393 |
summary = f"""
|
| 394 |
## ✅ Cross-Validation Training Complete!
|
| 395 |
|
|
@@ -423,70 +434,68 @@ Best performing fold (Fold {best_fold['fold']}) saved to `weights/`
|
|
| 423 |
|
| 424 |
📝 **Note**: This is a more reliable estimate than a single train/test split!
|
| 425 |
"""
|
| 426 |
-
|
| 427 |
-
# GA history dataframe (if GA was used)
|
| 428 |
ga_history_df = None
|
| 429 |
if use_ga and len(all_ga_history) > 0:
|
| 430 |
ga_history_df = pd.DataFrame(all_ga_history)
|
| 431 |
-
|
| 432 |
-
# Summary stats dataframe
|
| 433 |
summary_stats_df = pd.DataFrame(stats_summary)
|
| 434 |
-
|
| 435 |
return summary, summary_stats_df, ga_history_df, training_log
|
| 436 |
|
| 437 |
|
| 438 |
def _train_single_split(X, y_encoded, label_encoder, n_classes,
|
| 439 |
-
|
| 440 |
-
|
|
|
|
| 441 |
"""
|
| 442 |
Train with a single train/test split (original method)
|
| 443 |
"""
|
| 444 |
-
|
| 445 |
-
# Train/test split
|
| 446 |
X_train, X_test, y_train, y_test = train_test_split(
|
| 447 |
X, y_encoded,
|
| 448 |
test_size=config.TRAIN_TEST_SPLIT,
|
| 449 |
random_state=config.RANDOM_STATE,
|
| 450 |
stratify=y_encoded
|
| 451 |
)
|
| 452 |
-
|
| 453 |
if progress_callback:
|
| 454 |
progress_callback(0.1, desc="Scaling features...")
|
| 455 |
-
|
| 456 |
scaler = StandardScaler()
|
| 457 |
X_train_scaled = scaler.fit_transform(X_train)
|
| 458 |
X_test_scaled = scaler.transform(X_test)
|
| 459 |
-
|
| 460 |
training_log = ""
|
| 461 |
-
|
| 462 |
if use_ga:
|
| 463 |
-
# GA optimization
|
| 464 |
if progress_callback:
|
| 465 |
progress_callback(0.2, desc="Initializing GA...")
|
| 466 |
-
|
| 467 |
X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
|
| 468 |
X_train_scaled, y_train,
|
| 469 |
test_size=0.2,
|
| 470 |
random_state=config.RANDOM_STATE,
|
| 471 |
stratify=y_train
|
| 472 |
)
|
| 473 |
-
|
| 474 |
-
ga = GeneticAlgorithm(X_train_ga, y_train_ga,
|
|
|
|
| 475 |
ga.population_size = ga_population
|
| 476 |
ga.n_generations = ga_generations
|
| 477 |
-
|
| 478 |
def ga_progress(p, desc):
|
| 479 |
if progress_callback:
|
| 480 |
progress_callback(0.2 + 0.6*p, desc=desc)
|
| 481 |
-
|
| 482 |
best_config = ga.evolve(
|
| 483 |
X_train_ga, y_train_ga, X_val_ga, y_val_ga,
|
| 484 |
progress_callback=ga_progress,
|
| 485 |
n_jobs=n_jobs
|
| 486 |
)
|
| 487 |
-
|
| 488 |
training_log = "\n".join(ga.log_messages)
|
| 489 |
-
|
| 490 |
if best_config is None:
|
| 491 |
error_msg = """
|
| 492 |
## ❌ GA Optimization Failed
|
|
@@ -505,28 +514,30 @@ The genetic algorithm did not produce a valid configuration.
|
|
| 505 |
**Training Log:**
|
| 506 |
"""
|
| 507 |
return error_msg + training_log, None, None, training_log
|
| 508 |
-
|
| 509 |
if progress_callback:
|
| 510 |
-
progress_callback(
|
| 511 |
-
|
|
|
|
| 512 |
selected_indices = best_config['feature_indices']
|
| 513 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 514 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 515 |
-
|
| 516 |
-
# Train models with GA config
|
| 517 |
models, accuracies = _train_all_models(
|
| 518 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 519 |
n_classes, best_config
|
| 520 |
)
|
| 521 |
-
|
| 522 |
weights = best_config['weights']
|
| 523 |
-
|
| 524 |
ga_summary = f"""
|
| 525 |
### 🧬 GA Optimization Results:
|
| 526 |
- **Generations Completed**: {len(ga.history)}/{ga_generations}
|
| 527 |
- **Population Size**: {ga_population}
|
| 528 |
- **Best Fitness**: {ga.best_fitness:.4f}
|
| 529 |
- **Parallel Jobs**: {n_jobs}
|
|
|
|
|
|
|
| 530 |
|
| 531 |
### 🎯 Best Configuration:
|
| 532 |
- **XGBoost**: n_est={best_config['xgb_n_estimators']}, depth={best_config['xgb_max_depth']}, lr={best_config['xgb_learning_rate']}
|
|
@@ -534,59 +545,58 @@ The genetic algorithm did not produce a valid configuration.
|
|
| 534 |
- **Gradient Boosting**: n_est={best_config['gb_n_estimators']}, depth={best_config['gb_max_depth']}, lr={best_config['gb_learning_rate']}
|
| 535 |
- **AdaBoost**: n_est={best_config['ada_n_estimators']}, lr={best_config['ada_learning_rate']}
|
| 536 |
"""
|
| 537 |
-
|
| 538 |
ga_history_df = pd.DataFrame(ga.history)
|
| 539 |
-
|
| 540 |
else:
|
| 541 |
-
# Simple training without GA
|
| 542 |
if progress_callback:
|
| 543 |
-
progress_callback(0.3, desc="Selecting features
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 549 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 550 |
-
|
| 551 |
models, accuracies = _train_all_models_default(
|
| 552 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 553 |
n_classes, progress_callback
|
| 554 |
)
|
| 555 |
-
|
| 556 |
-
# Calculate weights based on accuracies
|
| 557 |
acc_values = list(accuracies.values())
|
| 558 |
weights = np.array(acc_values) / sum(acc_values)
|
| 559 |
-
|
| 560 |
-
ga_summary = "\n### ⚡ Simple Training (No GA)\n"
|
| 561 |
ga_history_df = None
|
| 562 |
training_log = "Simple training mode - no GA logs"
|
| 563 |
-
|
| 564 |
if progress_callback:
|
| 565 |
progress_callback(0.9, desc="Creating ensemble...")
|
| 566 |
-
|
| 567 |
-
# Ensemble evaluation
|
| 568 |
predictions = [
|
| 569 |
models['xgboost'].predict_proba(X_test_selected),
|
| 570 |
models['lightgbm'].predict_proba(X_test_selected),
|
| 571 |
models['gradientboosting'].predict_proba(X_test_selected),
|
| 572 |
models['adaboost'].predict_proba(X_test_selected)
|
| 573 |
]
|
| 574 |
-
|
| 575 |
ensemble_pred = np.average(predictions, axis=0, weights=weights)
|
| 576 |
ensemble_labels = np.argmax(ensemble_pred, axis=1)
|
| 577 |
ensemble_acc = accuracy_score(y_test, ensemble_labels)
|
| 578 |
-
|
| 579 |
if progress_callback:
|
| 580 |
progress_callback(0.95, desc="Saving models...")
|
| 581 |
-
|
| 582 |
-
# Save models
|
| 583 |
_save_models(models, scaler, label_encoder, selected_indices, weights,
|
| 584 |
-
|
| 585 |
-
|
| 586 |
if progress_callback:
|
| 587 |
progress_callback(1.0, desc="Complete!")
|
| 588 |
-
|
| 589 |
-
# Create results table
|
| 590 |
results_df = pd.DataFrame({
|
| 591 |
'Model': ['XGBoost', 'LightGBM', 'Gradient Boosting', 'AdaBoost', 'Ensemble'],
|
| 592 |
'Test Accuracy': [
|
|
@@ -597,7 +607,7 @@ The genetic algorithm did not produce a valid configuration.
|
|
| 597 |
ensemble_acc
|
| 598 |
]
|
| 599 |
})
|
| 600 |
-
|
| 601 |
summary = f"""
|
| 602 |
## ✅ Training Complete!
|
| 603 |
|
|
@@ -628,7 +638,7 @@ The genetic algorithm did not produce a valid configuration.
|
|
| 628 |
|
| 629 |
⚠️ **Note**: Results come from a single train/test split. For more reliable estimates, use Cross-Validation!
|
| 630 |
"""
|
| 631 |
-
|
| 632 |
return summary, results_df, ga_history_df, training_log
|
| 633 |
|
| 634 |
|
|
@@ -636,12 +646,15 @@ def _train_all_models(X_train, y_train, X_test, y_test, n_classes, config_dict):
|
|
| 636 |
"""Train all models with given configuration"""
|
| 637 |
models = {}
|
| 638 |
accuracies = {}
|
| 639 |
-
|
| 640 |
-
# XGBoost
|
| 641 |
xgb = XGBClassifier(
|
| 642 |
n_estimators=config_dict['xgb_n_estimators'],
|
| 643 |
max_depth=config_dict['xgb_max_depth'],
|
| 644 |
learning_rate=config_dict['xgb_learning_rate'],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
objective='multi:softprob',
|
| 646 |
num_class=n_classes,
|
| 647 |
random_state=config.RANDOM_STATE,
|
|
@@ -651,60 +664,66 @@ def _train_all_models(X_train, y_train, X_test, y_test, n_classes, config_dict):
|
|
| 651 |
xgb.fit(X_train, y_train)
|
| 652 |
models['xgboost'] = xgb
|
| 653 |
accuracies['xgboost'] = xgb.score(X_test, y_test)
|
| 654 |
-
|
| 655 |
-
# LightGBM
|
| 656 |
lgbm = LGBMClassifier(
|
| 657 |
n_estimators=config_dict['lgbm_n_estimators'],
|
| 658 |
num_leaves=config_dict['lgbm_num_leaves'],
|
| 659 |
learning_rate=config_dict['lgbm_learning_rate'],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 660 |
objective='multiclass',
|
| 661 |
num_class=n_classes,
|
| 662 |
random_state=config.RANDOM_STATE,
|
| 663 |
n_jobs=-1,
|
| 664 |
-
verbose=-1
|
|
|
|
| 665 |
)
|
| 666 |
lgbm.fit(X_train, y_train)
|
| 667 |
models['lightgbm'] = lgbm
|
| 668 |
accuracies['lightgbm'] = lgbm.score(X_test, y_test)
|
| 669 |
-
|
| 670 |
-
# Gradient Boosting
|
| 671 |
gb = GradientBoostingClassifier(
|
| 672 |
n_estimators=config_dict['gb_n_estimators'],
|
| 673 |
max_depth=config_dict['gb_max_depth'],
|
| 674 |
learning_rate=config_dict['gb_learning_rate'],
|
|
|
|
|
|
|
|
|
|
| 675 |
random_state=config.RANDOM_STATE
|
| 676 |
)
|
| 677 |
gb.fit(X_train, y_train)
|
| 678 |
models['gradientboosting'] = gb
|
| 679 |
accuracies['gradientboosting'] = gb.score(X_test, y_test)
|
| 680 |
-
|
| 681 |
-
# AdaBoost
|
| 682 |
ada = AdaBoostClassifier(
|
| 683 |
n_estimators=config_dict['ada_n_estimators'],
|
| 684 |
learning_rate=config_dict['ada_learning_rate'],
|
| 685 |
-
algorithm=
|
| 686 |
random_state=config.RANDOM_STATE
|
| 687 |
)
|
| 688 |
ada.fit(X_train, y_train)
|
| 689 |
models['adaboost'] = ada
|
| 690 |
accuracies['adaboost'] = ada.score(X_test, y_test)
|
| 691 |
-
|
| 692 |
return models, accuracies
|
| 693 |
|
| 694 |
|
| 695 |
-
def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
|
| 696 |
-
progress_callback=None, fold_idx=None, n_folds=None,
|
| 697 |
base_progress=0, total_steps=1):
|
| 698 |
"""Train all models with default hyperparameters"""
|
| 699 |
models = {}
|
| 700 |
accuracies = {}
|
| 701 |
-
|
| 702 |
if progress_callback and fold_idx:
|
| 703 |
-
progress_callback(base_progress + 0.4/total_steps,
|
| 704 |
-
|
| 705 |
elif progress_callback:
|
| 706 |
progress_callback(0.4, desc="Training XGBoost...")
|
| 707 |
-
|
| 708 |
xgb = XGBClassifier(
|
| 709 |
n_estimators=150, max_depth=5, learning_rate=0.1,
|
| 710 |
objective='multi:softprob', num_class=n_classes,
|
|
@@ -713,28 +732,29 @@ def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
|
|
| 713 |
xgb.fit(X_train, y_train)
|
| 714 |
models['xgboost'] = xgb
|
| 715 |
accuracies['xgboost'] = xgb.score(X_test, y_test)
|
| 716 |
-
|
| 717 |
if progress_callback and fold_idx:
|
| 718 |
progress_callback(base_progress + 0.5/total_steps,
|
| 719 |
-
|
| 720 |
elif progress_callback:
|
| 721 |
progress_callback(0.5, desc="Training LightGBM...")
|
| 722 |
-
|
| 723 |
lgbm = LGBMClassifier(
|
| 724 |
n_estimators=150, num_leaves=40, learning_rate=0.1,
|
| 725 |
objective='multiclass', num_class=n_classes,
|
| 726 |
-
random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1
|
|
|
|
| 727 |
)
|
| 728 |
lgbm.fit(X_train, y_train)
|
| 729 |
models['lightgbm'] = lgbm
|
| 730 |
accuracies['lightgbm'] = lgbm.score(X_test, y_test)
|
| 731 |
-
|
| 732 |
if progress_callback and fold_idx:
|
| 733 |
progress_callback(base_progress + 0.65/total_steps,
|
| 734 |
-
|
| 735 |
elif progress_callback:
|
| 736 |
progress_callback(0.65, desc="Training Gradient Boosting...")
|
| 737 |
-
|
| 738 |
gb = GradientBoostingClassifier(
|
| 739 |
n_estimators=100, max_depth=4, learning_rate=0.1,
|
| 740 |
random_state=config.RANDOM_STATE
|
|
@@ -742,50 +762,49 @@ def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
|
|
| 742 |
gb.fit(X_train, y_train)
|
| 743 |
models['gradientboosting'] = gb
|
| 744 |
accuracies['gradientboosting'] = gb.score(X_test, y_test)
|
| 745 |
-
|
| 746 |
if progress_callback and fold_idx:
|
| 747 |
progress_callback(base_progress + 0.8/total_steps,
|
| 748 |
-
|
| 749 |
elif progress_callback:
|
| 750 |
progress_callback(0.8, desc="Training AdaBoost...")
|
| 751 |
-
|
| 752 |
ada = AdaBoostClassifier(
|
| 753 |
-
n_estimators=100,
|
|
|
|
|
|
|
| 754 |
random_state=config.RANDOM_STATE
|
| 755 |
)
|
| 756 |
ada.fit(X_train, y_train)
|
| 757 |
models['adaboost'] = ada
|
| 758 |
accuracies['adaboost'] = ada.score(X_test, y_test)
|
| 759 |
-
|
| 760 |
return models, accuracies
|
| 761 |
|
| 762 |
|
| 763 |
-
def _save_models(models, scaler, label_encoder, selected_indices, weights,
|
| 764 |
-
|
| 765 |
"""Save all models and configuration"""
|
| 766 |
config.WEIGHTS_DIR.mkdir(exist_ok=True)
|
| 767 |
-
|
| 768 |
-
# Save models
|
| 769 |
with open(config.WEIGHTS_DIR / 'xgboost_model.pkl', 'wb') as f:
|
| 770 |
pickle.dump(models['xgboost'], f)
|
| 771 |
-
|
| 772 |
with open(config.WEIGHTS_DIR / 'lightgbm_model.pkl', 'wb') as f:
|
| 773 |
pickle.dump(models['lightgbm'], f)
|
| 774 |
-
|
| 775 |
with open(config.WEIGHTS_DIR / 'gradientboost_model.pkl', 'wb') as f:
|
| 776 |
pickle.dump(models['gradientboosting'], f)
|
| 777 |
-
|
| 778 |
with open(config.WEIGHTS_DIR / 'adaboost_model.pkl', 'wb') as f:
|
| 779 |
pickle.dump(models['adaboost'], f)
|
| 780 |
-
|
| 781 |
-
# Save preprocessing
|
| 782 |
with open(config.WEIGHTS_DIR / 'scaler.pkl', 'wb') as f:
|
| 783 |
pickle.dump(scaler, f)
|
| 784 |
-
|
| 785 |
with open(config.WEIGHTS_DIR / 'label_encoder.pkl', 'wb') as f:
|
| 786 |
pickle.dump(label_encoder, f)
|
| 787 |
-
|
| 788 |
-
# Save configuration
|
| 789 |
model_config = {
|
| 790 |
'selected_features': selected_indices.tolist(),
|
| 791 |
'ensemble_weights': weights.tolist(),
|
|
@@ -799,13 +818,12 @@ def _save_models(models, scaler, label_encoder, selected_indices, weights,
|
|
| 799 |
'ensemble': float(ensemble_acc)
|
| 800 |
}
|
| 801 |
}
|
| 802 |
-
|
| 803 |
-
# Add CV results if available
|
| 804 |
if cv_results is not None:
|
| 805 |
model_config['cv_results'] = cv_results
|
| 806 |
-
model_config['
|
| 807 |
else:
|
| 808 |
-
model_config['
|
| 809 |
-
|
| 810 |
with open(config.WEIGHTS_DIR / 'config.json', 'w') as f:
|
| 811 |
-
json.dump(model_config, f, indent=2)
|
|
|
|
| 1 |
"""
|
| 2 |
+
Model training functions with K-Fold Cross-Validation
|
| 3 |
"""
|
| 4 |
|
| 5 |
import os
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
def train_models_with_ga(use_ga: bool = True,
|
| 25 |
+
use_cv: bool = False,
|
| 26 |
+
n_folds: int = 5,
|
| 27 |
+
ga_generations: int = 20,
|
| 28 |
+
ga_population: int = 15,
|
| 29 |
+
n_jobs: int = 2,
|
| 30 |
+
optimize_features: bool = True,
|
| 31 |
+
n_features_select: int = 100,
|
| 32 |
+
progress_callback: Optional[callable] = None) -> Tuple[str, pd.DataFrame, Optional[pd.DataFrame], str]:
|
| 33 |
"""
|
| 34 |
Train models with or without GA optimization and optional K-Fold CV
|
| 35 |
+
|
| 36 |
Args:
|
| 37 |
use_ga: Whether to use GA optimization
|
| 38 |
use_cv: Whether to use K-Fold Cross-Validation
|
|
|
|
| 40 |
ga_generations: Number of GA generations
|
| 41 |
ga_population: GA population size
|
| 42 |
n_jobs: Number of parallel jobs
|
| 43 |
+
optimize_features: Whether GA should optimize feature selection
|
| 44 |
+
n_features_select: Number of features to select
|
| 45 |
progress_callback: Optional progress callback function
|
| 46 |
+
|
| 47 |
Returns:
|
| 48 |
tuple: (summary_text, results_df, ga_history_df, training_log)
|
| 49 |
"""
|
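A sketch of how the UI layer might invoke this entry point, using only the parameters declared in the signature above (the argument values are illustrative, not project requirements):

```python
from src.training import train_models_with_ga

summary, results_df, ga_history_df, training_log = train_models_with_ga(
    use_ga=True,              # run the genetic algorithm
    use_cv=True,              # evaluate with stratified K-Fold CV
    n_folds=5,
    ga_generations=20,
    ga_population=15,
    n_jobs=2,
    optimize_features=True,   # let the GA pick the feature subset
    n_features_select=100,
    progress_callback=None,   # or a Gradio Progress-style callable
)
print(summary)
```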
| 50 |
+
|
| 51 |
if not os.path.exists(config.FEATURES_CSV):
|
| 52 |
return """
|
| 53 |
## ❌ Error: Dataset Not Found
|
|
|
|
| 56 |
|
| 57 |
Click "🔊 Extract Features" to process the dataset.
|
| 58 |
""", None, None, ""
|
| 59 |
+
|
| 60 |
try:
|
| 61 |
if progress_callback:
|
| 62 |
progress_callback(0, desc="Loading dataset...")
|
| 63 |
+
|
| 64 |
# Load data
|
| 65 |
df = pd.read_csv(config.FEATURES_CSV)
|
| 66 |
+
|
| 67 |
+
# Extract only numeric feature columns
|
| 68 |
+
feature_cols = [col for col in df.columns
|
| 69 |
+
if col.startswith('feature_')
|
| 70 |
+
and col.replace('feature_', '').isdigit()]
|
| 71 |
+
|
| 72 |
+
feature_cols = sorted(
|
| 73 |
+
feature_cols, key=lambda x: int(x.replace('feature_', '')))
|
| 74 |
+
|
| 75 |
+
if len(feature_cols) == 0:
|
| 76 |
+
return """
|
| 77 |
+
## ❌ Error: No numeric feature columns found!
|
| 78 |
+
|
| 79 |
+
Please re-run feature extraction in Tab 1.
|
| 80 |
+
""", None, None, ""
|
| 81 |
+
|
| 82 |
X = df[feature_cols].values
|
| 83 |
y = df['emotion'].values
|
| 84 |
+
|
| 85 |
+
# Adjust n_features_select based on available features
|
| 86 |
+
n_features_available = X.shape[1]
|
| 87 |
+
|
| 88 |
+
if not optimize_features:
|
| 89 |
+
n_features_select = n_features_available
|
| 90 |
+
print("✅ Feature Selection: DISABLED")
|
| 91 |
+
print(f" Using all {n_features_available} features")
|
| 92 |
+
else:
|
| 93 |
+
if n_features_select > n_features_available:
|
| 94 |
+
print(
|
| 95 |
+
f"⚠️ Requested {n_features_select} features, but only {n_features_available} available")
|
| 96 |
+
print(f" Auto-adjusting to {n_features_available}")
|
| 97 |
+
n_features_select = n_features_available
|
| 98 |
+
else:
|
| 99 |
+
print("✅ Feature Selection: ENABLED")
|
| 100 |
+
print(
|
| 101 |
+
f" Selecting {n_features_select}/{n_features_available} features ({n_features_select/n_features_available*100:.1f}%)")
|
| 102 |
+
|
| 103 |
+
print("✅ Dataset loaded:")
|
| 104 |
+
print(f" - Total features: {n_features_available}")
|
| 105 |
+
print(f" - Features for GA: {n_features_select}")
|
| 106 |
+
print(f" - Shape: {X.shape}")
|
| 107 |
+
print(f" - Samples: {len(y)}")
|
| 108 |
+
|
| 109 |
label_encoder = LabelEncoder()
|
| 110 |
y_encoded = label_encoder.fit_transform(y)
|
| 111 |
+
|
| 112 |
n_classes = len(label_encoder.classes_)
|
| 113 |
+
|
| 114 |
training_log = ""
|
| 115 |
+
|
|
|
|
|
|
|
|
|
|
| 116 |
if use_cv:
|
| 117 |
return _train_with_cross_validation(
|
| 118 |
X, y_encoded, label_encoder, n_classes,
|
| 119 |
use_ga, n_folds, ga_generations, ga_population, n_jobs,
|
| 120 |
+
optimize_features, n_features_select,
|
| 121 |
progress_callback
|
| 122 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
else:
|
| 124 |
return _train_single_split(
|
| 125 |
X, y_encoded, label_encoder, n_classes,
|
| 126 |
use_ga, ga_generations, ga_population, n_jobs,
|
| 127 |
+
optimize_features, n_features_select,
|
| 128 |
progress_callback
|
| 129 |
)
|
| 130 |
+
|
| 131 |
except Exception as e:
|
| 132 |
import traceback
|
| 133 |
error_trace = traceback.format_exc()
|
|
|
|
| 136 |
|
| 137 |
def _train_with_cross_validation(X, y_encoded, label_encoder, n_classes,
|
| 138 |
use_ga, n_folds, ga_generations, ga_population, n_jobs,
|
| 139 |
+
optimize_features, n_features_select,
|
| 140 |
progress_callback):
|
| 141 |
"""
|
| 142 |
Train with K-Fold Cross-Validation
|
| 143 |
"""
|
| 144 |
+
|
| 145 |
print("="*80)
|
| 146 |
print(f"{'K-FOLD CROSS-VALIDATION TRAINING':^80}")
|
| 147 |
print("="*80)
|
| 148 |
print(f"Number of folds: {n_folds}")
|
| 149 |
print(f"Use GA: {use_ga}")
|
| 150 |
+
print(f"Optimize Features: {optimize_features}")
|
| 151 |
+
print(f"Features to select: {n_features_select}")
|
| 152 |
print(f"Total samples: {len(X)}")
|
| 153 |
print("="*80)
|
| 154 |
+
|
| 155 |
+
skf = StratifiedKFold(n_splits=n_folds, shuffle=True,
|
| 156 |
+
random_state=config.RANDOM_STATE)
|
| 157 |
+
|
|
|
|
| 158 |
fold_results = []
|
| 159 |
fold_models = []
|
| 160 |
all_ga_history = []
|
| 161 |
training_log = ""
|
| 162 |
+
|
|
|
|
| 163 |
total_steps = n_folds
|
| 164 |
current_step = 0
|
| 165 |
+
|
|
|
|
| 166 |
for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y_encoded), 1):
|
| 167 |
fold_log = f"\n{'='*80}\n"
|
| 168 |
fold_log += f"FOLD {fold_idx}/{n_folds}\n"
|
| 169 |
fold_log += f"{'='*80}\n"
|
| 170 |
print(fold_log)
|
| 171 |
training_log += fold_log
|
| 172 |
+
|
| 173 |
if progress_callback:
|
| 174 |
base_progress = current_step / total_steps
|
| 175 |
+
progress_callback(
|
| 176 |
+
base_progress, desc=f"Fold {fold_idx}/{n_folds}: Preparing data...")
|
| 177 |
+
|
| 178 |
X_train, X_test = X[train_idx], X[test_idx]
|
| 179 |
y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]
|
| 180 |
+
|
| 181 |
fold_log = f"Train samples: {len(X_train)}, Test samples: {len(X_test)}\n"
|
| 182 |
print(fold_log)
|
| 183 |
training_log += fold_log
|
| 184 |
+
|
|
|
|
| 185 |
scaler = StandardScaler()
|
| 186 |
X_train_scaled = scaler.fit_transform(X_train)
|
| 187 |
X_test_scaled = scaler.transform(X_test)
|
| 188 |
+
|
|
|
|
|
|
|
|
|
|
| 189 |
if use_ga:
|
| 190 |
if progress_callback:
|
| 191 |
+
progress_callback(base_progress + 0.05/total_steps,
|
| 192 |
+
desc=f"Fold {fold_idx}/{n_folds}: Splitting for GA...")
|
| 193 |
+
|
|
|
|
| 194 |
X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
|
| 195 |
X_train_scaled, y_train,
|
| 196 |
test_size=0.2,
|
| 197 |
random_state=config.RANDOM_STATE,
|
| 198 |
stratify=y_train
|
| 199 |
)
|
| 200 |
+
|
| 201 |
if progress_callback:
|
| 202 |
progress_callback(base_progress + 0.1/total_steps,
|
| 203 |
+
desc=f"Fold {fold_idx}/{n_folds}: Running GA optimization...")
|
| 204 |
+
|
| 205 |
+
ga = GeneticAlgorithm(X_train_ga, y_train_ga,
|
| 206 |
+
n_features_to_select=n_features_select)
|
| 207 |
ga.population_size = ga_population
|
| 208 |
ga.n_generations = ga_generations
|
| 209 |
+
|
| 210 |
def ga_progress(p, desc):
|
| 211 |
if progress_callback:
|
|
|
|
| 212 |
ga_progress_in_fold = 0.1 + 0.6 * p
|
| 213 |
+
progress_callback(base_progress + ga_progress_in_fold/total_steps,
|
| 214 |
+
desc=f"Fold {fold_idx}/{n_folds}: {desc}")
|
| 215 |
+
|
| 216 |
best_config = ga.evolve(
|
| 217 |
X_train_ga, y_train_ga, X_val_ga, y_val_ga,
|
| 218 |
progress_callback=ga_progress,
|
| 219 |
n_jobs=n_jobs
|
| 220 |
)
|
| 221 |
+
|
|
|
|
| 222 |
training_log += "\n".join(ga.log_messages) + "\n"
|
| 223 |
all_ga_history.extend(ga.history)
|
| 224 |
+
|
| 225 |
if best_config is None:
|
| 226 |
fold_log = f"❌ GA optimization failed for Fold {fold_idx}\n"
|
| 227 |
print(fold_log)
|
| 228 |
training_log += fold_log
|
| 229 |
continue
|
| 230 |
+
|
|
|
|
| 231 |
selected_indices = best_config['feature_indices']
|
| 232 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 233 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 234 |
+
|
| 235 |
if progress_callback:
|
| 236 |
progress_callback(base_progress + 0.7/total_steps,
|
| 237 |
+
desc=f"Fold {fold_idx}/{n_folds}: Training models with GA config...")
|
| 238 |
+
|
|
|
|
| 239 |
models, accuracies = _train_all_models(
|
| 240 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 241 |
n_classes, best_config
|
| 242 |
)
|
| 243 |
+
|
| 244 |
weights = best_config['weights']
|
| 245 |
+
|
| 246 |
fold_log = f"\n✅ GA optimization completed for Fold {fold_idx}\n"
|
| 247 |
fold_log += f"Best fitness: {ga.best_fitness:.4f}\n"
|
| 248 |
fold_log += f"Generations: {len(ga.history)}/{ga_generations}\n"
|
| 249 |
print(fold_log)
|
| 250 |
training_log += fold_log
|
| 251 |
+
|
|
|
|
|
|
|
|
|
|
| 252 |
else:
|
| 253 |
if progress_callback:
|
| 254 |
progress_callback(base_progress + 0.2/total_steps,
|
| 255 |
+
desc=f"Fold {fold_idx}/{n_folds}: Selecting features...")
|
| 256 |
+
|
| 257 |
+
if not optimize_features:
|
| 258 |
+
selected_indices = np.arange(X_train_scaled.shape[1])
|
| 259 |
+
else:
|
| 260 |
+
feature_variance = np.var(X_train_scaled, axis=0)
|
| 261 |
+
selected_indices = np.argsort(
|
| 262 |
+
feature_variance)[-n_features_select:]
|
| 263 |
+
|
| 264 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 265 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 266 |
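When the GA is disabled but feature selection is still requested, the branch above keeps the `n_features_select` columns with the largest variance. A toy illustration of that top-k-by-variance mechanic (the array and sizes are made up):

```python
import numpy as np

rng = np.random.default_rng(0)
# Five toy columns with deliberately different spreads
X_toy = rng.normal(size=(8, 5)) * np.array([0.1, 1.0, 5.0, 0.5, 2.0])
k = 3

variances = np.var(X_toy, axis=0)
top_k = np.argsort(variances)[-k:]   # indices of the k widest-spread columns
X_reduced = X_toy[:, top_k]
print(X_reduced.shape)               # (8, 3)
```

Note that `X_train_scaled` comes out of a `StandardScaler`, so every column already has variance close to 1; ranking by variance at that point is nearly a tie-break, and ranking the unscaled features (or using another criterion such as ANOVA F-scores) would likely make this fallback more meaningful.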
+
|
| 267 |
if progress_callback:
|
| 268 |
progress_callback(base_progress + 0.3/total_steps,
|
| 269 |
+
desc=f"Fold {fold_idx}/{n_folds}: Training models...")
|
| 270 |
+
|
| 271 |
models, accuracies = _train_all_models_default(
|
| 272 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 273 |
n_classes, progress_callback, fold_idx, n_folds, base_progress, total_steps
|
| 274 |
)
|
| 275 |
+
|
|
|
|
| 276 |
acc_values = np.array(list(accuracies.values()))
|
| 277 |
weights = acc_values / acc_values.sum()
|
| 278 |
+
|
|
|
|
|
|
|
|
|
|
| 279 |
if progress_callback:
|
| 280 |
progress_callback(base_progress + 0.9/total_steps,
|
| 281 |
+
desc=f"Fold {fold_idx}/{n_folds}: Evaluating ensemble...")
|
| 282 |
+
|
| 283 |
predictions = [
|
| 284 |
models['xgboost'].predict_proba(X_test_selected),
|
| 285 |
models['lightgbm'].predict_proba(X_test_selected),
|
| 286 |
models['gradientboosting'].predict_proba(X_test_selected),
|
| 287 |
models['adaboost'].predict_proba(X_test_selected)
|
| 288 |
]
|
| 289 |
+
|
| 290 |
ensemble_pred = np.average(predictions, axis=0, weights=weights)
|
| 291 |
ensemble_labels = np.argmax(ensemble_pred, axis=1)
|
| 292 |
ensemble_acc = accuracy_score(y_test, ensemble_labels)
|
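The ensemble evaluation above is a weighted soft vote: each model's class-probability matrix is averaged with the accuracy-derived (or GA-derived) weights and the highest-probability class wins. The same arithmetic, reduced to two hypothetical models, three samples, and four classes:

```python
import numpy as np

p1 = np.array([[0.7, 0.1, 0.1, 0.1],
               [0.2, 0.5, 0.2, 0.1],
               [0.1, 0.1, 0.2, 0.6]])
p2 = np.array([[0.4, 0.3, 0.2, 0.1],
               [0.1, 0.7, 0.1, 0.1],
               [0.3, 0.2, 0.3, 0.2]])
weights = np.array([0.6, 0.4])   # e.g. normalized validation accuracies

ensemble_pred = np.average([p1, p2], axis=0, weights=weights)
ensemble_labels = np.argmax(ensemble_pred, axis=1)
print(ensemble_labels)           # [0 1 3]
```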
| 293 |
+
|
|
|
|
| 294 |
fold_result = {
|
| 295 |
'fold': fold_idx,
|
| 296 |
'xgboost': accuracies['xgboost'],
|
|
|
|
| 302 |
'n_test': len(X_test)
|
| 303 |
}
|
| 304 |
fold_results.append(fold_result)
|
| 305 |
+
|
| 306 |
fold_models.append({
|
| 307 |
'models': models,
|
| 308 |
'scaler': scaler,
|
| 309 |
'selected_indices': selected_indices,
|
| 310 |
'weights': weights
|
| 311 |
})
|
| 312 |
+
|
|
|
|
| 313 |
fold_log = f"\n📊 Fold {fold_idx} Results:\n"
|
| 314 |
fold_log += f" XGBoost: {accuracies['xgboost']:.4f}\n"
|
| 315 |
fold_log += f" LightGBM: {accuracies['lightgbm']:.4f}\n"
|
|
|
|
| 318 |
fold_log += f" Ensemble: {ensemble_acc:.4f} ⭐\n"
|
| 319 |
print(fold_log)
|
| 320 |
training_log += fold_log
|
| 321 |
+
|
| 322 |
current_step += 1
|
| 323 |
+
|
|
|
|
|
|
|
|
|
|
| 324 |
if len(fold_results) == 0:
|
| 325 |
return "❌ All folds failed", None, None, training_log
|
| 326 |
+
|
| 327 |
results_df = pd.DataFrame(fold_results)
|
| 328 |
+
|
|
|
|
| 329 |
stats_log = f"\n{'='*80}\n"
|
| 330 |
stats_log += f"{'CROSS-VALIDATION SUMMARY':^80}\n"
|
| 331 |
stats_log += f"{'='*80}\n\n"
|
| 332 |
+
|
| 333 |
stats_log += "Per-Fold Results:\n"
|
| 334 |
stats_log += results_df.to_string(index=False) + "\n\n"
|
| 335 |
+
|
| 336 |
stats_log += "="*80 + "\n"
|
| 337 |
stats_log += "SUMMARY STATISTICS\n"
|
| 338 |
stats_log += "="*80 + "\n"
|
| 339 |
+
|
| 340 |
stats_summary = []
|
| 341 |
+
|
| 342 |
for model_name in ['xgboost', 'lightgbm', 'gradientboosting', 'adaboost', 'ensemble']:
|
| 343 |
scores = results_df[model_name].values
|
| 344 |
mean_score = scores.mean()
|
| 345 |
std_score = scores.std()
|
| 346 |
+
|
| 347 |
model_stats = f"\n{model_name.upper()}:\n"
|
| 348 |
model_stats += f" Mean Accuracy: {mean_score:.4f}\n"
|
| 349 |
model_stats += f" Std Deviation: {std_score:.4f}\n"
|
| 350 |
model_stats += f" 95% CI: [{mean_score - 1.96*std_score:.4f}, {mean_score + 1.96*std_score:.4f}]\n"
|
| 351 |
model_stats += f" Min: {scores.min():.4f}\n"
|
| 352 |
model_stats += f" Max: {scores.max():.4f}\n"
|
| 353 |
+
|
| 354 |
stats_log += model_stats
|
| 355 |
+
|
| 356 |
stats_summary.append({
|
| 357 |
'Model': model_name.upper(),
|
| 358 |
'Mean': mean_score,
|
|
|
|
| 360 |
'Min': scores.min(),
|
| 361 |
'Max': scores.max()
|
| 362 |
})
|
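The 95% confidence interval reported above is the normal-approximation interval, mean ± 1.96·σ, over the per-fold scores; with only a handful of folds a t-distribution interval would be noticeably wider. A quick check with hypothetical fold accuracies:

```python
import numpy as np

fold_scores = np.array([0.81, 0.84, 0.79, 0.86, 0.82])  # hypothetical ensemble accuracies
mean, std = fold_scores.mean(), fold_scores.std()
print(f"{mean:.4f} ± {1.96 * std:.4f}")                  # 0.8240 ± 0.0474
```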
| 363 |
+
|
| 364 |
print(stats_log)
|
| 365 |
training_log += stats_log
|
| 366 |
+
|
|
|
|
|
|
|
|
|
|
| 367 |
best_fold_idx = results_df['ensemble'].idxmax()
|
| 368 |
best_fold = fold_results[best_fold_idx]
|
| 369 |
best_models = fold_models[best_fold_idx]
|
| 370 |
+
|
| 371 |
save_log = f"\n{'='*80}\n"
|
| 372 |
save_log += f"Best performing fold: Fold {best_fold['fold']} (Ensemble: {best_fold['ensemble']:.4f})\n"
|
| 373 |
save_log += "Saving this model...\n"
|
| 374 |
save_log += "="*80 + "\n"
|
| 375 |
print(save_log)
|
| 376 |
training_log += save_log
|
| 377 |
+
|
| 378 |
if progress_callback:
|
| 379 |
progress_callback(0.95, desc="Saving best model...")
|
| 380 |
+
|
| 381 |
_save_models(
|
| 382 |
best_models['models'],
|
| 383 |
best_models['scaler'],
|
|
|
|
| 393 |
best_fold['ensemble'],
|
| 394 |
cv_results=results_df.to_dict('records')
|
| 395 |
)
|
| 396 |
+
|
| 397 |
if progress_callback:
|
| 398 |
progress_callback(1.0, desc="Complete!")
|
| 399 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
ensemble_mean = results_df['ensemble'].mean()
|
| 401 |
ensemble_std = results_df['ensemble'].std()
|
| 402 |
consistency = (1 - ensemble_std / ensemble_mean) * 100
|
| 403 |
+
|
| 404 |
summary = f"""
|
| 405 |
## ✅ Cross-Validation Training Complete!
|
| 406 |
|
|
|
|
| 434 |
|
| 435 |
📝 **Note**: This is a more reliable estimate than a single train/test split!
|
| 436 |
"""
|
| 437 |
+
|
|
|
|
| 438 |
ga_history_df = None
|
| 439 |
if use_ga and len(all_ga_history) > 0:
|
| 440 |
ga_history_df = pd.DataFrame(all_ga_history)
|
| 441 |
+
|
|
|
|
| 442 |
summary_stats_df = pd.DataFrame(stats_summary)
|
| 443 |
+
|
| 444 |
return summary, summary_stats_df, ga_history_df, training_log
|
| 445 |
|
| 446 |
|
| 447 |
def _train_single_split(X, y_encoded, label_encoder, n_classes,
|
| 448 |
+
use_ga, ga_generations, ga_population, n_jobs,
|
| 449 |
+
optimize_features, n_features_select,
|
| 450 |
+
progress_callback):
|
| 451 |
"""
|
| 452 |
Train with a single train/test split (original method)
|
| 453 |
"""
|
| 454 |
+
|
|
|
|
| 455 |
X_train, X_test, y_train, y_test = train_test_split(
|
| 456 |
X, y_encoded,
|
| 457 |
test_size=config.TRAIN_TEST_SPLIT,
|
| 458 |
random_state=config.RANDOM_STATE,
|
| 459 |
stratify=y_encoded
|
| 460 |
)
|
| 461 |
+
|
| 462 |
if progress_callback:
|
| 463 |
progress_callback(0.1, desc="Scaling features...")
|
| 464 |
+
|
| 465 |
scaler = StandardScaler()
|
| 466 |
X_train_scaled = scaler.fit_transform(X_train)
|
| 467 |
X_test_scaled = scaler.transform(X_test)
|
| 468 |
+
|
| 469 |
training_log = ""
|
| 470 |
+
|
| 471 |
if use_ga:
|
|
|
|
| 472 |
if progress_callback:
|
| 473 |
progress_callback(0.2, desc="Initializing GA...")
|
| 474 |
+
|
| 475 |
X_train_ga, X_val_ga, y_train_ga, y_val_ga = train_test_split(
|
| 476 |
X_train_scaled, y_train,
|
| 477 |
test_size=0.2,
|
| 478 |
random_state=config.RANDOM_STATE,
|
| 479 |
stratify=y_train
|
| 480 |
)
|
| 481 |
+
|
| 482 |
+
ga = GeneticAlgorithm(X_train_ga, y_train_ga,
|
| 483 |
+
n_features_to_select=n_features_select)
|
| 484 |
ga.population_size = ga_population
|
| 485 |
ga.n_generations = ga_generations
|
| 486 |
+
|
| 487 |
def ga_progress(p, desc):
|
| 488 |
if progress_callback:
|
| 489 |
progress_callback(0.2 + 0.6*p, desc=desc)
|
| 490 |
+
|
| 491 |
best_config = ga.evolve(
|
| 492 |
X_train_ga, y_train_ga, X_val_ga, y_val_ga,
|
| 493 |
progress_callback=ga_progress,
|
| 494 |
n_jobs=n_jobs
|
| 495 |
)
|
| 496 |
+
|
| 497 |
training_log = "\n".join(ga.log_messages)
|
| 498 |
+
|
| 499 |
if best_config is None:
|
| 500 |
error_msg = """
|
| 501 |
## ❌ GA Optimization Failed
|
|
|
|
| 514 |
**Training Log:**
|
| 515 |
"""
|
| 516 |
return error_msg + training_log, None, None, training_log
|
| 517 |
+
|
| 518 |
if progress_callback:
|
| 519 |
+
progress_callback(
|
| 520 |
+
0.8, desc="Training final models with GA config...")
|
| 521 |
+
|
| 522 |
selected_indices = best_config['feature_indices']
|
| 523 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 524 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 525 |
+
|
|
|
|
| 526 |
models, accuracies = _train_all_models(
|
| 527 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 528 |
n_classes, best_config
|
| 529 |
)
|
| 530 |
+
|
| 531 |
weights = best_config['weights']
|
| 532 |
+
|
| 533 |
ga_summary = f"""
|
| 534 |
### 🧬 GA Optimization Results:
|
| 535 |
- **Generations Completed**: {len(ga.history)}/{ga_generations}
|
| 536 |
- **Population Size**: {ga_population}
|
| 537 |
- **Best Fitness**: {ga.best_fitness:.4f}
|
| 538 |
- **Parallel Jobs**: {n_jobs}
|
| 539 |
+
- **Feature Selection**: {'Enabled' if optimize_features else 'Disabled'}
|
| 540 |
+
- **Features Used**: {len(selected_indices)}
|
| 541 |
|
| 542 |
### 🎯 Best Configuration:
|
| 543 |
- **XGBoost**: n_est={best_config['xgb_n_estimators']}, depth={best_config['xgb_max_depth']}, lr={best_config['xgb_learning_rate']}
|
|
|
|
| 545 |
- **Gradient Boosting**: n_est={best_config['gb_n_estimators']}, depth={best_config['gb_max_depth']}, lr={best_config['gb_learning_rate']}
|
| 546 |
- **AdaBoost**: n_est={best_config['ada_n_estimators']}, lr={best_config['ada_learning_rate']}
|
| 547 |
"""
|
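For reference, the summary above reads everything from the `best_config` dictionary returned by `ga.evolve()`. Judging from the keys this module accesses, its shape is roughly as follows; the values shown are illustrative, and further optional keys such as `'xgb_subsample'` are read with `.get()` defaults:

```python
import numpy as np

best_config = {
    'feature_indices': np.arange(100),              # GA-selected feature columns
    'weights': np.array([0.27, 0.26, 0.24, 0.23]),  # ensemble weights, one per model
    'xgb_n_estimators': 150, 'xgb_max_depth': 5, 'xgb_learning_rate': 0.1,
    'lgbm_n_estimators': 150, 'lgbm_num_leaves': 40, 'lgbm_learning_rate': 0.1,
    'gb_n_estimators': 100, 'gb_max_depth': 4, 'gb_learning_rate': 0.1,
    'ada_n_estimators': 100, 'ada_learning_rate': 1.0, 'ada_algorithm': 'SAMME',
}
```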
| 548 |
+
|
| 549 |
ga_history_df = pd.DataFrame(ga.history)
|
| 550 |
+
|
| 551 |
else:
|
|
|
|
| 552 |
if progress_callback:
|
| 553 |
+
progress_callback(0.3, desc="Selecting features...")
|
| 554 |
+
|
| 555 |
+
if not optimize_features:
|
| 556 |
+
selected_indices = np.arange(X_train_scaled.shape[1])
|
| 557 |
+
else:
|
| 558 |
+
feature_variance = np.var(X_train_scaled, axis=0)
|
| 559 |
+
selected_indices = np.argsort(
|
| 560 |
+
feature_variance)[-n_features_select:]
|
| 561 |
+
|
| 562 |
X_train_selected = X_train_scaled[:, selected_indices]
|
| 563 |
X_test_selected = X_test_scaled[:, selected_indices]
|
| 564 |
+
|
| 565 |
models, accuracies = _train_all_models_default(
|
| 566 |
X_train_selected, y_train, X_test_selected, y_test,
|
| 567 |
n_classes, progress_callback
|
| 568 |
)
|
| 569 |
+
|
|
|
|
| 570 |
acc_values = list(accuracies.values())
|
| 571 |
weights = np.array(acc_values) / sum(acc_values)
|
| 572 |
+
|
| 573 |
+
ga_summary = f"\n### ⚡ Simple Training (No GA)\n- **Feature Selection**: {'Enabled' if optimize_features else 'Disabled'}\n- **Features Used**: {len(selected_indices)}\n"
|
| 574 |
ga_history_df = None
|
| 575 |
training_log = "Simple training mode - no GA logs"
|
| 576 |
+
|
| 577 |
if progress_callback:
|
| 578 |
progress_callback(0.9, desc="Creating ensemble...")
|
| 579 |
+
|
|
|
|
| 580 |
predictions = [
|
| 581 |
models['xgboost'].predict_proba(X_test_selected),
|
| 582 |
models['lightgbm'].predict_proba(X_test_selected),
|
| 583 |
models['gradientboosting'].predict_proba(X_test_selected),
|
| 584 |
models['adaboost'].predict_proba(X_test_selected)
|
| 585 |
]
|
| 586 |
+
|
| 587 |
ensemble_pred = np.average(predictions, axis=0, weights=weights)
|
| 588 |
ensemble_labels = np.argmax(ensemble_pred, axis=1)
|
| 589 |
ensemble_acc = accuracy_score(y_test, ensemble_labels)
|
| 590 |
+
|
| 591 |
if progress_callback:
|
| 592 |
progress_callback(0.95, desc="Saving models...")
|
| 593 |
+
|
|
|
|
| 594 |
_save_models(models, scaler, label_encoder, selected_indices, weights,
|
| 595 |
+
accuracies, ensemble_acc)
|
| 596 |
+
|
| 597 |
if progress_callback:
|
| 598 |
progress_callback(1.0, desc="Complete!")
|
| 599 |
+
|
|
|
|
| 600 |
results_df = pd.DataFrame({
|
| 601 |
'Model': ['XGBoost', 'LightGBM', 'Gradient Boosting', 'AdaBoost', 'Ensemble'],
|
| 602 |
'Test Accuracy': [
|
|
|
|
| 607 |
ensemble_acc
|
| 608 |
]
|
| 609 |
})
|
| 610 |
+
|
| 611 |
summary = f"""
|
| 612 |
## ✅ Training Complete!
|
| 613 |
|
|
|
|
| 638 |
|
| 639 |
⚠️ **Note**: Results come from a single train/test split. For more reliable estimates, use Cross-Validation!
|
| 640 |
"""
|
| 641 |
+
|
| 642 |
return summary, results_df, ga_history_df, training_log
|
| 643 |
|
| 644 |
|
|
|
|
| 646 |
"""Train all models with given configuration"""
|
| 647 |
models = {}
|
| 648 |
accuracies = {}
|
| 649 |
+
|
|
|
|
| 650 |
xgb = XGBClassifier(
|
| 651 |
n_estimators=config_dict['xgb_n_estimators'],
|
| 652 |
max_depth=config_dict['xgb_max_depth'],
|
| 653 |
learning_rate=config_dict['xgb_learning_rate'],
|
| 654 |
+
subsample=config_dict.get('xgb_subsample', 0.8),
|
| 655 |
+
colsample_bytree=config_dict.get('xgb_colsample_bytree', 0.8),
|
| 656 |
+
min_child_weight=config_dict.get('xgb_min_child_weight', 1),
|
| 657 |
+
gamma=config_dict.get('xgb_gamma', 0),
|
| 658 |
objective='multi:softprob',
|
| 659 |
num_class=n_classes,
|
| 660 |
random_state=config.RANDOM_STATE,
|
|
|
|
| 664 |
xgb.fit(X_train, y_train)
|
| 665 |
models['xgboost'] = xgb
|
| 666 |
accuracies['xgboost'] = xgb.score(X_test, y_test)
|
| 667 |
+
|
|
|
|
| 668 |
lgbm = LGBMClassifier(
|
| 669 |
n_estimators=config_dict['lgbm_n_estimators'],
|
| 670 |
num_leaves=config_dict['lgbm_num_leaves'],
|
| 671 |
learning_rate=config_dict['lgbm_learning_rate'],
|
| 672 |
+
min_child_samples=config_dict.get('lgbm_min_child_samples', 20),
|
| 673 |
+
subsample=config_dict.get('lgbm_subsample', 0.8),
|
| 674 |
+
colsample_bytree=config_dict.get('lgbm_colsample_bytree', 0.8),
|
| 675 |
+
reg_alpha=config_dict.get('lgbm_reg_alpha', 0),
|
| 676 |
+
reg_lambda=config_dict.get('lgbm_reg_lambda', 0),
|
| 677 |
objective='multiclass',
|
| 678 |
num_class=n_classes,
|
| 679 |
random_state=config.RANDOM_STATE,
|
| 680 |
n_jobs=-1,
|
| 681 |
+
verbose=-1,
|
| 682 |
+
force_col_wise=True
|
| 683 |
)
|
| 684 |
lgbm.fit(X_train, y_train)
|
| 685 |
models['lightgbm'] = lgbm
|
| 686 |
accuracies['lightgbm'] = lgbm.score(X_test, y_test)
|
| 687 |
+
|
|
|
|
| 688 |
gb = GradientBoostingClassifier(
|
| 689 |
n_estimators=config_dict['gb_n_estimators'],
|
| 690 |
max_depth=config_dict['gb_max_depth'],
|
| 691 |
learning_rate=config_dict['gb_learning_rate'],
|
| 692 |
+
subsample=config_dict.get('gb_subsample', 0.8),
|
| 693 |
+
min_samples_split=config_dict.get('gb_min_samples_split', 2),
|
| 694 |
+
min_samples_leaf=config_dict.get('gb_min_samples_leaf', 1),
|
| 695 |
random_state=config.RANDOM_STATE
|
| 696 |
)
|
| 697 |
gb.fit(X_train, y_train)
|
| 698 |
models['gradientboosting'] = gb
|
| 699 |
accuracies['gradientboosting'] = gb.score(X_test, y_test)
|
| 700 |
+
|
|
|
|
| 701 |
ada = AdaBoostClassifier(
|
| 702 |
n_estimators=config_dict['ada_n_estimators'],
|
| 703 |
learning_rate=config_dict['ada_learning_rate'],
|
| 704 |
+
algorithm=config.ADABOOST_ALGORITHM,
|
| 705 |
random_state=config.RANDOM_STATE
|
| 706 |
)
|
| 707 |
ada.fit(X_train, y_train)
|
| 708 |
models['adaboost'] = ada
|
| 709 |
accuracies['adaboost'] = ada.score(X_test, y_test)
|
| 710 |
+
|
| 711 |
return models, accuracies
|
| 712 |
|
| 713 |
|
| 714 |
+
def _train_all_models_default(X_train, y_train, X_test, y_test, n_classes,
|
| 715 |
+
progress_callback=None, fold_idx=None, n_folds=None,
|
| 716 |
base_progress=0, total_steps=1):
|
| 717 |
"""Train all models with default hyperparameters"""
|
| 718 |
models = {}
|
| 719 |
accuracies = {}
|
| 720 |
+
|
| 721 |
if progress_callback and fold_idx:
|
| 722 |
+
progress_callback(base_progress + 0.4/total_steps,
|
| 723 |
+
desc=f"Fold {fold_idx}/{n_folds}: Training XGBoost...")
|
| 724 |
elif progress_callback:
|
| 725 |
progress_callback(0.4, desc="Training XGBoost...")
|
| 726 |
+
|
| 727 |
xgb = XGBClassifier(
|
| 728 |
n_estimators=150, max_depth=5, learning_rate=0.1,
|
| 729 |
objective='multi:softprob', num_class=n_classes,
|
|
|
|
| 732 |
xgb.fit(X_train, y_train)
|
| 733 |
models['xgboost'] = xgb
|
| 734 |
accuracies['xgboost'] = xgb.score(X_test, y_test)
|
| 735 |
+
|
| 736 |
if progress_callback and fold_idx:
|
| 737 |
progress_callback(base_progress + 0.5/total_steps,
|
| 738 |
+
desc=f"Fold {fold_idx}/{n_folds}: Training LightGBM...")
|
| 739 |
elif progress_callback:
|
| 740 |
progress_callback(0.5, desc="Training LightGBM...")
|
| 741 |
+
|
| 742 |
lgbm = LGBMClassifier(
|
| 743 |
n_estimators=150, num_leaves=40, learning_rate=0.1,
|
| 744 |
objective='multiclass', num_class=n_classes,
|
| 745 |
+
random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1,
|
| 746 |
+
force_col_wise=True
|
| 747 |
)
|
| 748 |
lgbm.fit(X_train, y_train)
|
| 749 |
models['lightgbm'] = lgbm
|
| 750 |
accuracies['lightgbm'] = lgbm.score(X_test, y_test)
|
| 751 |
+
|
| 752 |
if progress_callback and fold_idx:
|
| 753 |
progress_callback(base_progress + 0.65/total_steps,
|
| 754 |
+
desc=f"Fold {fold_idx}/{n_folds}: Training Gradient Boosting...")
|
| 755 |
elif progress_callback:
|
| 756 |
progress_callback(0.65, desc="Training Gradient Boosting...")
|
| 757 |
+
|
| 758 |
gb = GradientBoostingClassifier(
|
| 759 |
n_estimators=100, max_depth=4, learning_rate=0.1,
|
| 760 |
random_state=config.RANDOM_STATE
|
|
|
|
| 762 |
gb.fit(X_train, y_train)
|
| 763 |
models['gradientboosting'] = gb
|
| 764 |
accuracies['gradientboosting'] = gb.score(X_test, y_test)
|
| 765 |
+
|
| 766 |
if progress_callback and fold_idx:
|
| 767 |
progress_callback(base_progress + 0.8/total_steps,
|
| 768 |
+
desc=f"Fold {fold_idx}/{n_folds}: Training AdaBoost...")
|
| 769 |
elif progress_callback:
|
| 770 |
progress_callback(0.8, desc="Training AdaBoost...")
|
| 771 |
+
|
| 772 |
ada = AdaBoostClassifier(
|
| 773 |
+
n_estimators=100,
|
| 774 |
+
learning_rate=1.0,
|
| 775 |
+
algorithm=config.ADABOOST_ALGORITHM,
|
| 776 |
random_state=config.RANDOM_STATE
|
| 777 |
)
|
| 778 |
ada.fit(X_train, y_train)
|
| 779 |
models['adaboost'] = ada
|
| 780 |
accuracies['adaboost'] = ada.score(X_test, y_test)
|
| 781 |
+
|
| 782 |
return models, accuracies
|
| 783 |
|
| 784 |
|
| 785 |
+
def _save_models(models, scaler, label_encoder, selected_indices, weights,
|
| 786 |
+
accuracies, ensemble_acc, cv_results=None):
|
| 787 |
"""Save all models and configuration"""
|
| 788 |
config.WEIGHTS_DIR.mkdir(exist_ok=True)
|
| 789 |
+
|
|
|
|
| 790 |
with open(config.WEIGHTS_DIR / 'xgboost_model.pkl', 'wb') as f:
|
| 791 |
pickle.dump(models['xgboost'], f)
|
| 792 |
+
|
| 793 |
with open(config.WEIGHTS_DIR / 'lightgbm_model.pkl', 'wb') as f:
|
| 794 |
pickle.dump(models['lightgbm'], f)
|
| 795 |
+
|
| 796 |
with open(config.WEIGHTS_DIR / 'gradientboost_model.pkl', 'wb') as f:
|
| 797 |
pickle.dump(models['gradientboosting'], f)
|
| 798 |
+
|
| 799 |
with open(config.WEIGHTS_DIR / 'adaboost_model.pkl', 'wb') as f:
|
| 800 |
pickle.dump(models['adaboost'], f)
|
| 801 |
+
|
|
|
|
| 802 |
with open(config.WEIGHTS_DIR / 'scaler.pkl', 'wb') as f:
|
| 803 |
pickle.dump(scaler, f)
|
| 804 |
+
|
| 805 |
with open(config.WEIGHTS_DIR / 'label_encoder.pkl', 'wb') as f:
|
| 806 |
pickle.dump(label_encoder, f)
|
| 807 |
+
|
|
|
|
| 808 |
model_config = {
|
| 809 |
'selected_features': selected_indices.tolist(),
|
| 810 |
'ensemble_weights': weights.tolist(),
|
|
|
|
| 818 |
'ensemble': float(ensemble_acc)
|
| 819 |
}
|
| 820 |
}
|
| 821 |
+
|
|
|
|
| 822 |
if cv_results is not None:
|
| 823 |
model_config['cv_results'] = cv_results
|
| 824 |
+
model_config['training_mode'] = 'cross_validation'
|
| 825 |
else:
|
| 826 |
+
model_config['training_mode'] = 'single_split'
|
| 827 |
+
|
| 828 |
with open(config.WEIGHTS_DIR / 'config.json', 'w') as f:
|
| 829 |
+
json.dump(model_config, f, indent=2)
|
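`_save_models` above writes four pickled estimators plus the scaler, label encoder, and a `config.json` into `weights/`. A prediction-time loader would mirror those file names roughly like this (a sketch; `load_saved_ensemble` is a hypothetical helper, not a function defined in this commit):

```python
import json
import pickle

import config

def load_saved_ensemble():
    d = config.WEIGHTS_DIR
    models = {}
    for name, fname in [('xgboost', 'xgboost_model.pkl'),
                        ('lightgbm', 'lightgbm_model.pkl'),
                        ('gradientboosting', 'gradientboost_model.pkl'),
                        ('adaboost', 'adaboost_model.pkl')]:
        with open(d / fname, 'rb') as f:
            models[name] = pickle.load(f)
    with open(d / 'scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)
    with open(d / 'label_encoder.pkl', 'rb') as f:
        label_encoder = pickle.load(f)
    with open(d / 'config.json') as f:
        model_config = json.load(f)  # selected_features, ensemble_weights, training_mode, ...
    return models, scaler, label_encoder, model_config
```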
src/ui/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (492 Bytes). View file
|
|
|
src/ui/__pycache__/tab1_extraction.cpython-311.pyc
ADDED
|
Binary file (19.9 kB). View file
|
|
|
src/ui/__pycache__/tab2_training.cpython-311.pyc
ADDED
|
Binary file (13.4 kB). View file
|
|
|
src/ui/__pycache__/tab3_prediction.cpython-311.pyc
ADDED
|
Binary file (6.56 kB). View file
|
|
|
src/ui/tab1_extraction.py
CHANGED
@@ -1,19 +1,66 @@
"""
-Tab 1: Feature Extraction UI
"""

import gradio as gr
import pandas as pd
from pathlib import Path

from src.data_loader import scan_dataset_directory, extract_emotion_from_filename, extract_actor_from_filename, get_dataset_statistics
-from src.feature_extraction import extract_features
from src.utils import create_waveform_plot, create_spectrogram_plot
import config


-def
-    """

    try:
        progress(0, desc="Scanning dataset directory...")
@@ -21,50 +68,34 @@ def extract_dataset_features(progress=gr.Progress()):
        audio_files, error = scan_dataset_directory()

        if error:
-            return f""
-## ❌ Error: {error}
-
-**Expected structure:**
-data/
-└── RAVDESS/
-    └── audio_speech_actors_01-24/
-        ├── Actor_01/
-        │   ├── 03-01-01-01-01-01-01.wav
-        │   └── ...
-        ├── Actor_02/
-        └── ...
-
-**Please ensure dataset is in correct location.**
-""", None, None

        if len(audio_files) == 0:
            return "❌ No audio files found", None, None

        progress(
-            0.05, desc=f"Found {len(audio_files)} files. Extracting features...")

        data_list = []
        failed_files = []
        total_files = len(audio_files)

        for idx, audio_file in enumerate(audio_files):
-            progress(
-
-                desc=f"Processing {idx + 1}/{total_files}: {audio_file.name}"
-            )

            try:
-                features, _, _ = extract_features(
                filename = audio_file.name
                emotion = extract_emotion_from_filename(filename)
                actor = extract_actor_from_filename(filename)

-                row = {
-                    '
-                    'filename': filename,
-                    'actor': actor,
-                    'emotion': emotion
-                }

                for i, feat in enumerate(features):
                    row[f'feature_{i}'] = feat
@@ -81,29 +112,65 @@ data/
        progress(0.95, desc="Saving to CSV...")

        df = pd.DataFrame(data_list)
        df.to_csv(config.FEATURES_CSV, index=False)

        progress(1.0, desc="Complete!")

        stats = get_dataset_statistics(audio_files)

-
-

### 📊 Statistics:
- **Total Files**: {stats['total_files']}
- **Successfully Processed**: {len(df)}
- **Failed**: {len(failed_files)}
-- **Features per File**: {
-- **Output**: `{config.FEATURES_CSV}`

### 🎭 Emotion Distribution:
{df['emotion'].value_counts().to_string()}

### 👥 Actors: {stats['n_actors']}

-✅ **Ready for training! Go to Tab 2.**
-"""

        if failed_files:
            summary += f"\n\n### ⚠️ Failed Files ({len(failed_files)}):\n"
@@ -124,56 +191,66 @@ def check_dataset_status():
    audio_files, error = scan_dataset_directory()

    if error:
-        return f""
-## ⚠️ Dataset Not Found
-
-{error}
-
-**Please upload RAVDESS dataset to the correct location.**
-"""

    stats = get_dataset_statistics(audio_files)

-
-

-
-
-

-### 🎭 Emotions
-"""

    for emotion, count in sorted(stats['emotion_counts'].items()):
        status += f"- **{emotion.capitalize()}**: {count} files\n"

-    status += f""

-
-
-
-""

    return status


-def preview_single_audio(audio_file):
-    """Preview single audio file"""
    if audio_file is None:
        return "Please upload an audio file", None, None

    try:
-        features, y, sr = extract_features(

-
-

-- **File**: {Path(audio_file).name}
-- **Features**: {config.N_FEATURES}
-- **Sample Rate**: {sr} Hz
-- **Duration**: {len(y)/sr:.2f}s
-- **Emotion**: {extract_emotion_from_filename(Path(audio_file).name)}
-"""

        waveform = create_waveform_plot(y, sr)
        spectrogram = create_spectrogram_plot(y, sr)
@@ -186,41 +263,58 @@ def preview_single_audio(audio_file):


def create_tab1():
-    """Create Tab 1: Feature Extraction"""

    with gr.Tab("1️⃣ Feature Extraction"):
-        gr.Markdown(
-
-
-        Automatically processes all audio files in `data/RAVDESS/audio_speech_actors_01-24/`
-        """)

        with gr.Row():
            with gr.Column(scale=1):
-

-                gr.

-
-
-
-
-
-
-
-
-
-
-                Test feature extraction on one file.
-                """)

                preview_audio = gr.Audio(
-                    sources=["upload"],
-                    type="filepath",
-                    label="Upload Single File"
-                )
                preview_btn = gr.Button("Preview Features")

            with gr.Column(scale=2):
                output_text = gr.Markdown()
                preview_df = gr.Dataframe(label="Dataset Preview")
@@ -230,19 +324,8 @@ def create_tab1():
                waveform_plot = gr.Plot(label="Waveform")
                spectrogram_plot = gr.Plot(label="Spectrogram")

-
-
-
-
-
-
-        extract_btn.click(
-            fn=extract_dataset_features,
-            outputs=[output_text, preview_df, emotion_dist]
-        )
-
-        preview_btn.click(
-            fn=preview_single_audio,
-            inputs=[preview_audio],
-            outputs=[output_text, waveform_plot, spectrogram_plot]
-        )
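The directory tree in the removed error message shows the RAVDESS naming scheme (`03-01-01-01-01-01-01.wav`) that `extract_emotion_from_filename` and `extract_actor_from_filename` rely on. `src/data_loader.py` itself is not part of this diff; the sketch below only illustrates the standard RAVDESS convention (third field = emotion code, last field = actor number) and is not the project's implementation:

```python
# Sketch of RAVDESS filename parsing; assumes the standard naming scheme
# modality-channel-emotion-intensity-statement-repetition-actor.wav.
RAVDESS_EMOTIONS = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised',
}

def parse_ravdess_filename(filename: str):
    """'03-01-05-01-02-02-12.wav' -> ('angry', 12)."""
    parts = filename.replace('.wav', '').split('-')
    emotion = RAVDESS_EMOTIONS.get(parts[2], 'unknown')  # third field: emotion code
    actor = int(parts[6])                                # last field: actor number
    return emotion, actor
```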
"""
+Tab 1: Feature Extraction UI with Feature Type Selection and MFCC Count
"""

import gradio as gr
import pandas as pd
from pathlib import Path
+import json

from src.data_loader import scan_dataset_directory, extract_emotion_from_filename, extract_actor_from_filename, get_dataset_statistics
+from src.feature_extraction import extract_features, get_feature_count
from src.utils import create_waveform_plot, create_spectrogram_plot
import config


+def calculate_feature_count(zcr, chroma, mfcc, rms, mel, n_mfcc):
+    """Calculate total feature count based on selections"""
+    feature_types = []
+    if zcr:
+        feature_types.append('zcr')
+    if chroma:
+        feature_types.append('chroma')
+    if mfcc:
+        feature_types.append('mfcc')
+    if rms:
+        feature_types.append('rms')
+    if mel:
+        feature_types.append('mel')
+
+    total = get_feature_count(feature_types, n_mfcc=n_mfcc)
+
+    breakdown = []
+    if zcr:
+        breakdown.append("ZCR: 1")
+    if chroma:
+        breakdown.append("Chroma: 12")
+    if mfcc:
+        breakdown.append(f"MFCC: {n_mfcc}")
+    if rms:
+        breakdown.append("RMS: 1")
+    if mel:
+        breakdown.append("Mel: 128")
+
+    return f"**Total Features: {total}**\n\n*Breakdown: {' + '.join(breakdown)}*"
+
+
+def extract_dataset_features(zcr, chroma, mfcc, rms, mel, n_mfcc, progress=gr.Progress()):
+    """Extract features from dataset with selected feature types"""
+
+    feature_types = []
+    if zcr:
+        feature_types.append('zcr')
+    if chroma:
+        feature_types.append('chroma')
+    if mfcc:
+        feature_types.append('mfcc')
+    if rms:
+        feature_types.append('rms')
+    if mel:
+        feature_types.append('mel')
+
+    if len(feature_types) == 0:
+        return "❌ Please select at least one feature type!", None, None

    try:
        progress(0, desc="Scanning dataset directory...")
@@ -21,50 +68,34 @@ def extract_dataset_features(progress=gr.Progress()):
        audio_files, error = scan_dataset_directory()

        if error:
+            return f"## ❌ Error: {error}\n\n**Please ensure dataset is in correct location.**", None, None

        if len(audio_files) == 0:
            return "❌ No audio files found", None, None

+        total_features = get_feature_count(feature_types, n_mfcc=n_mfcc)
+
        progress(
+            0.05, desc=f"Found {len(audio_files)} files. Extracting {total_features} features...")

        data_list = []
        failed_files = []
        total_files = len(audio_files)

        for idx, audio_file in enumerate(audio_files):
+            progress(0.05 + (idx + 1) / total_files * 0.90,
+                     desc=f"Processing {idx + 1}/{total_files}: {audio_file.name}")

            try:
+                features, _, _, feature_info = extract_features(
+                    str(audio_file), feature_types=feature_types, n_mfcc=n_mfcc)
+
                filename = audio_file.name
                emotion = extract_emotion_from_filename(filename)
                actor = extract_actor_from_filename(filename)

+                row = {'file_path': str(
+                    audio_file), 'filename': filename, 'actor': actor, 'emotion': emotion}

                for i, feat in enumerate(features):
                    row[f'feature_{i}'] = feat
@@ -81,29 +112,65 @@ data/
        progress(0.95, desc="Saving to CSV...")

        df = pd.DataFrame(data_list)
+
+        extraction_config = {
+            'feature_types': feature_types,
+            'n_mfcc': n_mfcc if 'mfcc' in feature_types else 0,
+            'total_features': total_features,
+            'feature_breakdown': {
+                'zcr': 1 if 'zcr' in feature_types else 0,
+                'chroma': 12 if 'chroma' in feature_types else 0,
+                'mfcc': n_mfcc if 'mfcc' in feature_types else 0,
+                'rms': 1 if 'rms' in feature_types else 0,
+                'mel': 128 if 'mel' in feature_types else 0
+            },
+            'n_samples': len(df),
+            'extraction_date': pd.Timestamp.now().isoformat()
+        }
+
        df.to_csv(config.FEATURES_CSV, index=False)

+        config_file = Path(config.FEATURES_CSV).with_suffix('.json')
+        with open(config_file, 'w') as f:
+            json.dump(extraction_config, f, indent=2)
+
        progress(1.0, desc="Complete!")

        stats = get_dataset_statistics(audio_files)

+        feature_summary_lines = []
+        if 'zcr' in feature_types:
+            feature_summary_lines.append("- **ZCR**: 1 feature")
+        if 'chroma' in feature_types:
+            feature_summary_lines.append("- **CHROMA**: 12 features")
+        if 'mfcc' in feature_types:
+            feature_summary_lines.append(f"- **MFCC**: {n_mfcc} features")
+        if 'rms' in feature_types:
+            feature_summary_lines.append("- **RMS**: 1 feature")
+        if 'mel' in feature_types:
+            feature_summary_lines.append("- **MEL**: 128 features")
+
+        feature_summary = "\n".join(feature_summary_lines)
+
+        summary = f"""## ✅ Feature Extraction Complete!
+
+### 🎨 Selected Feature Types:
+{feature_summary}

### 📊 Statistics:
- **Total Files**: {stats['total_files']}
- **Successfully Processed**: {len(df)}
- **Failed**: {len(failed_files)}
+- **Features per File**: {total_features}
+- **Output CSV**: `{config.FEATURES_CSV}`
+- **Config File**: `{config_file}`

### 🎭 Emotion Distribution:
{df['emotion'].value_counts().to_string()}

### 👥 Actors: {stats['n_actors']}

+✅ **Ready for training! Go to Tab 2.**"""

        if failed_files:
            summary += f"\n\n### ⚠️ Failed Files ({len(failed_files)}):\n"
@@ -124,56 +191,66 @@ def check_dataset_status():
    audio_files, error = scan_dataset_directory()

    if error:
+        return f"## ⚠️ Dataset Not Found\n\n{error}\n\n**Please upload RAVDESS dataset to the correct location.**"

    stats = get_dataset_statistics(audio_files)

+    config_file = Path(config.FEATURES_CSV).with_suffix('.json')
+    existing_config = None

+    if config_file.exists():
+        try:
+            with open(config_file, 'r') as f:
+                existing_config = json.load(f)
+        except:
+            pass

+    status = f"## ✅ Dataset Found!\n\n### 📊 Statistics:\n- **Total Files**: {stats['total_files']}\n- **Location**: `{config.DATA_DIR}`\n\n### 🎭 Emotions:\n"

    for emotion, count in sorted(stats['emotion_counts'].items()):
        status += f"- **{emotion.capitalize()}**: {count} files\n"

+    status += f"\n### 👥 Actors: {stats['n_actors']}\n"

+    if existing_config:
+        status += f"\n---\n\n### 📋 Previous Extraction Found:\n- **Feature Types**: {', '.join(existing_config.get('feature_types', []))}\n- **Total Features**: {existing_config.get('total_features', 'Unknown')}\n- **MFCC Count**: {existing_config.get('n_mfcc', 'N/A')}\n- **Samples**: {existing_config.get('n_samples', 'Unknown')}\n\n**Note**: Re-extracting will overwrite previous features."
+    else:
+        status += '\n**Select feature types and click "🔊 Extract Features".**'

    return status


+def preview_single_audio(audio_file, zcr, chroma, mfcc, rms, mel, n_mfcc):
+    """Preview single audio file with selected features"""
    if audio_file is None:
        return "Please upload an audio file", None, None

+    feature_types = []
+    if zcr:
+        feature_types.append('zcr')
+    if chroma:
+        feature_types.append('chroma')
+    if mfcc:
+        feature_types.append('mfcc')
+    if rms:
+        feature_types.append('rms')
+    if mel:
+        feature_types.append('mel')
+
+    if len(feature_types) == 0:
+        return "❌ Please select at least one feature type!", None, None
+
    try:
+        features, y, sr, feature_info = extract_features(
+            audio_file, feature_types=feature_types, n_mfcc=n_mfcc)

+        feature_breakdown_lines = []
+        for ftype, count in feature_info['counts'].items():
+            feature_breakdown_lines.append(
+                f"- **{ftype.upper()}**: {count} features")
+        feature_breakdown = "\n".join(feature_breakdown_lines)

+        summary = f"## 🔍 Single File Preview\n\n- **File**: {Path(audio_file).name}\n- **Sample Rate**: {sr} Hz\n- **Duration**: {len(y)/sr:.2f}s\n- **Emotion**: {extract_emotion_from_filename(Path(audio_file).name)}\n\n### 🎨 Extracted Features:\n{feature_breakdown}\n\n**Total Features**: {feature_info['total']}"

        waveform = create_waveform_plot(y, sr)
        spectrogram = create_spectrogram_plot(y, sr)
@@ -186,41 +263,58 @@ def preview_single_audio(audio_file):


def create_tab1():
+    """Create Tab 1: Feature Extraction with Feature Type Selection"""

    with gr.Tab("1️⃣ Feature Extraction"):
+        gr.Markdown(
+            "## 📁 Extract Features from Dataset\n\n**Select which feature types to extract:**")

        with gr.Row():
            with gr.Column(scale=1):
+                gr.Markdown("### 🎨 Feature Types")

+                with gr.Group():
+                    zcr_check = gr.Checkbox(
+                        label="🌊 ZCR - Zero Crossing Rate (1 feature)", value=True, info="Signal sign change frequency")
+                    chroma_check = gr.Checkbox(
+                        label="🎵 Chroma STFT (12 features)", value=True, info="Pitch class distribution")
+                    mfcc_check = gr.Checkbox(label="🎤 MFCC (20-40 features, configurable below)",
+                                             value=True, info="Mel-frequency cepstral coefficients - MOST IMPORTANT")

+                    n_mfcc_slider = gr.Slider(minimum=config.MFCC_MIN, maximum=config.MFCC_MAX, value=config.MFCC_DEFAULT, step=1,
+                                              label="Number of MFCC Coefficients", info="More MFCC = more detail but slower extraction", visible=True)
+
+                    rms_check = gr.Checkbox(
+                        label="📊 RMS Energy (1 feature)", value=True, info="Signal amplitude/loudness")
+                    mel_check = gr.Checkbox(
+                        label="🎹 Mel Spectrogram (128 features)", value=True, info="Frequency distribution over time")
+
+                feature_count_display = gr.Markdown(calculate_feature_count(
+                    True, True, True, True, True, config.MFCC_DEFAULT))

+                for control in [zcr_check, chroma_check, mfcc_check, rms_check, mel_check, n_mfcc_slider]:
+                    control.change(fn=calculate_feature_count, inputs=[
+                                   zcr_check, chroma_check, mfcc_check, rms_check, mel_check, n_mfcc_slider], outputs=[feature_count_display])
+
+                def toggle_mfcc_slider(mfcc_enabled):
+                    return gr.update(visible=mfcc_enabled)
+
+                mfcc_check.change(fn=toggle_mfcc_slider, inputs=[
+                                  mfcc_check], outputs=[n_mfcc_slider])
+
+                gr.Markdown("---")
+                check_btn = gr.Button("🔄 Check Dataset Status", size="sm")
+                gr.Markdown("---")
+                extract_btn = gr.Button(
+                    "🔊 Extract Features", variant="primary", size="lg")
+                gr.Markdown(
+                    "---\n### 🔍 Preview Single Audio\n\nTest feature extraction on one file.")
                preview_audio = gr.Audio(
+                    sources=["upload"], type="filepath", label="Upload Single File")
                preview_btn = gr.Button("Preview Features")

+                gr.Markdown("---\n### 💡 Feature Selection Tips\n\n**All Features (162):**\n- MFCC: 20 (default)\n- Most balanced\n- ~87-90% accuracy\n\n**MFCC Only (20):**\n- Fast extraction\n- Good baseline\n- ~80-85% accuracy\n\n---\n\n### 📋 Output Files:\n- **CSV**: `features_ravdess.csv` (data)\n- **JSON**: `features_ravdess.json` (config)")
+
            with gr.Column(scale=2):
                output_text = gr.Markdown()
                preview_df = gr.Dataframe(label="Dataset Preview")
@@ -230,19 +324,8 @@ def create_tab1():
                waveform_plot = gr.Plot(label="Waveform")
                spectrogram_plot = gr.Plot(label="Spectrogram")

+        check_btn.click(fn=check_dataset_status, outputs=[output_text])
+        extract_btn.click(fn=extract_dataset_features, inputs=[
+                          zcr_check, chroma_check, mfcc_check, rms_check, mel_check, n_mfcc_slider], outputs=[output_text, preview_df, emotion_dist])
+        preview_btn.click(fn=preview_single_audio, inputs=[preview_audio, zcr_check, chroma_check, mfcc_check,
+                                                           rms_check, mel_check, n_mfcc_slider], outputs=[output_text, waveform_plot, spectrogram_plot])
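The new tab derives its displayed total from `get_feature_count`, whose implementation lives in `src/feature_extraction.py` and is not shown in this excerpt. A stand-in sketch of the arithmetic implied by the checkbox labels above (1 ZCR + 12 chroma + n MFCC + 1 RMS + 128 mel), which with the default 20 MFCCs reproduces the 162-feature layout:

```python
# Rough stand-in for src.feature_extraction.get_feature_count, not the real code:
# it simply sums the per-type sizes listed in the UI above.
FEATURE_SIZES = {'zcr': 1, 'chroma': 12, 'rms': 1, 'mel': 128}  # mfcc size is configurable

def get_feature_count(feature_types, n_mfcc=20):
    total = 0
    for ftype in feature_types:
        total += n_mfcc if ftype == 'mfcc' else FEATURE_SIZES[ftype]
    return total

# All five types with the default 20 MFCCs give the full 162-feature vector.
assert get_feature_count(['zcr', 'chroma', 'mfcc', 'rms', 'mel']) == 1 + 12 + 20 + 1 + 128 == 162
```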
src/ui/tab2_training.py
CHANGED
@@ -1,30 +1,85 @@
"""
-Tab 2: Model Training UI
"""

import gradio as gr
from src.training import train_models_with_ga


def create_tab2():
    """Create Tab 2: Model Training"""
-
    with gr.Tab("2️⃣ Model Training"):
        gr.Markdown("""
        ## 🧬 Train Models with Genetic Algorithm

-        Optimize
        """)
-
        with gr.Row():
            with gr.Column(scale=1):
-                # Cross-Validation Toggle
                use_cv = gr.Checkbox(
                    label="🔄 Use K-Fold Cross-Validation",
                    value=False,
                    info="More reliable evaluation but slower (recommended for final training)"
                )
-
                n_folds = gr.Slider(
                    minimum=3,
                    maximum=10,
@@ -34,34 +89,63 @@ def create_tab2():
                    info="More folds = more reliable but slower",
                    visible=False
                )
-
                gr.Markdown("---")
-
-                # GA Toggle
                use_ga = gr.Checkbox(
                    label="🧬 Use Genetic Algorithm Optimization",
                    value=True,
-                    info="GA optimizes
                )
-
                ga_generations = gr.Slider(
                    minimum=5,
                    maximum=50,
-                    value=
                    step=5,
                    label="GA Generations",
                    info="More generations = better optimization but slower"
                )
-
                ga_population = gr.Slider(
                    minimum=5,
                    maximum=30,
-                    value=
                    step=5,
                    label="GA Population Size",
                    info="Larger population = more exploration but slower"
                )
-
                n_jobs = gr.Slider(
                    minimum=1,
                    maximum=8,
@@ -70,78 +154,113 @@ def create_tab2():
                    label="Parallel Jobs",
                    info="Number of CPU cores (2 for free tier, 4+ for better hardware)"
                )
-
-
                def toggle_cv_params(use_cv_val):
                    return gr.update(visible=use_cv_val)
-
                def toggle_ga_params(use_ga_val):
                    return (
-                        gr.update(visible=
-                        gr.update(visible=
-                        gr.update(visible=use_ga_val)
                    )
-
                use_cv.change(
                    fn=toggle_cv_params,
                    inputs=[use_cv],
                    outputs=[n_folds]
                )
-
                use_ga.change(
                    fn=toggle_ga_params,
                    inputs=[use_ga],
-                    outputs=[
                )
-
                gr.Markdown("---")
-
                train_btn = gr.Button(
                    "🚀 Start Training",
                    variant="primary",
                    size="lg"
                )
-
                gr.Markdown("""
                ### 🔬 Training Modes:

-                **
-                -
-                -
-                -
-
-                **K-Fold CV (Recommended):**
-                - ✓ Reliable accuracy estimate
-                - ✓ Mean ± Std reported
-                - ✓ Detects overfitting
-                - ⚠️ Slower (5x longer)

-
-                -
-                - ✅
-                -
-                -

-

-
-                - With GA (Parallel): 15-30 minutes
-                - Without GA: 5-10 minutes

-
-                - With GA (Parallel): 75-150 minutes
-                - Without GA: 25-50 minutes

-                **
-                -
-                -

-

-
-
                """)
-
            with gr.Column(scale=2):
                training_output = gr.Markdown()
                results_table = gr.Dataframe(
@@ -152,7 +271,7 @@ def create_tab2():
                    label="GA Evolution History / CV Statistics",
                    headers=None
                )
-
                with gr.Accordion("📜 Detailed Training Log", open=False):
                    training_log = gr.Textbox(
                        label="Training Log",
@@ -161,119 +280,59 @@ def create_tab2():
                        interactive=False,
                        show_copy_button=True
                    )
-
-
-                with gr.Accordion("ℹ️ Understanding Cross-Validation", open=False):
                    gr.Markdown("""
-                    ##
-
-                    ### How it works:
-
-                    1. **Split data into K folds** (e.g., 5 folds)
-                    2. **Train K times**, each time using:
-                       - K-1 folds for training
-                       - 1 fold for testing
-                    3. **Average results** across all folds

-                    ###
-                    Fold 1: Train on [2,3,4,5], Test on [1]
-                    Fold 2: Train on [1,3,4,5], Test on [2]
-                    Fold 3: Train on [1,2,4,5], Test on [3]
-                    Fold 4: Train on [1,2,3,5], Test on [4]
-                    Fold 5: Train on [1,2,3,4], Test on [5]
-
-                    Final Result: Average of all 5 test accuracies
-
-                    ### Why use CV?
-
-                    ✅ **More Reliable**: Every sample is tested exactly once

-

-

-

-                    ###

-                    **
-                    -
-                    -
-                    - Interpretation: Stable, reliable performance ✓

-                    **
-                    -
-                    -
-                    - Interpretation: Unstable, unreliable ✗

-
-                    - Mean: 75.0%
-                    - Std: 0.3%
-                    - Interpretation: Stable but poor performance ✗

-
-
-                    | Scenario | Recommendation |
                    |----------|---------------|
-                    |
-                    |
-                    |
-                    | Large dataset (>100k samples) | Single Split |
-                    | Publication/Research | 5 or 10-Fold CV |
-                    """)
-
-                with gr.Accordion("🧬 Understanding Genetic Algorithm", open=False):
-                    gr.Markdown("""
-                    ## 🧬 What does GA optimize?
-
-                    ### 1. Feature Selection (80/162)
-                    - Finds best combination of audio features
-                    - Removes redundant/noisy features
-                    - Reduces overfitting
-
-                    ### 2. Hyperparameters
-                    - **XGBoost**: n_estimators, max_depth, learning_rate
-                    - **LightGBM**: n_estimators, num_leaves, learning_rate
-                    - **Gradient Boosting**: n_estimators, max_depth, learning_rate
-                    - **AdaBoost**: n_estimators, learning_rate
-
-                    ### 3. Ensemble Weights
-                    - Optimal weights for combining models
-                    - NOT equal weights [0.25, 0.25, 0.25, 0.25]
-                    - NOT accuracy-based weights
-                    - Learned from validation performance
-
-                    ### How GA works:
-                    1. Create random population (15 solutions)
-                    2. Evaluate fitness (train models, measure accuracy)
-                    3. Select best solutions (tournament selection)
-                    4. Create offspring (crossover + mutation)
-                    5. Repeat for 20 generations
-                    6. Return best solution found
-
-                    ### Why GA vs Grid Search?
-
-                    **Grid Search:**
-                    - Tests every combination
-                    - Very slow (days for this problem)
-                    - Guarantees finding best in grid
-
-                    **Genetic Algorithm:**
-                    - Intelligent search (evolutionary)
-                    - Fast (minutes to hours)
-                    - Finds near-optimal solution
-                    - Can optimize multiple objectives
-
-                    ### Typical Improvement:
-
-                    - **Without GA**: 82-85% accuracy
-                    - **With GA**: 87-90% accuracy
-                    - **Gain**: +5% absolute improvement
                    """)
-
-        # Event handler
        train_btn.click(
            fn=train_models_with_ga,
-            inputs=[
-
-
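The accordion text removed above describes plain K-fold evaluation: split the data into K folds, train K times with a different held-out fold each time, and average the per-fold test accuracies. A generic scikit-learn sketch of that procedure (illustrative only; the project's actual CV loop lives in `src/training.py`, which this excerpt does not show):

```python
# Generic K-fold accuracy estimate with scikit-learn, matching the
# "train K times, test each fold once, average" description above.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

def kfold_accuracy(X, y, n_folds=5):
    """Return (mean, std) test accuracy over n_folds stratified folds."""
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    scores = cross_val_score(GradientBoostingClassifier(), X, y,
                             cv=cv, scoring='accuracy')
    return scores.mean(), scores.std()
```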
"""
+Tab 2: Model Training UI with K-Fold Cross-Validation and GA Feature Selection
"""

import gradio as gr
from src.training import train_models_with_ga
+import config
+
+
+def calculate_ga_feature_info(optimize_features, n_features_select, total_features=162):
+    """Calculate and display GA feature selection info"""
+    if optimize_features:
+        percentage = (n_features_select / total_features *
+                      100) if total_features > 0 else 0
+        return f"""
+### 🧬 GA Feature Selection: **ENABLED**
+
+GA will optimize:
+1. **Which specific features to use**: {n_features_select}/{total_features} ({percentage:.1f}%)
+2. **Model hyperparameters** (all 4 models)
+3. **Ensemble weights**
+
+**Search Space:**
+- Feature combinations: C({total_features}, {n_features_select}) = Very Large!
+- Plus hyperparameter combinations
+- Total optimization space: **MASSIVE**
+
+**Expected:** GA will find optimal feature subset + model configurations
+"""
+    else:
+        return f"""
+### 🧬 GA Feature Selection: **DISABLED**
+
+GA will optimize:
+- **Model hyperparameters ONLY** (all 4 models)
+- **Ensemble weights**
+
+**Note:** All {total_features} extracted features will be used (no feature selection)
+
+This is faster but may include noisy/redundant features.
+"""
+
+
+def update_feature_slider_max(csv_path='features_ravdess.csv'):
+    """Update slider maximum based on extracted features"""
+    import pandas as pd
+    import os
+
+    if not os.path.exists(csv_path):
+        return gr.update(maximum=162, value=100)
+
+    try:
+        df = pd.read_csv(csv_path)
+        feature_cols = [col for col in df.columns if col.startswith(
+            'feature_') and col.replace('feature_', '').isdigit()]
+        n_features = len(feature_cols)
+
+        default_select = min(100, int(n_features * 0.7))
+
+        return gr.update(maximum=n_features, value=default_select, label=f"Features to Select (Max: {n_features})")
+    except:
+        return gr.update(maximum=162, value=100)


def create_tab2():
    """Create Tab 2: Model Training"""
+
    with gr.Tab("2️⃣ Model Training"):
        gr.Markdown("""
        ## 🧬 Train Models with Genetic Algorithm

+        Optimize hyperparameters and optionally feature selection.
        """)
+
        with gr.Row():
            with gr.Column(scale=1):
                use_cv = gr.Checkbox(
                    label="🔄 Use K-Fold Cross-Validation",
                    value=False,
                    info="More reliable evaluation but slower (recommended for final training)"
                )
+
                n_folds = gr.Slider(
                    minimum=3,
                    maximum=10,
@@ -34,34 +89,63 @@ def create_tab2():
                    info="More folds = more reliable but slower",
                    visible=False
                )
+
                gr.Markdown("---")
+
                use_ga = gr.Checkbox(
                    label="🧬 Use Genetic Algorithm Optimization",
                    value=True,
+                    info="GA optimizes hyperparameters + optionally features"
                )
+
+                optimize_features = gr.Checkbox(
+                    label="✨ GA Optimize Feature Selection",
+                    value=True,
+                    info="Let GA select best feature subset (recommended)"
+                )
+
+                n_features_select = gr.Slider(
+                    minimum=10,
+                    maximum=162,
+                    value=100,
+                    step=5,
+                    label="Features to Select (Max: 162)",
+                    info="Number of features GA will select from extracted features",
+                    visible=True
+                )
+
+                update_slider_btn = gr.Button(
+                    "🔄 Update from Extracted Features",
+                    size="sm",
+                    visible=True
+                )
+
+                update_slider_btn.click(
+                    fn=update_feature_slider_max,
+                    inputs=[],
+                    outputs=[n_features_select]
+                )
+
+                gr.Markdown("---")
+
                ga_generations = gr.Slider(
                    minimum=5,
                    maximum=50,
+                    value=30,
                    step=5,
                    label="GA Generations",
                    info="More generations = better optimization but slower"
                )
+
                ga_population = gr.Slider(
                    minimum=5,
                    maximum=30,
+                    value=20,
                    step=5,
                    label="GA Population Size",
                    info="Larger population = more exploration but slower"
                )
+
                n_jobs = gr.Slider(
                    minimum=1,
                    maximum=8,
@@ -70,78 +154,113 @@ def create_tab2():
                    label="Parallel Jobs",
                    info="Number of CPU cores (2 for free tier, 4+ for better hardware)"
                )
+
+                ga_feature_info = gr.Markdown(
+                    calculate_ga_feature_info(True, 100, 162)
+                )
+
+                def update_ga_info_wrapper(opt_feat, n_feat):
+                    import pandas as pd
+                    import os
+                    total = 162
+                    if os.path.exists(config.FEATURES_CSV):
+                        try:
+                            df = pd.read_csv(config.FEATURES_CSV)
+                            feature_cols = [col for col in df.columns if col.startswith(
+                                'feature_') and col.replace('feature_', '').isdigit()]
+                            total = len(feature_cols)
+                        except:
+                            pass
+                    return calculate_ga_feature_info(opt_feat, n_feat, total)
+
+                optimize_features.change(
+                    fn=update_ga_info_wrapper,
+                    inputs=[optimize_features, n_features_select],
+                    outputs=[ga_feature_info]
+                )
+
+                n_features_select.change(
+                    fn=update_ga_info_wrapper,
+                    inputs=[optimize_features, n_features_select],
+                    outputs=[ga_feature_info]
+                )
+
                def toggle_cv_params(use_cv_val):
                    return gr.update(visible=use_cv_val)
+
                def toggle_ga_params(use_ga_val):
+                    return tuple([gr.update(visible=use_ga_val)] * 6)
+
+                def toggle_feature_slider(opt_feat_val):
                    return (
+                        gr.update(visible=opt_feat_val),
+                        gr.update(visible=opt_feat_val)
                    )
+
                use_cv.change(
                    fn=toggle_cv_params,
                    inputs=[use_cv],
                    outputs=[n_folds]
                )
+
                use_ga.change(
                    fn=toggle_ga_params,
                    inputs=[use_ga],
+                    outputs=[optimize_features, n_features_select,
+                             update_slider_btn, ga_generations, ga_population, n_jobs]
                )
+
+                optimize_features.change(
+                    fn=toggle_feature_slider,
+                    inputs=[optimize_features],
+                    outputs=[n_features_select, update_slider_btn]
+                )
+
                gr.Markdown("---")
+
                train_btn = gr.Button(
                    "🚀 Start Training",
                    variant="primary",
                    size="lg"
                )
+
                gr.Markdown("""
                ### 🔬 Training Modes:

+                **Mode 1: Full GA (Recommended)**
+                - ✅ GA Feature Selection: ON
+                - ✅ GA Hyperparameter Tuning: ON
+                - ⏱️ Time: 60-120 min
+                - 🎯 Best accuracy

+                **Mode 2: GA Hyperparameters Only**
+                - ❌ GA Feature Selection: OFF
+                - ✅ GA Hyperparameter Tuning: ON
+                - ⏱️ Time: 30-60 min
+                - 🎯 Good accuracy, faster

+                **Mode 3: No GA (Fast)**
+                - ❌ GA: OFF
+                - ⏱️ Time: 5-10 min
+                - 🎯 Baseline accuracy

+                ---

+                ### 💡 Feature Selection Tips:

+                **Many features (>100):**
+                - Select 60-80%
+                - GA finds most informative

+                **Few features (<50):**
+                - Use all features
+                - Disable feature selection

+                **Medium features (50-100):**
+                - Select 70-90%
+                - Balance info and speed
                """)
+
            with gr.Column(scale=2):
                training_output = gr.Markdown()
                results_table = gr.Dataframe(
@@ -152,7 +271,7 @@ def create_tab2():
                    label="GA Evolution History / CV Statistics",
                    headers=None
                )
+
                with gr.Accordion("📜 Detailed Training Log", open=False):
                    training_log = gr.Textbox(
                        label="Training Log",
@@ -161,119 +280,59 @@ def create_tab2():
                        interactive=False,
                        show_copy_button=True
                    )
+
+                with gr.Accordion("ℹ️ Understanding Feature Selection", open=False):
                    gr.Markdown("""
+                    ## 🎯 What is Feature Selection?

+                    ### Why select features?

+                    **Too many features can cause:**
+                    - ❌ **Overfitting**: Model memorizes noise
+                    - ❌ **Curse of dimensionality**: Need exponentially more data
+                    - ❌ **Slow training**: More features = more computation
+                    - ❌ **Redundancy**: Correlated features don't add info

+                    ### How GA selects features:

+                    Evolution process finds optimal feature subset through:
+                    - Random initialization
+                    - Fitness evaluation (accuracy)
+                    - Selection (keep best)
+                    - Crossover (combine good solutions)
+                    - Mutation (explore new combinations)

+                    ### Example Results:

+                    **Full features (162):**
+                    - Accuracy: 87%
+                    - Training time: 60 min

+                    **GA selected (80 features):**
+                    - Accuracy: 90% ✓ (better!)
+                    - Training time: 40 min ✓ (faster!)

+                    ### When to use:

+                    | Features | Recommendation |
                    |----------|---------------|
+                    | >100 | ✅ Use GA (60-80%) |
+                    | 50-100 | ✅ Optional (70-90%) |
+                    | <50 | ❌ Use all features |
                    """)
+
        train_btn.click(
            fn=train_models_with_ga,
+            inputs=[
+                use_ga,
+                use_cv,
+                n_folds,
+                ga_generations,
+                ga_population,
+                n_jobs,
+                optimize_features,
+                n_features_select
+            ],
+            outputs=[training_output, results_table,
+                     ga_history_table, training_log]
+        )