Ahmedik95316 committed on
Commit
0908ace
·
1 Parent(s): 113fca9

Update model/train.py

Browse files

Adding LightGBM for Ensemble Model

Files changed (1) hide show
  1. model/train.py +347 -55
model/train.py CHANGED
@@ -1,4 +1,4 @@
1
- # Enhanced version with comprehensive cross-validation and advanced feature engineering
2
 
3
  import seaborn as sns
4
  import matplotlib.pyplot as plt
@@ -14,9 +14,10 @@ from sklearn.model_selection import (
14
  train_test_split, cross_val_score, GridSearchCV,
15
  StratifiedKFold, validation_curve, cross_validate
16
  )
17
- from sklearn.ensemble import RandomForestClassifier
18
  from sklearn.linear_model import LogisticRegression
19
  from sklearn.feature_extraction.text import TfidfVectorizer
 
20
  import pandas as pd
21
  import numpy as np
22
  from pathlib import Path
@@ -31,6 +32,7 @@ from datetime import datetime, timedelta
31
  from typing import Dict, Tuple, Optional, Any, List
32
  import warnings
33
  import re
 
34
  warnings.filterwarnings('ignore')
35
 
36
  # Import enhanced feature engineering components
@@ -161,7 +163,7 @@ class ProgressTracker:
161
 
162
 
163
  def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5,
164
- use_enhanced_features: bool = False) -> Dict:
165
  """Estimate training time based on dataset characteristics and feature complexity"""
166
 
167
  # Base time estimates (in seconds) based on empirical testing
@@ -180,12 +182,16 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
180
  base_times['feature_selection'] *= 2.0 # More features to select from
181
  base_times['enhanced_feature_extraction'] = max(2.0, dataset_size * 0.05) # New step
182
 
183
- # Hyperparameter tuning multipliers
184
  tuning_multipliers = {
185
  'logistic_regression': 8 if enable_tuning else 1, # 8 param combinations
186
  'random_forest': 12 if enable_tuning else 1, # 12 param combinations
 
187
  }
188
 
 
 
 
189
  # Cross-validation multiplier
190
  cv_multiplier = cv_folds if dataset_size > 100 else 1
191
 
@@ -202,7 +208,7 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
202
 
203
  estimates['feature_selection'] = base_times['feature_selection']
204
 
205
- # Model training (now includes CV)
206
  for model_name, multiplier in tuning_multipliers.items():
207
  model_time = base_times['simple_training'] * multiplier * cv_multiplier
208
  estimates[f'{model_name}_training'] = model_time
@@ -211,14 +217,19 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
211
  # Cross-validation overhead
212
  estimates['cross_validation'] = base_times['simple_training'] * cv_folds * 0.5
213
 
 
 
 
 
 
214
  # Model saving
215
  estimates['model_saving'] = 1.0
216
 
217
  # Total estimate
218
- total_estimate = sum(estimates.values())
219
 
220
- # Add buffer for overhead (more for enhanced features)
221
- buffer_multiplier = 1.4 if use_enhanced_features else 1.2
222
  total_estimate *= buffer_multiplier
223
 
224
  return {
@@ -228,7 +239,8 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
228
  'dataset_size': dataset_size,
229
  'enable_tuning': enable_tuning,
230
  'cv_folds': cv_folds,
231
- 'use_enhanced_features': use_enhanced_features
 
232
  }
233
 
234
 
@@ -288,7 +300,7 @@ class CrossValidationManager:
288
  cv=cv_strategy,
289
  scoring=scoring_metrics,
290
  return_train_score=True,
291
- n_jobs=1, # Use single job for stability
292
  verbose=0
293
  )
294
 
@@ -381,7 +393,6 @@ class CrossValidationManager:
381
  scores2 = results2['test_scores'][metric]['scores']
382
 
383
  # Paired t-test
384
- from scipy import stats
385
  t_stat, p_value = stats.ttest_rel(scores1, scores2)
386
 
387
  comparison = {
@@ -406,21 +417,135 @@ class CrossValidationManager:
406
  return {'error': str(e)}
407
 
408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  class EnhancedModelTrainer:
410
- """Production-ready model trainer with enhanced feature engineering and comprehensive CV"""
411
 
412
- def __init__(self, use_enhanced_features: bool = None):
413
  # Auto-detect enhanced features if not specified
414
  if use_enhanced_features is None:
415
  self.use_enhanced_features = ENHANCED_FEATURES_AVAILABLE
416
  else:
417
  self.use_enhanced_features = use_enhanced_features and ENHANCED_FEATURES_AVAILABLE
418
 
 
419
  self.setup_paths()
420
  self.setup_training_config()
421
  self.setup_models()
422
  self.progress_tracker = None
423
  self.cv_manager = CrossValidationManager()
 
424
 
425
  # Enhanced feature tracking
426
  self.feature_engineer = None
@@ -480,14 +605,14 @@ class EnhancedModelTrainer:
480
  self.class_weight = 'balanced'
481
 
482
  def setup_models(self):
483
- """Setup model configurations for comparison"""
484
  self.models = {
485
  'logistic_regression': {
486
  'model': LogisticRegression(
487
  max_iter=self.max_iter,
488
  class_weight=self.class_weight,
489
  random_state=self.random_state,
490
- n_jobs=-1
491
  ),
492
  'param_grid': {
493
  'model__C': [0.1, 1, 10],
@@ -496,15 +621,34 @@ class EnhancedModelTrainer:
496
  },
497
  'random_forest': {
498
  'model': RandomForestClassifier(
499
- n_estimators=50,
500
  class_weight=self.class_weight,
501
  random_state=self.random_state,
502
- n_jobs=-1
503
  ),
504
  'param_grid': {
505
  'model__n_estimators': [50, 100],
506
  'model__max_depth': [10, None]
507
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  }
509
  }
510
 
@@ -752,7 +896,7 @@ class EnhancedModelTrainer:
752
  param_grid,
753
  cv=cv_strategy,
754
  scoring='f1_weighted',
755
- n_jobs=1, # Single job for stability
756
  verbose=0, # Reduce verbosity for speed
757
  return_train_score=True # For overfitting analysis
758
  )
@@ -813,9 +957,10 @@ class EnhancedModelTrainer:
813
  raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
814
 
815
  def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
816
- """Train and evaluate multiple models with enhanced features and comprehensive CV"""
817
 
818
  results = {}
 
819
 
820
  for model_name in self.models.keys():
821
  logger.info(f"Training {model_name} with {'enhanced' if self.use_enhanced_features else 'standard'} features...")
@@ -843,6 +988,9 @@ class EnhancedModelTrainer:
843
  'feature_type': 'enhanced' if self.use_enhanced_features else 'standard'
844
  }
845
 
 
 
 
846
  # Log results
847
  test_f1 = evaluation_metrics['f1']
848
  cv_results = evaluation_metrics.get('cross_validation', {})
@@ -857,10 +1005,68 @@ class EnhancedModelTrainer:
857
  logger.error(f"Training failed for {model_name}: {str(e)}")
858
  results[model_name] = {'error': str(e)}
859
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
860
  return results
861
 
862
  def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
863
- """Select the best performing model based on CV results"""
864
 
865
  if self.progress_tracker:
866
  self.progress_tracker.update("Selecting best model")
@@ -870,28 +1076,50 @@ class EnhancedModelTrainer:
870
  best_score = -1
871
  best_metrics = None
872
 
873
- for model_name, result in results.items():
874
- if 'error' in result:
875
- continue
876
-
877
- # Prioritize CV F1 score if available, fallback to test F1
878
- cv_results = result['evaluation_metrics'].get('cross_validation', {})
879
- if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
880
- f1_score = cv_results['test_scores']['f1']['mean']
881
- score_type = "CV F1"
882
- else:
883
- f1_score = result['evaluation_metrics']['f1']
884
- score_type = "Test F1"
885
-
886
- if f1_score > best_score:
887
- best_score = f1_score
888
- best_model_name = model_name
889
- best_model = result['model']
890
- best_metrics = result['evaluation_metrics']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
891
 
892
  if best_model_name is None:
893
  raise ValueError("No models trained successfully")
894
 
 
895
  logger.info(f"Best model: {best_model_name} with {score_type} score: {best_score:.4f}")
896
  return best_model_name, best_model, best_metrics
897
 
@@ -940,6 +1168,10 @@ class EnhancedModelTrainer:
940
  }
941
  joblib.dump(enhanced_ref, self.vectorizer_path)
942
  logger.info(f"✅ Saved enhanced features reference to {self.vectorizer_path}")
 
 
 
 
943
 
944
  except Exception as e:
945
  logger.warning(f"Could not save individual components: {e}")
@@ -954,6 +1186,7 @@ class EnhancedModelTrainer:
954
  metadata = {
955
  'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
956
  'model_type': model_name,
 
957
  'feature_engineering': {
958
  'type': 'enhanced' if self.use_enhanced_features else 'standard',
959
  'enhanced_features_available': ENHANCED_FEATURES_AVAILABLE,
@@ -973,7 +1206,8 @@ class EnhancedModelTrainer:
973
  'max_features': self.max_features,
974
  'ngram_range': self.ngram_range,
975
  'feature_selection_k': self.feature_selection_k,
976
- 'use_enhanced_features': self.use_enhanced_features
 
977
  }
978
  }
979
 
@@ -1030,6 +1264,18 @@ class EnhancedModelTrainer:
1030
  metadata['cv_accuracy_mean'] = cv_results['test_scores']['accuracy']['mean']
1031
  metadata['cv_accuracy_std'] = cv_results['test_scores']['accuracy']['std']
1032
 
 
 
 
 
 
 
 
 
 
 
 
 
1033
  # Add model comparison results if available
1034
  if len(results) > 1:
1035
  model_comparison = {}
@@ -1058,6 +1304,10 @@ class EnhancedModelTrainer:
1058
  for feature_type, count in feature_metadata.get('feature_types', {}).items():
1059
  logger.info(f" {feature_type}: {count}")
1060
 
 
 
 
 
1061
  logger.info(f"✅ Model artifacts saved successfully with {'enhanced' if self.use_enhanced_features else 'standard'} features")
1062
  return True
1063
 
@@ -1072,18 +1322,22 @@ class EnhancedModelTrainer:
1072
  logger.error(f"Failed to save backup pipeline: {str(e2)}")
1073
  return False
1074
 
1075
- def train_model(self, data_path: str = None, force_enhanced: bool = None) -> Tuple[bool, str]:
1076
- """Main training function with enhanced feature engineering pipeline"""
1077
  try:
1078
- # Override enhanced features setting if specified
1079
  if force_enhanced is not None:
1080
  original_setting = self.use_enhanced_features
1081
  self.use_enhanced_features = force_enhanced and ENHANCED_FEATURES_AVAILABLE
1082
  if force_enhanced and not ENHANCED_FEATURES_AVAILABLE:
1083
  logger.warning("Enhanced features requested but not available, using standard features")
1084
 
 
 
 
1085
  feature_type = "enhanced" if self.use_enhanced_features else "standard"
1086
- logger.info(f"Starting {feature_type} model training with cross-validation...")
 
1087
 
1088
  # Override data path if provided
1089
  if data_path:
@@ -1099,24 +1353,27 @@ class EnhancedModelTrainer:
1099
  len(df),
1100
  enable_tuning=True,
1101
  cv_folds=self.cv_folds,
1102
- use_enhanced_features=self.use_enhanced_features
 
1103
  )
1104
 
1105
  print(f"\n📊 Enhanced Training Configuration:")
1106
  print(f"Dataset size: {len(df)} samples")
1107
  print(f"Feature engineering: {feature_type.title()}")
1108
  print(f"Cross-validation folds: {self.cv_folds}")
 
 
1109
  print(f"Estimated time: {time_estimate['total_formatted']}")
1110
- print(f"Models to train: {len(self.models)}")
1111
  print(f"Hyperparameter tuning: Enabled")
1112
  if self.use_enhanced_features:
1113
  print(f"Enhanced features: Sentiment, Readability, Entities, Linguistic")
1114
  print()
1115
 
1116
- # Setup progress tracker (adjusted for enhanced features)
1117
  base_steps = 4 + (len(self.models) * 3) + 1 # Basic steps
1118
  enhanced_steps = 2 if self.use_enhanced_features else 0 # Feature engineering steps
1119
- total_steps = base_steps + enhanced_steps
 
1120
  self.progress_tracker = ProgressTracker(total_steps, f"{feature_type.title()} Training Progress")
1121
 
1122
  # Prepare data
@@ -1152,10 +1409,10 @@ class EnhancedModelTrainer:
1152
  if len(X_test) < 1:
1153
  return False, "Cannot create test set. Dataset too small."
1154
 
1155
- # Train and evaluate models with enhanced features
1156
  results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
1157
 
1158
- # Select best model
1159
  best_model_name, best_model, best_metrics = self.select_best_model(results)
1160
 
1161
  # Save model artifacts with enhanced feature information
@@ -1165,7 +1422,7 @@ class EnhancedModelTrainer:
1165
  # Finish progress tracking
1166
  self.progress_tracker.finish()
1167
 
1168
- # Create success message with enhanced feature information
1169
  cv_results = best_metrics.get('cross_validation', {})
1170
  cv_info = ""
1171
  if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
@@ -1180,9 +1437,14 @@ class EnhancedModelTrainer:
1180
  if feature_metadata:
1181
  total_features = feature_metadata.get('total_features', 0)
1182
  feature_info = f", Enhanced Features: {total_features}"
 
 
 
 
 
1183
 
1184
  success_message = (
1185
- f"{feature_type.title()} model training completed successfully. "
1186
  f"Best model: {best_model_name} "
1187
  f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info}{feature_info})"
1188
  )
@@ -1193,22 +1455,24 @@ class EnhancedModelTrainer:
1193
  except Exception as e:
1194
  if self.progress_tracker:
1195
  print() # New line after progress bar
1196
- error_message = f"Enhanced model training failed: {str(e)}"
1197
  logger.error(error_message)
1198
  return False, error_message
1199
 
1200
 
1201
  def main():
1202
- """Main execution function with enhanced feature engineering support"""
1203
  import argparse
1204
 
1205
  # Parse command line arguments
1206
- parser = argparse.ArgumentParser(description='Train fake news detection model with enhanced features')
1207
  parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
1208
  parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
1209
  parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
1210
  parser.add_argument('--enhanced_features', action='store_true', help='Force use of enhanced features')
1211
  parser.add_argument('--standard_features', action='store_true', help='Force use of standard TF-IDF features only')
 
 
1212
  args = parser.parse_args()
1213
 
1214
  # Determine feature engineering mode
@@ -1222,7 +1486,21 @@ def main():
1222
  use_enhanced = False
1223
  logger.info("Standard features explicitly requested")
1224
 
1225
- trainer = EnhancedModelTrainer(use_enhanced_features=use_enhanced)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1226
 
1227
  # Apply CV folds from command line
1228
  if args.cv_folds:
@@ -1246,6 +1524,10 @@ def main():
1246
  if 'enhanced_features' in config and use_enhanced is None:
1247
  trainer.use_enhanced_features = config['enhanced_features'] and ENHANCED_FEATURES_AVAILABLE
1248
 
 
 
 
 
1249
  # Filter models if specified
1250
  selected_models = config.get('selected_models')
1251
  if selected_models and len(selected_models) < len(trainer.models):
@@ -1258,6 +1540,8 @@ def main():
1258
  logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds")
1259
  if trainer.use_enhanced_features:
1260
  logger.info("Enhanced features enabled via configuration")
 
 
1261
 
1262
  except Exception as e:
1263
  logger.warning(f"Failed to load configuration: {e}, using defaults")
@@ -1277,6 +1561,14 @@ def main():
1277
  print(f" {feature_type}: {count}")
1278
  except Exception as e:
1279
  logger.warning(f"Could not display feature summary: {e}")
 
 
 
 
 
 
 
 
1280
  else:
1281
  print(f"❌ {message}")
1282
  exit(1)
 
1
+ # Enhanced version with LightGBM, ensemble voting, and statistical validation
2
 
3
  import seaborn as sns
4
  import matplotlib.pyplot as plt
 
14
  train_test_split, cross_val_score, GridSearchCV,
15
  StratifiedKFold, validation_curve, cross_validate
16
  )
17
+ from sklearn.ensemble import RandomForestClassifier, VotingClassifier
18
  from sklearn.linear_model import LogisticRegression
19
  from sklearn.feature_extraction.text import TfidfVectorizer
20
+ import lightgbm as lgb
21
  import pandas as pd
22
  import numpy as np
23
  from pathlib import Path
 
32
  from typing import Dict, Tuple, Optional, Any, List
33
  import warnings
34
  import re
35
+ from scipy import stats
36
  warnings.filterwarnings('ignore')
37
 
38
  # Import enhanced feature engineering components
 
163
 
164
 
165
  def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5,
166
+ use_enhanced_features: bool = False, enable_ensemble: bool = True) -> Dict:
167
  """Estimate training time based on dataset characteristics and feature complexity"""
168
 
169
  # Base time estimates (in seconds) based on empirical testing
 
182
  base_times['feature_selection'] *= 2.0 # More features to select from
183
  base_times['enhanced_feature_extraction'] = max(2.0, dataset_size * 0.05) # New step
184
 
185
+ # Hyperparameter tuning multipliers with LightGBM
186
  tuning_multipliers = {
187
  'logistic_regression': 8 if enable_tuning else 1, # 8 param combinations
188
  'random_forest': 12 if enable_tuning else 1, # 12 param combinations
189
+ 'lightgbm': 6 if enable_tuning else 1, # 6 param combinations (CPU optimized)
190
  }
191
 
192
+ # Ensemble multiplier
193
+ ensemble_multiplier = 1.3 if enable_ensemble else 1.0 # 30% overhead for ensemble
194
+
195
  # Cross-validation multiplier
196
  cv_multiplier = cv_folds if dataset_size > 100 else 1
197
 
 
208
 
209
  estimates['feature_selection'] = base_times['feature_selection']
210
 
211
+ # Model training (now includes CV and LightGBM)
212
  for model_name, multiplier in tuning_multipliers.items():
213
  model_time = base_times['simple_training'] * multiplier * cv_multiplier
214
  estimates[f'{model_name}_training'] = model_time
 
217
  # Cross-validation overhead
218
  estimates['cross_validation'] = base_times['simple_training'] * cv_folds * 0.5
219
 
220
+ # Ensemble training and validation
221
+ if enable_ensemble:
222
+ estimates['ensemble_training'] = base_times['simple_training'] * 0.5
223
+ estimates['ensemble_validation'] = base_times['evaluation'] * 2
224
+
225
  # Model saving
226
  estimates['model_saving'] = 1.0
227
 
228
  # Total estimate
229
+ total_estimate = sum(estimates.values()) * ensemble_multiplier
230
 
231
+ # Add buffer for overhead (more for enhanced features and ensemble)
232
+ buffer_multiplier = 1.5 if (use_enhanced_features and enable_ensemble) else 1.4 if use_enhanced_features else 1.2
233
  total_estimate *= buffer_multiplier
234
 
235
  return {
 
239
  'dataset_size': dataset_size,
240
  'enable_tuning': enable_tuning,
241
  'cv_folds': cv_folds,
242
+ 'use_enhanced_features': use_enhanced_features,
243
+ 'enable_ensemble': enable_ensemble
244
  }
245
 
246
 
 
300
  cv=cv_strategy,
301
  scoring=scoring_metrics,
302
  return_train_score=True,
303
+ n_jobs=1, # Use single job for stability on HFS
304
  verbose=0
305
  )
306
 
 
393
  scores2 = results2['test_scores'][metric]['scores']
394
 
395
  # Paired t-test
 
396
  t_stat, p_value = stats.ttest_rel(scores1, scores2)
397
 
398
  comparison = {
 
417
  return {'error': str(e)}
418
 
419
 
420
class EnsembleManager:
    """Create a voting ensemble and compare it against its member models.

    The class offers three operations:

    * ``create_ensemble`` wraps the given models in a ``VotingClassifier``.
    * ``evaluate_ensemble_vs_individuals`` scores every model and the
      ensemble on one held-out set and reports the F1 improvement.
    * ``statistical_ensemble_comparison`` cross-validates every model and the
      ensemble and derives a "use the ensemble?" recommendation from the
      pairwise comparisons returned by the CV manager.
    """

    def __init__(self, random_state: int = 42):
        # Kept on the instance for callers; no method in this class reads it.
        self.random_state = random_state

    def create_ensemble(self, individual_models: Dict[str, Any],
                        voting: str = 'soft') -> VotingClassifier:
        """Build an (unfitted) voting ensemble from named models.

        Args:
            individual_models: Mapping of estimator name to estimator.
            voting: Voting mode forwarded to ``VotingClassifier``.

        Returns:
            The new ``VotingClassifier``; the caller is responsible for
            calling ``fit`` on it.
        """
        estimators = list(individual_models.items())

        ensemble = VotingClassifier(
            estimators=estimators,
            voting=voting,
            n_jobs=1  # single job, matching the n_jobs=1 used elsewhere in this file
        )

        logger.info(f"Created {voting} voting ensemble with {len(estimators)} models")
        return ensemble

    @staticmethod
    def _score_model(model, X_test, y_test) -> Dict[str, float]:
        """Return accuracy, weighted precision/recall/F1 and ROC AUC for one model."""
        y_pred = model.predict(X_test)
        # Column 1 of predict_proba is used as the score for ROC AUC
        # (presumably the positive class of a binary problem; verify).
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        return {
            'accuracy': float(accuracy_score(y_test, y_pred)),
            'precision': float(precision_score(y_test, y_pred, average='weighted')),
            'recall': float(recall_score(y_test, y_pred, average='weighted')),
            'f1': float(f1_score(y_test, y_pred, average='weighted')),
            'roc_auc': float(roc_auc_score(y_test, y_pred_proba))
        }

    def evaluate_ensemble_vs_individuals(self, ensemble, individual_models: Dict,
                                         X_test, y_test) -> Dict:
        """Score the ensemble and each member model on the same test set.

        Args:
            ensemble: Fitted ensemble exposing ``predict``/``predict_proba``.
            individual_models: Mapping of name to fitted model. Must be
                non-empty: the best-F1 lookup uses ``max`` over its entries.
            X_test: Test features.
            y_test: Test labels.

        Returns:
            Dict with one metrics entry per model name, an ``'ensemble'``
            entry, and an ``'ensemble_analysis'`` entry describing the F1
            gain of the ensemble over the best individual model.
        """
        # Individual models are scored first, then the ensemble.
        results = {
            name: self._score_model(model, X_test, y_test)
            for name, model in individual_models.items()
        }
        results['ensemble'] = self._score_model(ensemble, X_test, y_test)

        # Improvement of the ensemble over the best individual model (by F1).
        best_individual_f1 = max(results[name]['f1'] for name in individual_models)
        ensemble_f1 = results['ensemble']['f1']
        improvement = ensemble_f1 - best_individual_f1

        results['ensemble_analysis'] = {
            'best_individual_f1': best_individual_f1,
            'ensemble_f1': ensemble_f1,
            'improvement': improvement,
            # Guard against division by zero when no individual model scored above 0.
            'improvement_percentage': (improvement / best_individual_f1) * 100 if best_individual_f1 > 0 else 0,
            'is_better': improvement > 0
        }

        return results

    def statistical_ensemble_comparison(self, ensemble, individual_models: Dict,
                                        X, y, cv_manager: CrossValidationManager) -> Dict:
        """Cross-validate the ensemble and its members and compare them pairwise.

        Args:
            ensemble: Ensemble estimator to cross-validate.
            individual_models: Mapping of name to estimator.
            X: Features used for cross-validation.
            y: Labels used for cross-validation.
            cv_manager: Provides ``create_cv_strategy``,
                ``perform_cross_validation`` and ``compare_cv_results``.

        Returns:
            Dict holding the CV result of the ensemble (``'ensemble'``) and of
            each model (by name), the pairwise comparisons under
            ``'statistical_comparisons'``, and an ``'ensemble_recommendation'``
            summary.
        """
        cv_strategy = cv_manager.create_cv_strategy(X, y)

        results = {}

        # Cross-validate the ensemble first, then every individual model,
        # all with the same CV strategy.
        ensemble_cv = cv_manager.perform_cross_validation(ensemble, X, y, cv_strategy)
        results['ensemble'] = ensemble_cv

        individual_cv_results = {}
        for name, model in individual_models.items():
            model_cv = cv_manager.perform_cross_validation(model, X, y, cv_strategy)
            individual_cv_results[name] = model_cv
            results[name] = model_cv

        # Each comparison is called as (individual, ensemble).
        comparisons = {
            f'ensemble_vs_{name}': cv_manager.compare_cv_results(model_cv, ensemble_cv)
            for name, model_cv in individual_cv_results.items()
        }
        results['statistical_comparisons'] = comparisons

        # NOTE(review): assumes compare_cv_results returns top-level
        # 'paired_ttest' -> 'significant' and 'difference' keys, with a
        # positive 'difference' meaning the ensemble scored higher - confirm
        # against CrossValidationManager.compare_cv_results. Missing keys are
        # treated as "not significantly better".
        significantly_better_count = sum(
            1
            for comparison in comparisons.values()
            if comparison.get('paired_ttest', {}).get('significant', False)
            and comparison.get('difference', 0) > 0
        )

        results['ensemble_recommendation'] = {
            # One significant win over any member is enough to recommend the ensemble.
            'use_ensemble': significantly_better_count > 0,
            'significantly_better_than': significantly_better_count,
            'total_comparisons': len(comparisons),
            'confidence': significantly_better_count / len(comparisons) if comparisons else 0
        }

        return results
530
+
531
+
532
  class EnhancedModelTrainer:
533
+ """Production-ready model trainer with LightGBM, enhanced features, and ensemble voting"""
534
 
535
+ def __init__(self, use_enhanced_features: bool = None, enable_ensemble: bool = True):
536
  # Auto-detect enhanced features if not specified
537
  if use_enhanced_features is None:
538
  self.use_enhanced_features = ENHANCED_FEATURES_AVAILABLE
539
  else:
540
  self.use_enhanced_features = use_enhanced_features and ENHANCED_FEATURES_AVAILABLE
541
 
542
+ self.enable_ensemble = enable_ensemble
543
  self.setup_paths()
544
  self.setup_training_config()
545
  self.setup_models()
546
  self.progress_tracker = None
547
  self.cv_manager = CrossValidationManager()
548
+ self.ensemble_manager = EnsembleManager()
549
 
550
  # Enhanced feature tracking
551
  self.feature_engineer = None
 
605
  self.class_weight = 'balanced'
606
 
607
  def setup_models(self):
608
+ """Setup model configurations including LightGBM for comparison"""
609
  self.models = {
610
  'logistic_regression': {
611
  'model': LogisticRegression(
612
  max_iter=self.max_iter,
613
  class_weight=self.class_weight,
614
  random_state=self.random_state,
615
+ n_jobs=1 # CPU optimization
616
  ),
617
  'param_grid': {
618
  'model__C': [0.1, 1, 10],
 
621
  },
622
  'random_forest': {
623
  'model': RandomForestClassifier(
624
+ n_estimators=50, # Reduced for CPU efficiency
625
  class_weight=self.class_weight,
626
  random_state=self.random_state,
627
+ n_jobs=1 # CPU optimization
628
  ),
629
  'param_grid': {
630
  'model__n_estimators': [50, 100],
631
  'model__max_depth': [10, None]
632
  }
633
+ },
634
+ 'lightgbm': {
635
+ 'model': lgb.LGBMClassifier(
636
+ objective='binary',
637
+ boosting_type='gbdt',
638
+ num_leaves=31,
639
+ max_depth=10,
640
+ learning_rate=0.1,
641
+ n_estimators=100,
642
+ class_weight=self.class_weight,
643
+ random_state=self.random_state,
644
+ n_jobs=1, # CPU optimization
645
+ verbose=-1 # Suppress LightGBM output
646
+ ),
647
+ 'param_grid': {
648
+ 'model__n_estimators': [50, 100],
649
+ 'model__learning_rate': [0.05, 0.1],
650
+ 'model__num_leaves': [15, 31]
651
+ }
652
  }
653
  }
654
 
 
896
  param_grid,
897
  cv=cv_strategy,
898
  scoring='f1_weighted',
899
+ n_jobs=1, # Single job for CPU optimization
900
  verbose=0, # Reduce verbosity for speed
901
  return_train_score=True # For overfitting analysis
902
  )
 
957
  raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
958
 
959
  def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
960
+ """Train and evaluate multiple models including LightGBM with enhanced features and comprehensive CV"""
961
 
962
  results = {}
963
+ individual_models = {}
964
 
965
  for model_name in self.models.keys():
966
  logger.info(f"Training {model_name} with {'enhanced' if self.use_enhanced_features else 'standard'} features...")
 
988
  'feature_type': 'enhanced' if self.use_enhanced_features else 'standard'
989
  }
990
 
991
+ # Store for ensemble creation
992
+ individual_models[model_name] = best_model
993
+
994
  # Log results
995
  test_f1 = evaluation_metrics['f1']
996
  cv_results = evaluation_metrics.get('cross_validation', {})
 
1005
  logger.error(f"Training failed for {model_name}: {str(e)}")
1006
  results[model_name] = {'error': str(e)}
1007
 
1008
+ # Create and evaluate ensemble if enabled and we have multiple successful models
1009
+ if self.enable_ensemble and len(individual_models) >= 2:
1010
+ logger.info("Creating ensemble model...")
1011
+
1012
+ try:
1013
+ # Create ensemble
1014
+ ensemble = self.ensemble_manager.create_ensemble(individual_models, voting='soft')
1015
+
1016
+ # Fit ensemble
1017
+ X_full_train = np.concatenate([X_train, X_test])
1018
+ y_full_train = np.concatenate([y_train, y_test])
1019
+
1020
+ ensemble.fit(X_train, y_train)
1021
+
1022
+ # Evaluate ensemble
1023
+ ensemble_metrics = self.comprehensive_evaluation(
1024
+ ensemble, X_test, y_test, X_train, y_train
1025
+ )
1026
+
1027
+ # Compare ensemble with individual models
1028
+ ensemble_comparison = self.ensemble_manager.evaluate_ensemble_vs_individuals(
1029
+ ensemble, individual_models, X_test, y_test
1030
+ )
1031
+
1032
+ # Statistical comparison
1033
+ statistical_comparison = self.ensemble_manager.statistical_ensemble_comparison(
1034
+ ensemble, individual_models, X_full_train, y_full_train, self.cv_manager
1035
+ )
1036
+
1037
+ # Store ensemble results
1038
+ results['ensemble'] = {
1039
+ 'model': ensemble,
1040
+ 'evaluation_metrics': ensemble_metrics,
1041
+ 'ensemble_comparison': ensemble_comparison,
1042
+ 'statistical_comparison': statistical_comparison,
1043
+ 'training_time': datetime.now().isoformat(),
1044
+ 'feature_type': 'enhanced' if self.use_enhanced_features else 'standard'
1045
+ }
1046
+
1047
+ # Add ensemble to individual models for selection
1048
+ individual_models['ensemble'] = ensemble
1049
+
1050
+ # Log ensemble results
1051
+ ensemble_f1 = ensemble_metrics['f1']
1052
+ ensemble_improvement = ensemble_comparison.get('ensemble_analysis', {}).get('improvement', 0)
1053
+ logger.info(f"Ensemble F1: {ensemble_f1:.4f}, Improvement: {ensemble_improvement:.4f}")
1054
+
1055
+ # Log recommendation
1056
+ recommendation = statistical_comparison.get('ensemble_recommendation', {})
1057
+ if recommendation.get('use_ensemble', False):
1058
+ logger.info(f"✅ Ensemble recommended (confidence: {recommendation.get('confidence', 0):.2f})")
1059
+ else:
1060
+ logger.info(f"❌ Ensemble not recommended")
1061
+
1062
+ except Exception as e:
1063
+ logger.error(f"Ensemble creation failed: {str(e)}")
1064
+ results['ensemble'] = {'error': str(e)}
1065
+
1066
  return results
1067
 
1068
  def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
1069
+ """Select the best performing model based on CV results with ensemble consideration"""
1070
 
1071
  if self.progress_tracker:
1072
  self.progress_tracker.update("Selecting best model")
 
1076
  best_score = -1
1077
  best_metrics = None
1078
 
1079
+ # Consider ensemble first if it exists and is recommended
1080
+ if 'ensemble' in results and 'error' not in results['ensemble']:
1081
+ ensemble_result = results['ensemble']
1082
+ statistical_comparison = ensemble_result.get('statistical_comparison', {})
1083
+ recommendation = statistical_comparison.get('ensemble_recommendation', {})
1084
+
1085
+ if recommendation.get('use_ensemble', False):
1086
+ ensemble_metrics = ensemble_result['evaluation_metrics']
1087
+ cv_results = ensemble_metrics.get('cross_validation', {})
1088
+
1089
+ if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
1090
+ f1_score = cv_results['test_scores']['f1']['mean']
1091
+ if f1_score > best_score:
1092
+ best_score = f1_score
1093
+ best_model_name = 'ensemble'
1094
+ best_model = ensemble_result['model']
1095
+ best_metrics = ensemble_metrics
1096
+ logger.info("✅ Ensemble selected as best model")
1097
+
1098
+ # If ensemble not selected, choose best individual model
1099
+ if best_model_name is None:
1100
+ for model_name, result in results.items():
1101
+ if 'error' in result or model_name == 'ensemble':
1102
+ continue
1103
+
1104
+ # Prioritize CV F1 score if available, fallback to test F1
1105
+ cv_results = result['evaluation_metrics'].get('cross_validation', {})
1106
+ if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
1107
+ f1_score = cv_results['test_scores']['f1']['mean']
1108
+ score_type = "CV F1"
1109
+ else:
1110
+ f1_score = result['evaluation_metrics']['f1']
1111
+ score_type = "Test F1"
1112
+
1113
+ if f1_score > best_score:
1114
+ best_score = f1_score
1115
+ best_model_name = model_name
1116
+ best_model = result['model']
1117
+ best_metrics = result['evaluation_metrics']
1118
 
1119
  if best_model_name is None:
1120
  raise ValueError("No models trained successfully")
1121
 
1122
+ score_type = "CV F1" if 'cross_validation' in best_metrics else "Test F1"
1123
  logger.info(f"Best model: {best_model_name} with {score_type} score: {best_score:.4f}")
1124
  return best_model_name, best_model, best_metrics
1125
 
 
1168
  }
1169
  joblib.dump(enhanced_ref, self.vectorizer_path)
1170
  logger.info(f"✅ Saved enhanced features reference to {self.vectorizer_path}")
1171
+ elif model_name == 'ensemble':
1172
+ # Handle ensemble model saving
1173
+ joblib.dump(model, self.model_path)
1174
+ logger.info(f"✅ Saved ensemble model to {self.model_path}")
1175
 
1176
  except Exception as e:
1177
  logger.warning(f"Could not save individual components: {e}")
 
1186
  metadata = {
1187
  'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
1188
  'model_type': model_name,
1189
+ 'is_ensemble': model_name == 'ensemble',
1190
  'feature_engineering': {
1191
  'type': 'enhanced' if self.use_enhanced_features else 'standard',
1192
  'enhanced_features_available': ENHANCED_FEATURES_AVAILABLE,
 
1206
  'max_features': self.max_features,
1207
  'ngram_range': self.ngram_range,
1208
  'feature_selection_k': self.feature_selection_k,
1209
+ 'use_enhanced_features': self.use_enhanced_features,
1210
+ 'enable_ensemble': self.enable_ensemble
1211
  }
1212
  }
1213
 
 
1264
  metadata['cv_accuracy_mean'] = cv_results['test_scores']['accuracy']['mean']
1265
  metadata['cv_accuracy_std'] = cv_results['test_scores']['accuracy']['std']
1266
 
1267
+ # Add ensemble information if applicable
1268
+ if model_name == 'ensemble' and 'ensemble' in results:
1269
+ ensemble_result = results['ensemble']
1270
+ ensemble_comparison = ensemble_result.get('ensemble_comparison', {})
1271
+ statistical_comparison = ensemble_result.get('statistical_comparison', {})
1272
+
1273
+ metadata['ensemble_info'] = {
1274
+ 'ensemble_analysis': ensemble_comparison.get('ensemble_analysis', {}),
1275
+ 'statistical_recommendation': statistical_comparison.get('ensemble_recommendation', {}),
1276
+ 'individual_models': list(ensemble_comparison.keys()) if ensemble_comparison else []
1277
+ }
1278
+
1279
  # Add model comparison results if available
1280
  if len(results) > 1:
1281
  model_comparison = {}
 
1304
  for feature_type, count in feature_metadata.get('feature_types', {}).items():
1305
  logger.info(f" {feature_type}: {count}")
1306
 
1307
+ # Log ensemble information
1308
+ if model_name == 'ensemble':
1309
+ logger.info(f"✅ Ensemble model selected and saved")
1310
+
1311
  logger.info(f"✅ Model artifacts saved successfully with {'enhanced' if self.use_enhanced_features else 'standard'} features")
1312
  return True
1313
 
 
1322
  logger.error(f"Failed to save backup pipeline: {str(e2)}")
1323
  return False
1324
 
1325
+ def train_model(self, data_path: str = None, force_enhanced: bool = None, force_ensemble: bool = None) -> Tuple[bool, str]:
1326
+ """Main training function with LightGBM, enhanced feature engineering, and ensemble voting"""
1327
  try:
1328
+ # Override settings if specified
1329
  if force_enhanced is not None:
1330
  original_setting = self.use_enhanced_features
1331
  self.use_enhanced_features = force_enhanced and ENHANCED_FEATURES_AVAILABLE
1332
  if force_enhanced and not ENHANCED_FEATURES_AVAILABLE:
1333
  logger.warning("Enhanced features requested but not available, using standard features")
1334
 
1335
+ if force_ensemble is not None:
1336
+ self.enable_ensemble = force_ensemble
1337
+
1338
  feature_type = "enhanced" if self.use_enhanced_features else "standard"
1339
+ ensemble_info = "with ensemble" if self.enable_ensemble else "without ensemble"
1340
+ logger.info(f"Starting {feature_type} model training {ensemble_info} including LightGBM...")
1341
 
1342
  # Override data path if provided
1343
  if data_path:
 
1353
  len(df),
1354
  enable_tuning=True,
1355
  cv_folds=self.cv_folds,
1356
+ use_enhanced_features=self.use_enhanced_features,
1357
+ enable_ensemble=self.enable_ensemble
1358
  )
1359
 
1360
  print(f"\n📊 Enhanced Training Configuration:")
1361
  print(f"Dataset size: {len(df)} samples")
1362
  print(f"Feature engineering: {feature_type.title()}")
1363
  print(f"Cross-validation folds: {self.cv_folds}")
1364
+ print(f"Models: Logistic Regression, Random Forest, LightGBM")
1365
+ print(f"Ensemble voting: {'Enabled' if self.enable_ensemble else 'Disabled'}")
1366
  print(f"Estimated time: {time_estimate['total_formatted']}")
 
1367
  print(f"Hyperparameter tuning: Enabled")
1368
  if self.use_enhanced_features:
1369
  print(f"Enhanced features: Sentiment, Readability, Entities, Linguistic")
1370
  print()
1371
 
1372
+ # Setup progress tracker (adjusted for LightGBM and ensemble)
1373
  base_steps = 4 + (len(self.models) * 3) + 1 # Basic steps
1374
  enhanced_steps = 2 if self.use_enhanced_features else 0 # Feature engineering steps
1375
+ ensemble_steps = 3 if self.enable_ensemble else 0 # Ensemble creation and evaluation
1376
+ total_steps = base_steps + enhanced_steps + ensemble_steps
1377
  self.progress_tracker = ProgressTracker(total_steps, f"{feature_type.title()} Training Progress")
1378
 
1379
  # Prepare data
 
1409
  if len(X_test) < 1:
1410
  return False, "Cannot create test set. Dataset too small."
1411
 
1412
+ # Train and evaluate models with LightGBM and enhanced features
1413
  results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
1414
 
1415
+ # Select best model (could be ensemble)
1416
  best_model_name, best_model, best_metrics = self.select_best_model(results)
1417
 
1418
  # Save model artifacts with enhanced feature information
 
1422
  # Finish progress tracking
1423
  self.progress_tracker.finish()
1424
 
1425
+ # Create success message with comprehensive information
1426
  cv_results = best_metrics.get('cross_validation', {})
1427
  cv_info = ""
1428
  if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
 
1437
  if feature_metadata:
1438
  total_features = feature_metadata.get('total_features', 0)
1439
  feature_info = f", Enhanced Features: {total_features}"
1440
+
1441
+ # Ensemble information
1442
+ ensemble_info = ""
1443
+ if best_model_name == 'ensemble':
1444
+ ensemble_info = " (Ensemble Model Selected)"
1445
 
1446
  success_message = (
1447
+ f"{feature_type.title()} model training completed successfully{ensemble_info}. "
1448
  f"Best model: {best_model_name} "
1449
  f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info}{feature_info})"
1450
  )
 
1455
  except Exception as e:
1456
  if self.progress_tracker:
1457
  print() # New line after progress bar
1458
+ error_message = f"Enhanced model training with LightGBM failed: {str(e)}"
1459
  logger.error(error_message)
1460
  return False, error_message
1461
 
1462
 
1463
  def main():
1464
+ """Main execution function with LightGBM, enhanced features, and ensemble support"""
1465
  import argparse
1466
 
1467
  # Parse command line arguments
1468
+ parser = argparse.ArgumentParser(description='Train fake news detection model with LightGBM and enhanced features')
1469
  parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
1470
  parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
1471
  parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
1472
  parser.add_argument('--enhanced_features', action='store_true', help='Force use of enhanced features')
1473
  parser.add_argument('--standard_features', action='store_true', help='Force use of standard TF-IDF features only')
1474
+ parser.add_argument('--enable_ensemble', action='store_true', help='Enable ensemble voting')
1475
+ parser.add_argument('--disable_ensemble', action='store_true', help='Disable ensemble voting')
1476
  args = parser.parse_args()
1477
 
1478
  # Determine feature engineering mode
 
1486
  use_enhanced = False
1487
  logger.info("Standard features explicitly requested")
1488
 
1489
+ # Determine ensemble mode
1490
+ enable_ensemble = None
1491
+ if args.enable_ensemble and args.disable_ensemble:
1492
+ logger.warning("Both --enable_ensemble and --disable_ensemble specified. Using default.")
1493
+ elif args.enable_ensemble:
1494
+ enable_ensemble = True
1495
+ logger.info("Ensemble voting explicitly enabled")
1496
+ elif args.disable_ensemble:
1497
+ enable_ensemble = False
1498
+ logger.info("Ensemble voting explicitly disabled")
1499
+
1500
+ trainer = EnhancedModelTrainer(
1501
+ use_enhanced_features=use_enhanced,
1502
+ enable_ensemble=enable_ensemble if enable_ensemble is not None else True
1503
+ )
1504
 
1505
  # Apply CV folds from command line
1506
  if args.cv_folds:
 
1524
  if 'enhanced_features' in config and use_enhanced is None:
1525
  trainer.use_enhanced_features = config['enhanced_features'] and ENHANCED_FEATURES_AVAILABLE
1526
 
1527
+ # Ensemble configuration
1528
+ if 'enable_ensemble' in config and enable_ensemble is None:
1529
+ trainer.enable_ensemble = config['enable_ensemble']
1530
+
1531
  # Filter models if specified
1532
  selected_models = config.get('selected_models')
1533
  if selected_models and len(selected_models) < len(trainer.models):
 
1540
  logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds")
1541
  if trainer.use_enhanced_features:
1542
  logger.info("Enhanced features enabled via configuration")
1543
+ if trainer.enable_ensemble:
1544
+ logger.info("Ensemble voting enabled via configuration")
1545
 
1546
  except Exception as e:
1547
  logger.warning(f"Failed to load configuration: {e}, using defaults")
 
1561
  print(f" {feature_type}: {count}")
1562
  except Exception as e:
1563
  logger.warning(f"Could not display feature summary: {e}")
1564
+
1565
+ # Print model information
1566
+ print(f"\n🎯 Model Information:")
1567
+ print(f"Models trained: {', '.join(trainer.models.keys())}")
1568
+ if trainer.enable_ensemble:
1569
+ print(f"Ensemble voting: Enabled")
1570
+ else:
1571
+ print(f"Ensemble voting: Disabled")
1572
  else:
1573
  print(f"❌ {message}")
1574
  exit(1)