Ahmedik95316 committed
Commit 8a926b4 · 1 Parent(s): 0cfbe2d

Update model/train.py

Adding Enhanced Feature Engineering Pipeline

Files changed (1)
  1. model/train.py +312 -94
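
The central mechanism in this commit is graceful degradation: the new feature-engineering modules are imported inside a try/except, and training falls back to plain TF-IDF when they are absent. A minimal sketch of that pattern (the features.feature_engineer module name is taken from the diff below; everything else is illustrative):

import logging

logger = logging.getLogger(__name__)

try:
    # Project-specific module added by this commit; not part of scikit-learn
    from features.feature_engineer import AdvancedFeatureEngineer
    ENHANCED_FEATURES_AVAILABLE = True
except ImportError as exc:
    ENHANCED_FEATURES_AVAILABLE = False
    logger.warning(f"Enhanced features unavailable, falling back to TF-IDF: {exc}")

def resolve_feature_mode(requested=None):
    # None means auto-detect; an explicit request is still gated on availability
    if requested is None:
        return ENHANCED_FEATURES_AVAILABLE
    return bool(requested) and ENHANCED_FEATURES_AVAILABLE
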
model/train.py CHANGED
@@ -1,5 +1,4 @@
1
- # File: model/train.py (MODIFIED)
2
- # Enhanced version with comprehensive cross-validation implementation
3
 
4
  import seaborn as sns
5
  import matplotlib.pyplot as plt
@@ -34,6 +33,21 @@ import warnings
34
  import re
35
  warnings.filterwarnings('ignore')
36
 
37
  # Configure logging
38
  logging.basicConfig(
39
  level=logging.INFO,
@@ -112,7 +126,7 @@ class ProgressTracker:
112
  # Create progress bar
113
  bar_length = 30
114
  filled_length = int(bar_length * self.current_step // self.total_steps)
115
- bar = '█' * filled_length + '░' * (bar_length - filled_length)
116
 
117
  # Print progress (this will be visible in Streamlit logs)
118
  status_msg = f"\r{self.description}: [{bar}] {progress_pct:.1f}% | Step {self.current_step}/{self.total_steps}"
@@ -146,8 +160,9 @@ class ProgressTracker:
146
  print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
147
 
148
 
149
- def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5) -> Dict:
150
- """Estimate training time based on dataset characteristics"""
 
151
 
152
  # Base time estimates (in seconds) based on empirical testing
153
  base_times = {
@@ -158,6 +173,13 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
158
  'evaluation': max(0.5, dataset_size * 0.01), # ~10ms per sample
159
  }
160
 
161
  # Hyperparameter tuning multipliers
162
  tuning_multipliers = {
163
  'logistic_regression': 8 if enable_tuning else 1, # 8 param combinations
@@ -174,6 +196,10 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
174
  estimates['data_loading'] = 0.5
175
  estimates['preprocessing'] = base_times['preprocessing']
176
  estimates['vectorization'] = base_times['vectorization']
177
  estimates['feature_selection'] = base_times['feature_selection']
178
 
179
  # Model training (now includes CV)
@@ -191,8 +217,9 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
191
  # Total estimate
192
  total_estimate = sum(estimates.values())
193
 
194
- # Add 20% buffer for overhead
195
- total_estimate *= 1.2
 
196
 
197
  return {
198
  'detailed_estimates': estimates,
@@ -200,7 +227,8 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
200
  'total_formatted': str(timedelta(seconds=int(total_estimate))),
201
  'dataset_size': dataset_size,
202
  'enable_tuning': enable_tuning,
203
- 'cv_folds': cv_folds
 
204
  }
205
 
206
 
@@ -378,15 +406,25 @@ class CrossValidationManager:
378
  return {'error': str(e)}
379
 
380
 
381
- class RobustModelTrainer:
382
- """Production-ready model trainer with comprehensive cross-validation"""
383
 
384
- def __init__(self):
385
  self.setup_paths()
386
  self.setup_training_config()
387
  self.setup_models()
388
  self.progress_tracker = None
389
  self.cv_manager = CrossValidationManager()
390
 
391
  def setup_paths(self):
392
  """Setup all necessary paths with proper permissions"""
@@ -394,9 +432,10 @@ class RobustModelTrainer:
394
  self.data_dir = self.base_dir / "data"
395
  self.model_dir = self.base_dir / "model"
396
  self.results_dir = self.base_dir / "results"
 
397
 
398
  # Create directories with proper permissions
399
- for dir_path in [self.data_dir, self.model_dir, self.results_dir]:
400
  dir_path.mkdir(parents=True, exist_ok=True)
401
  # Ensure write permissions
402
  try:
@@ -406,25 +445,39 @@ class RobustModelTrainer:
406
 
407
  # File paths
408
  self.data_path = self.data_dir / "combined_dataset.csv"
409
- self.model_path = Path("/tmp/model.pkl") # Direct path to avoid permission issues
410
  self.vectorizer_path = Path("/tmp/vectorizer.pkl")
411
  self.pipeline_path = Path("/tmp/pipeline.pkl")
412
  self.metadata_path = Path("/tmp/metadata.json")
413
  self.evaluation_path = self.results_dir / "evaluation_results.json"
414
 
415
  def setup_training_config(self):
416
- """Setup training configuration with CV parameters"""
417
  self.test_size = 0.2
418
  self.validation_size = 0.1
419
  self.random_state = 42
420
- self.cv_folds = 5 # Primary CV folds
421
- self.max_features = 5000 # Reduced for speed
422
- self.min_df = 1 # More lenient for small datasets
423
  self.max_df = 0.95
424
- self.ngram_range = (1, 2) # Reduced for speed
425
- self.max_iter = 500 # Reduced for speed
426
  self.class_weight = 'balanced'
427
- self.feature_selection_k = 2000 # Reduced for speed
428
 
429
  def setup_models(self):
430
  """Setup model configurations for comparison"""
@@ -434,22 +487,22 @@ class RobustModelTrainer:
434
  max_iter=self.max_iter,
435
  class_weight=self.class_weight,
436
  random_state=self.random_state,
437
- n_jobs=-1 # Use all cores
438
  ),
439
  'param_grid': {
440
- 'model__C': [0.1, 1, 10], # Reduced grid
441
  'model__penalty': ['l2']
442
  }
443
  },
444
  'random_forest': {
445
  'model': RandomForestClassifier(
446
- n_estimators=50, # Reduced for speed
447
  class_weight=self.class_weight,
448
  random_state=self.random_state,
449
- n_jobs=-1 # Use all cores
450
  ),
451
  'param_grid': {
452
- 'model__n_estimators': [50, 100], # Reduced grid
453
  'model__max_depth': [10, None]
454
  }
455
  }
@@ -494,10 +547,9 @@ class RobustModelTrainer:
494
  return False, None, f"Need at least 2 classes, found: {unique_labels}"
495
 
496
  # Check minimum sample size for CV
497
- min_samples_for_cv = self.cv_folds * 2 # At least 2 samples per fold
498
  if len(df) < min_samples_for_cv:
499
  logger.warning(f"Dataset size ({len(df)}) is small for {self.cv_folds}-fold CV")
500
- # Adjust CV folds for small datasets
501
  self.cv_manager.cv_folds = max(2, len(df) // 3)
502
  logger.info(f"Adjusted CV folds to {self.cv_manager.cv_folds}")
503
 
@@ -519,47 +571,79 @@ class RobustModelTrainer:
519
  logger.error(error_msg)
520
  return False, None, error_msg
521
 
522
- def create_preprocessing_pipeline(self) -> Pipeline:
523
- """Create preprocessing pipeline"""
 
 
 
524
 
525
  if self.progress_tracker:
526
- self.progress_tracker.update("Creating pipeline")
 
527
 
528
- # Use the standalone function instead of lambda
529
- text_preprocessor = FunctionTransformer(
530
- func=preprocess_text_function,
531
- validate=False
532
- )
533
 
534
- # TF-IDF vectorization with optimized parameters
535
- vectorizer = TfidfVectorizer(
536
- max_features=self.max_features,
537
- min_df=self.min_df,
538
- max_df=self.max_df,
539
- ngram_range=self.ngram_range,
540
- stop_words='english',
541
- sublinear_tf=True,
542
- norm='l2'
543
- )
544
 
545
- # Feature selection
546
- feature_selector = SelectKBest(
547
- score_func=chi2,
548
- k=min(self.feature_selection_k, self.max_features)
549
- )
550
 
551
- # Create pipeline
552
- pipeline = Pipeline([
553
- ('preprocess', text_preprocessor),
554
- ('vectorize', vectorizer),
555
- ('feature_select', feature_selector),
556
- ('model', None) # Will be set during training
557
- ])
558
 
559
  return pipeline
560
 
561
  def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
562
- """Comprehensive model evaluation with cross-validation integration"""
563
 
564
  if self.progress_tracker:
565
  self.progress_tracker.update("Evaluating model")
@@ -597,6 +681,25 @@ class RobustModelTrainer:
597
  cv_f1_std = cv_results['test_scores']['f1']['std']
598
  logger.info(f"CV F1 Score: {cv_f1_mean:.4f} (Β±{cv_f1_std:.4f})")
599
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
600
  # Training accuracy for overfitting detection
601
  try:
602
  if X_train is not None and y_train is not None:
@@ -614,7 +717,8 @@ class RobustModelTrainer:
614
  """Perform hyperparameter tuning with nested cross-validation"""
615
 
616
  if self.progress_tracker:
617
- self.progress_tracker.update(f"Tuning {model_name} with CV")
 
618
 
619
  try:
620
  # Set the model in the pipeline
@@ -709,15 +813,15 @@ class RobustModelTrainer:
709
  raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
710
 
711
  def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
712
- """Train and evaluate multiple models with comprehensive CV"""
713
 
714
  results = {}
715
 
716
  for model_name in self.models.keys():
717
- logger.info(f"Training {model_name} with cross-validation...")
718
 
719
  try:
720
- # Create pipeline
721
  pipeline = self.create_preprocessing_pipeline()
722
 
723
  # Hyperparameter tuning with CV
@@ -735,7 +839,8 @@ class RobustModelTrainer:
735
  'model': best_model,
736
  'tuning_results': tuning_results,
737
  'evaluation_metrics': evaluation_metrics,
738
- 'training_time': datetime.now().isoformat()
 
739
  }
740
 
741
  # Log results
@@ -791,7 +896,7 @@ class RobustModelTrainer:
791
  return best_model_name, best_model, best_metrics
792
 
793
  def save_model_artifacts(self, model, model_name: str, metrics: Dict, results: Dict) -> bool:
794
- """Save model artifacts and enhanced metadata with CV results"""
795
  try:
796
  if self.progress_tracker:
797
  self.progress_tracker.update("Saving model")
@@ -807,20 +912,37 @@ class RobustModelTrainer:
807
  joblib.dump(model, alt_pipeline_path)
808
  logger.info(f"βœ… Saved pipeline to {alt_pipeline_path}")
809
 
810
- # Save individual components for backward compatibility
811
- try:
812
- if hasattr(model, 'named_steps') and 'model' in model.named_steps:
813
- joblib.dump(model.named_steps['model'], self.model_path)
814
- logger.info(f"βœ… Saved model to {self.model_path}")
815
- except Exception as e:
816
- logger.warning(f"Could not save model component: {e}")
817
 
 
818
  try:
819
- if hasattr(model, 'named_steps') and 'vectorize' in model.named_steps:
820
- joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
821
- logger.info(f"βœ… Saved vectorizer to {self.vectorizer_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
822
  except Exception as e:
823
- logger.warning(f"Could not save vectorizer component: {e}")
824
 
825
  # Generate data hash
826
  data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
@@ -828,10 +950,15 @@ class RobustModelTrainer:
828
  # Extract CV results
829
  cv_results = metrics.get('cross_validation', {})
830
 
831
- # Create enhanced metadata with CV information
832
  metadata = {
833
  'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
834
  'model_type': model_name,
835
  'data_version': data_hash,
836
  'test_accuracy': metrics['accuracy'],
837
  'test_f1': metrics['f1'],
@@ -845,10 +972,42 @@ class RobustModelTrainer:
845
  'cv_folds': self.cv_folds,
846
  'max_features': self.max_features,
847
  'ngram_range': self.ngram_range,
848
- 'feature_selection_k': self.feature_selection_k
 
849
  }
850
  }
851
 
852
  # Add comprehensive CV results to metadata
853
  if cv_results and 'test_scores' in cv_results:
854
  metadata['cross_validation'] = {
@@ -892,7 +1051,14 @@ class RobustModelTrainer:
892
  except Exception as e:
893
  logger.warning(f"Could not save metadata: {e}")
894
 
895
- logger.info(f"βœ… Model artifacts saved successfully with CV results")
 
 
 
 
 
 
 
896
  return True
897
 
898
  except Exception as e:
@@ -906,10 +1072,18 @@ class RobustModelTrainer:
906
  logger.error(f"Failed to save backup pipeline: {str(e2)}")
907
  return False
908
 
909
- def train_model(self, data_path: str = None) -> Tuple[bool, str]:
910
- """Main training function with comprehensive CV pipeline"""
911
  try:
912
- logger.info("Starting enhanced model training with cross-validation...")
 
 
 
 
 
 
 
 
913
 
914
  # Override data path if provided
915
  if data_path:
@@ -924,20 +1098,26 @@ class RobustModelTrainer:
924
  time_estimate = estimate_training_time(
925
  len(df),
926
  enable_tuning=True,
927
- cv_folds=self.cv_folds
 
928
  )
929
 
930
  print(f"\nπŸ“Š Enhanced Training Configuration:")
931
  print(f"Dataset size: {len(df)} samples")
 
932
  print(f"Cross-validation folds: {self.cv_folds}")
933
  print(f"Estimated time: {time_estimate['total_formatted']}")
934
  print(f"Models to train: {len(self.models)}")
935
  print(f"Hyperparameter tuning: Enabled")
 
 
936
  print()
937
 
938
- # Setup progress tracker (increased steps for CV)
939
- total_steps = 4 + (len(self.models) * 3) + 1 # Load, split, 3*models (tune+cv+eval), select, save
940
- self.progress_tracker = ProgressTracker(total_steps, "CV Training Progress")
 
 
941
 
942
  # Prepare data
943
  X = df['text'].values
@@ -972,20 +1152,20 @@ class RobustModelTrainer:
972
  if len(X_test) < 1:
973
  return False, "Cannot create test set. Dataset too small."
974
 
975
- # Train and evaluate models with CV
976
  results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
977
 
978
  # Select best model
979
  best_model_name, best_model, best_metrics = self.select_best_model(results)
980
 
981
- # Save model artifacts with CV results
982
  if not self.save_model_artifacts(best_model, best_model_name, best_metrics, results):
983
  return False, "Failed to save model artifacts"
984
 
985
  # Finish progress tracking
986
  self.progress_tracker.finish()
987
 
988
- # Create success message with CV information
989
  cv_results = best_metrics.get('cross_validation', {})
990
  cv_info = ""
991
  if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
@@ -993,10 +1173,18 @@ class RobustModelTrainer:
993
  cv_f1_std = cv_results['test_scores']['f1']['std']
994
  cv_info = f", CV F1: {cv_f1_mean:.4f} (Β±{cv_f1_std:.4f})"
995
 
 
 
 
 
 
 
 
 
996
  success_message = (
997
- f"Enhanced model training completed successfully. "
998
  f"Best model: {best_model_name} "
999
- f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info})"
1000
  )
1001
 
1002
  logger.info(success_message)
@@ -1011,17 +1199,30 @@ class RobustModelTrainer:
1011
 
1012
 
1013
  def main():
1014
- """Main execution function with enhanced CV support"""
1015
  import argparse
1016
 
1017
  # Parse command line arguments
1018
- parser = argparse.ArgumentParser(description='Train fake news detection model with cross-validation')
1019
  parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
1020
  parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
1021
  parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
1022
  args = parser.parse_args()
1023
 
1024
- trainer = RobustModelTrainer()
1025
 
1026
  # Apply CV folds from command line
1027
  if args.cv_folds:
@@ -1041,6 +1242,10 @@ def main():
1041
  trainer.max_features = config.get('max_features', trainer.max_features)
1042
  trainer.ngram_range = tuple(config.get('ngram_range', trainer.ngram_range))
1043
 
1044
  # Filter models if specified
1045
  selected_models = config.get('selected_models')
1046
  if selected_models and len(selected_models) < len(trainer.models):
@@ -1050,7 +1255,9 @@ def main():
1050
  # Update feature selection based on max_features
1051
  trainer.feature_selection_k = min(trainer.feature_selection_k, trainer.max_features)
1052
 
1053
- logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds: {config}")
 
 
1054
 
1055
  except Exception as e:
1056
  logger.warning(f"Failed to load configuration: {e}, using defaults")
@@ -1059,6 +1266,17 @@ def main():
1059
 
1060
  if success:
1061
  print(f"βœ… {message}")
 
 
 
 
 
 
 
 
 
 
 
1062
  else:
1063
  print(f"❌ {message}")
1064
  exit(1)
 
1
+ # Enhanced version with comprehensive cross-validation and advanced feature engineering
 
2
 
3
  import seaborn as sns
4
  import matplotlib.pyplot as plt
 
33
  import re
34
  warnings.filterwarnings('ignore')
35
 
36
+ # Import enhanced feature engineering components
37
+ try:
38
+ from features.feature_engineer import AdvancedFeatureEngineer, create_enhanced_pipeline, analyze_feature_importance
39
+ from features.sentiment_analyzer import SentimentAnalyzer
40
+ from features.readability_analyzer import ReadabilityAnalyzer
41
+ from features.entity_analyzer import EntityAnalyzer
42
+ from features.linguistic_analyzer import LinguisticAnalyzer
43
+ ENHANCED_FEATURES_AVAILABLE = True
44
+ logger = logging.getLogger(__name__)
45
+ logger.info("Enhanced feature engineering components loaded successfully")
46
+ except ImportError as e:
47
+ ENHANCED_FEATURES_AVAILABLE = False
48
+ logger = logging.getLogger(__name__)
49
+ logger.warning(f"Enhanced features not available, falling back to basic TF-IDF: {e}")
50
+
51
  # Configure logging
52
  logging.basicConfig(
53
  level=logging.INFO,
 
126
  # Create progress bar
127
  bar_length = 30
128
  filled_length = int(bar_length * self.current_step // self.total_steps)
129
+ bar = '█' * filled_length + '▒' * (bar_length - filled_length)
130
 
131
  # Print progress (this will be visible in Streamlit logs)
132
  status_msg = f"\r{self.description}: [{bar}] {progress_pct:.1f}% | Step {self.current_step}/{self.total_steps}"
 
160
  print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
161
 
162
 
163
+ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5,
164
+ use_enhanced_features: bool = False) -> Dict:
165
+ """Estimate training time based on dataset characteristics and feature complexity"""
166
 
167
  # Base time estimates (in seconds) based on empirical testing
168
  base_times = {
 
173
  'evaluation': max(0.5, dataset_size * 0.01), # ~10ms per sample
174
  }
175
 
176
+ # Enhanced feature engineering time multipliers
177
+ if use_enhanced_features:
178
+ base_times['preprocessing'] *= 2.5 # More complex preprocessing
179
+ base_times['vectorization'] *= 1.5 # Additional feature extraction
180
+ base_times['feature_selection'] *= 2.0 # More features to select from
181
+ base_times['enhanced_feature_extraction'] = max(2.0, dataset_size * 0.05) # New step
182
+
183
  # Hyperparameter tuning multipliers
184
  tuning_multipliers = {
185
  'logistic_regression': 8 if enable_tuning else 1, # 8 param combinations
 
196
  estimates['data_loading'] = 0.5
197
  estimates['preprocessing'] = base_times['preprocessing']
198
  estimates['vectorization'] = base_times['vectorization']
199
+
200
+ if use_enhanced_features:
201
+ estimates['enhanced_feature_extraction'] = base_times['enhanced_feature_extraction']
202
+
203
  estimates['feature_selection'] = base_times['feature_selection']
204
 
205
  # Model training (now includes CV)
 
217
  # Total estimate
218
  total_estimate = sum(estimates.values())
219
 
220
+ # Add buffer for overhead (more for enhanced features)
221
+ buffer_multiplier = 1.4 if use_enhanced_features else 1.2
222
+ total_estimate *= buffer_multiplier
223
 
224
  return {
225
  'detailed_estimates': estimates,
 
227
  'total_formatted': str(timedelta(seconds=int(total_estimate))),
228
  'dataset_size': dataset_size,
229
  'enable_tuning': enable_tuning,
230
+ 'cv_folds': cv_folds,
231
+ 'use_enhanced_features': use_enhanced_features
232
  }
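
For a feel of the numbers, a simplified, self-contained version of the estimator above (the 2.5x preprocessing multiplier, the ~0.05 s per-sample enhanced-extraction cost, and the 1.4/1.2 overhead buffers come from the diff; the remaining constants are illustrative):

from datetime import timedelta

def rough_training_estimate(n_samples, use_enhanced_features=False):
    preprocessing = max(0.5, n_samples * 0.001)    # assumed ~1 ms per sample
    if use_enhanced_features:
        preprocessing *= 2.5                        # multiplier from the diff
        extraction = max(2.0, n_samples * 0.05)     # enhanced extraction step from the diff
    else:
        extraction = 0.0
    buffer = 1.4 if use_enhanced_features else 1.2  # overhead buffers from the diff
    total = (0.5 + preprocessing + extraction) * buffer
    return str(timedelta(seconds=int(total)))

print(rough_training_estimate(10_000, use_enhanced_features=True))
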
233
 
234
 
 
406
  return {'error': str(e)}
407
 
408
 
409
+ class EnhancedModelTrainer:
410
+ """Production-ready model trainer with enhanced feature engineering and comprehensive CV"""
411
 
412
+ def __init__(self, use_enhanced_features: bool = None):
413
+ # Auto-detect enhanced features if not specified
414
+ if use_enhanced_features is None:
415
+ self.use_enhanced_features = ENHANCED_FEATURES_AVAILABLE
416
+ else:
417
+ self.use_enhanced_features = use_enhanced_features and ENHANCED_FEATURES_AVAILABLE
418
+
419
  self.setup_paths()
420
  self.setup_training_config()
421
  self.setup_models()
422
  self.progress_tracker = None
423
  self.cv_manager = CrossValidationManager()
424
+
425
+ # Enhanced feature tracking
426
+ self.feature_engineer = None
427
+ self.feature_importance_results = {}
428
 
429
  def setup_paths(self):
430
  """Setup all necessary paths with proper permissions"""
 
432
  self.data_dir = self.base_dir / "data"
433
  self.model_dir = self.base_dir / "model"
434
  self.results_dir = self.base_dir / "results"
435
+ self.features_dir = self.base_dir / "features" # New for enhanced features
436
 
437
  # Create directories with proper permissions
438
+ for dir_path in [self.data_dir, self.model_dir, self.results_dir, self.features_dir]:
439
  dir_path.mkdir(parents=True, exist_ok=True)
440
  # Ensure write permissions
441
  try:
 
445
 
446
  # File paths
447
  self.data_path = self.data_dir / "combined_dataset.csv"
448
+ self.model_path = Path("/tmp/model.pkl")
449
  self.vectorizer_path = Path("/tmp/vectorizer.pkl")
450
  self.pipeline_path = Path("/tmp/pipeline.pkl")
451
  self.metadata_path = Path("/tmp/metadata.json")
452
  self.evaluation_path = self.results_dir / "evaluation_results.json"
453
+
454
+ # Enhanced feature paths
455
+ self.feature_engineer_path = Path("/tmp/feature_engineer.pkl")
456
+ self.feature_importance_path = self.results_dir / "feature_importance.json"
457
 
458
  def setup_training_config(self):
459
+ """Setup training configuration with enhanced feature parameters"""
460
  self.test_size = 0.2
461
  self.validation_size = 0.1
462
  self.random_state = 42
463
+ self.cv_folds = 5
464
+
465
+ # Enhanced feature configuration
466
+ if self.use_enhanced_features:
467
+ self.max_features = 7500 # Increased for enhanced features
468
+ self.feature_selection_k = 3000 # More features to select from
469
+ logger.info("Using enhanced feature engineering pipeline")
470
+ else:
471
+ self.max_features = 5000 # Standard TF-IDF
472
+ self.feature_selection_k = 2000
473
+ logger.info("Using standard TF-IDF feature pipeline")
474
+
475
+ # Common parameters
476
+ self.min_df = 1
477
  self.max_df = 0.95
478
+ self.ngram_range = (1, 2)
479
+ self.max_iter = 500
480
  self.class_weight = 'balanced'
 
481
 
482
  def setup_models(self):
483
  """Setup model configurations for comparison"""
 
487
  max_iter=self.max_iter,
488
  class_weight=self.class_weight,
489
  random_state=self.random_state,
490
+ n_jobs=-1
491
  ),
492
  'param_grid': {
493
+ 'model__C': [0.1, 1, 10],
494
  'model__penalty': ['l2']
495
  }
496
  },
497
  'random_forest': {
498
  'model': RandomForestClassifier(
499
+ n_estimators=50,
500
  class_weight=self.class_weight,
501
  random_state=self.random_state,
502
+ n_jobs=-1
503
  ),
504
  'param_grid': {
505
+ 'model__n_estimators': [50, 100],
506
  'model__max_depth': [10, None]
507
  }
508
  }
 
547
  return False, None, f"Need at least 2 classes, found: {unique_labels}"
548
 
549
  # Check minimum sample size for CV
550
+ min_samples_for_cv = self.cv_folds * 2
551
  if len(df) < min_samples_for_cv:
552
  logger.warning(f"Dataset size ({len(df)}) is small for {self.cv_folds}-fold CV")
 
553
  self.cv_manager.cv_folds = max(2, len(df) // 3)
554
  logger.info(f"Adjusted CV folds to {self.cv_manager.cv_folds}")
555
 
 
571
  logger.error(error_msg)
572
  return False, None, error_msg
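
The small-dataset guard above shrinks the fold count rather than failing; the same rule as a standalone sketch:

def adjust_cv_folds(n_samples, requested_folds=5):
    # Each fold needs at least 2 samples; shrink for small datasets, floor of 2 (rule from the diff)
    if n_samples < requested_folds * 2:
        return max(2, n_samples // 3)
    return requested_folds

assert adjust_cv_folds(100) == 5  # enough data: keep the requested folds
assert adjust_cv_folds(9) == 3    # 9 samples < 10: fall back to 9 // 3 = 3 folds
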
573
 
574
+ def create_preprocessing_pipeline(self, use_enhanced: bool = None) -> Pipeline:
575
+ """Create preprocessing pipeline with optional enhanced features"""
576
+
577
+ if use_enhanced is None:
578
+ use_enhanced = self.use_enhanced_features
579
 
580
  if self.progress_tracker:
581
+ feature_type = "enhanced" if use_enhanced else "standard"
582
+ self.progress_tracker.update(f"Creating {feature_type} pipeline")
583
 
584
+ if use_enhanced and ENHANCED_FEATURES_AVAILABLE:
585
+ logger.info("Creating enhanced feature engineering pipeline...")
586
+
587
+ # Create enhanced feature engineer
588
+ feature_engineer = AdvancedFeatureEngineer(
589
+ enable_sentiment=True,
590
+ enable_readability=True,
591
+ enable_entities=True,
592
+ enable_linguistic=True,
593
+ feature_selection_k=self.feature_selection_k,
594
+ tfidf_max_features=self.max_features,
595
+ ngram_range=self.ngram_range,
596
+ min_df=self.min_df,
597
+ max_df=self.max_df
598
+ )
599
+
600
+ # Create pipeline with enhanced features
601
+ pipeline = Pipeline([
602
+ ('enhanced_features', feature_engineer),
603
+ ('model', None) # Will be set during training
604
+ ])
605
+
606
+ # Store reference for later use
607
+ self.feature_engineer = feature_engineer
608
+
609
+ else:
610
+ logger.info("Creating standard TF-IDF pipeline...")
611
+
612
+ # Use the standalone function instead of lambda
613
+ text_preprocessor = FunctionTransformer(
614
+ func=preprocess_text_function,
615
+ validate=False
616
+ )
617
 
618
+ # TF-IDF vectorization with optimized parameters
619
+ vectorizer = TfidfVectorizer(
620
+ max_features=self.max_features,
621
+ min_df=self.min_df,
622
+ max_df=self.max_df,
623
+ ngram_range=self.ngram_range,
624
+ stop_words='english',
625
+ sublinear_tf=True,
626
+ norm='l2'
627
+ )
628
 
629
+ # Feature selection
630
+ feature_selector = SelectKBest(
631
+ score_func=chi2,
632
+ k=min(self.feature_selection_k, self.max_features)
633
+ )
634
 
635
+ # Create standard pipeline
636
+ pipeline = Pipeline([
637
+ ('preprocess', text_preprocessor),
638
+ ('vectorize', vectorizer),
639
+ ('feature_select', feature_selector),
640
+ ('model', None) # Will be set during training
641
+ ])
642
 
643
  return pipeline
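
AdvancedFeatureEngineer itself lives in the project's features/ package and is not shown in this diff. For readers without that package, a rough stand-in using stock scikit-learn, assuming the enhanced step concatenates TF-IDF with handcrafted numeric features (the toy text statistics below are illustrative, not the commit's actual features):

import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

def simple_text_stats(texts):
    # Two toy handcrafted features per document: character length and exclamation count
    return np.array([[len(t), t.count('!')] for t in texts], dtype=float)

combined = FeatureUnion([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')),
    ('stats', FunctionTransformer(simple_text_stats, validate=False)),
])

pipeline = Pipeline([
    ('features', combined),
    ('model', None),  # filled in during training, as in the diff
])
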
644
 
645
  def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
646
+ """Comprehensive model evaluation with enhanced feature analysis"""
647
 
648
  if self.progress_tracker:
649
  self.progress_tracker.update("Evaluating model")
 
681
  cv_f1_std = cv_results['test_scores']['f1']['std']
682
  logger.info(f"CV F1 Score: {cv_f1_mean:.4f} (Β±{cv_f1_std:.4f})")
683
 
684
+ # Enhanced feature analysis
685
+ if self.use_enhanced_features and self.feature_engineer is not None:
686
+ try:
687
+ # Get feature importance if available
688
+ if hasattr(self.feature_engineer, 'get_feature_importance'):
689
+ feature_importance = self.feature_engineer.get_feature_importance(top_k=20)
690
+ metrics['top_features'] = feature_importance
691
+
692
+ # Get feature metadata
693
+ if hasattr(self.feature_engineer, 'get_feature_metadata'):
694
+ feature_metadata = self.feature_engineer.get_feature_metadata()
695
+ metrics['feature_metadata'] = feature_metadata
696
+
697
+ logger.info(f"Enhanced features used: {feature_metadata['total_features']}")
698
+ logger.info(f"Feature breakdown: {feature_metadata['feature_types']}")
699
+
700
+ except Exception as e:
701
+ logger.warning(f"Enhanced feature analysis failed: {e}")
702
+
703
  # Training accuracy for overfitting detection
704
  try:
705
  if X_train is not None and y_train is not None:
 
717
  """Perform hyperparameter tuning with nested cross-validation"""
718
 
719
  if self.progress_tracker:
720
+ feature_type = "enhanced" if self.use_enhanced_features else "standard"
721
+ self.progress_tracker.update(f"Tuning {model_name} with {feature_type} features")
722
 
723
  try:
724
  # Set the model in the pipeline
 
813
  raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
814
 
815
  def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
816
+ """Train and evaluate multiple models with enhanced features and comprehensive CV"""
817
 
818
  results = {}
819
 
820
  for model_name in self.models.keys():
821
+ logger.info(f"Training {model_name} with {'enhanced' if self.use_enhanced_features else 'standard'} features...")
822
 
823
  try:
824
+ # Create pipeline (enhanced or standard)
825
  pipeline = self.create_preprocessing_pipeline()
826
 
827
  # Hyperparameter tuning with CV
 
839
  'model': best_model,
840
  'tuning_results': tuning_results,
841
  'evaluation_metrics': evaluation_metrics,
842
+ 'training_time': datetime.now().isoformat(),
843
+ 'feature_type': 'enhanced' if self.use_enhanced_features else 'standard'
844
  }
845
 
846
  # Log results
 
896
  return best_model_name, best_model, best_metrics
897
 
898
  def save_model_artifacts(self, model, model_name: str, metrics: Dict, results: Dict) -> bool:
899
+ """Save model artifacts and enhanced metadata with feature engineering results"""
900
  try:
901
  if self.progress_tracker:
902
  self.progress_tracker.update("Saving model")
 
912
  joblib.dump(model, alt_pipeline_path)
913
  logger.info(f"βœ… Saved pipeline to {alt_pipeline_path}")
914
 
915
+ # Save enhanced feature engineer if available
916
+ if self.use_enhanced_features and self.feature_engineer is not None:
917
+ try:
918
+ self.feature_engineer.save_pipeline(self.feature_engineer_path)
919
+ logger.info(f"βœ… Saved feature engineer to {self.feature_engineer_path}")
920
+ except Exception as e:
921
+ logger.warning(f"Could not save feature engineer: {e}")
922
 
923
+ # Save individual components for backward compatibility
924
  try:
925
+ if hasattr(model, 'named_steps'):
926
+ if 'model' in model.named_steps:
927
+ joblib.dump(model.named_steps['model'], self.model_path)
928
+ logger.info(f"βœ… Saved model component to {self.model_path}")
929
+
930
+ # Save vectorizer (standard pipeline) or enhanced features reference
931
+ if 'vectorize' in model.named_steps:
932
+ joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
933
+ logger.info(f"βœ… Saved vectorizer to {self.vectorizer_path}")
934
+ elif 'enhanced_features' in model.named_steps:
935
+ # Save reference to enhanced features
936
+ enhanced_ref = {
937
+ 'type': 'enhanced_features',
938
+ 'feature_engineer_path': str(self.feature_engineer_path),
939
+ 'metadata': self.feature_engineer.get_feature_metadata() if self.feature_engineer else {}
940
+ }
941
+ joblib.dump(enhanced_ref, self.vectorizer_path)
942
+ logger.info(f"βœ… Saved enhanced features reference to {self.vectorizer_path}")
943
+
944
  except Exception as e:
945
+ logger.warning(f"Could not save individual components: {e}")
946
 
947
  # Generate data hash
948
  data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
 
950
  # Extract CV results
951
  cv_results = metrics.get('cross_validation', {})
952
 
953
+ # Create enhanced metadata with feature engineering information
954
  metadata = {
955
  'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
956
  'model_type': model_name,
957
+ 'feature_engineering': {
958
+ 'type': 'enhanced' if self.use_enhanced_features else 'standard',
959
+ 'enhanced_features_available': ENHANCED_FEATURES_AVAILABLE,
960
+ 'enhanced_features_used': self.use_enhanced_features
961
+ },
962
  'data_version': data_hash,
963
  'test_accuracy': metrics['accuracy'],
964
  'test_f1': metrics['f1'],
 
972
  'cv_folds': self.cv_folds,
973
  'max_features': self.max_features,
974
  'ngram_range': self.ngram_range,
975
+ 'feature_selection_k': self.feature_selection_k,
976
+ 'use_enhanced_features': self.use_enhanced_features
977
  }
978
  }
979
 
980
+ # Add enhanced feature metadata
981
+ if self.use_enhanced_features:
982
+ feature_metadata = metrics.get('feature_metadata', {})
983
+ if feature_metadata:
984
+ metadata['enhanced_features'] = {
985
+ 'total_features': feature_metadata.get('total_features', 0),
986
+ 'feature_types': feature_metadata.get('feature_types', {}),
987
+ 'configuration': feature_metadata.get('configuration', {})
988
+ }
989
+
990
+ # Add top features if available
991
+ top_features = metrics.get('top_features', {})
992
+ if top_features:
993
+ metadata['top_features'] = dict(list(top_features.items())[:10]) # Top 10 features
994
+
995
+ # Save detailed feature importance
996
+ try:
997
+ feature_analysis = {
998
+ 'top_features': top_features,
999
+ 'feature_metadata': feature_metadata,
1000
+ 'timestamp': datetime.now().isoformat(),
1001
+ 'model_version': metadata['model_version']
1002
+ }
1003
+
1004
+ with open(self.feature_importance_path, 'w') as f:
1005
+ json.dump(feature_analysis, f, indent=2)
1006
+ logger.info(f"βœ… Saved feature importance analysis to {self.feature_importance_path}")
1007
+
1008
+ except Exception as e:
1009
+ logger.warning(f"Could not save feature importance: {e}")
1010
+
1011
  # Add comprehensive CV results to metadata
1012
  if cv_results and 'test_scores' in cv_results:
1013
  metadata['cross_validation'] = {
 
1051
  except Exception as e:
1052
  logger.warning(f"Could not save metadata: {e}")
1053
 
1054
+ # Log feature engineering summary
1055
+ if self.use_enhanced_features and feature_metadata:
1056
+ logger.info(f"βœ… Enhanced features summary:")
1057
+ logger.info(f" Total features: {feature_metadata.get('total_features', 0)}")
1058
+ for feature_type, count in feature_metadata.get('feature_types', {}).items():
1059
+ logger.info(f" {feature_type}: {count}")
1060
+
1061
+ logger.info(f"βœ… Model artifacts saved successfully with {'enhanced' if self.use_enhanced_features else 'standard'} features")
1062
  return True
1063
 
1064
  except Exception as e:
 
1072
  logger.error(f"Failed to save backup pipeline: {str(e2)}")
1073
  return False
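
On the consuming side, the artifacts written above can be loaded back with joblib and json. A hedged sketch (the /tmp paths and the feature_engineering metadata key are taken from the diff; it assumes the pipeline was saved fitted and the metadata JSON was written successfully):

import json
import joblib

pipeline = joblib.load('/tmp/pipeline.pkl')     # full sklearn Pipeline saved above
with open('/tmp/metadata.json') as f:
    metadata = json.load(f)

# The metadata records which feature mode produced the model
print(metadata['feature_engineering']['type'])  # 'enhanced' or 'standard'
print(pipeline.predict(['Some article text to classify']))
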
1074
 
1075
+ def train_model(self, data_path: str = None, force_enhanced: bool = None) -> Tuple[bool, str]:
1076
+ """Main training function with enhanced feature engineering pipeline"""
1077
  try:
1078
+ # Override enhanced features setting if specified
1079
+ if force_enhanced is not None:
1080
+ original_setting = self.use_enhanced_features
1081
+ self.use_enhanced_features = force_enhanced and ENHANCED_FEATURES_AVAILABLE
1082
+ if force_enhanced and not ENHANCED_FEATURES_AVAILABLE:
1083
+ logger.warning("Enhanced features requested but not available, using standard features")
1084
+
1085
+ feature_type = "enhanced" if self.use_enhanced_features else "standard"
1086
+ logger.info(f"Starting {feature_type} model training with cross-validation...")
1087
 
1088
  # Override data path if provided
1089
  if data_path:
 
1098
  time_estimate = estimate_training_time(
1099
  len(df),
1100
  enable_tuning=True,
1101
+ cv_folds=self.cv_folds,
1102
+ use_enhanced_features=self.use_enhanced_features
1103
  )
1104
 
1105
  print(f"\nπŸ“Š Enhanced Training Configuration:")
1106
  print(f"Dataset size: {len(df)} samples")
1107
+ print(f"Feature engineering: {feature_type.title()}")
1108
  print(f"Cross-validation folds: {self.cv_folds}")
1109
  print(f"Estimated time: {time_estimate['total_formatted']}")
1110
  print(f"Models to train: {len(self.models)}")
1111
  print(f"Hyperparameter tuning: Enabled")
1112
+ if self.use_enhanced_features:
1113
+ print(f"Enhanced features: Sentiment, Readability, Entities, Linguistic")
1114
  print()
1115
 
1116
+ # Setup progress tracker (adjusted for enhanced features)
1117
+ base_steps = 4 + (len(self.models) * 3) + 1 # Basic steps
1118
+ enhanced_steps = 2 if self.use_enhanced_features else 0 # Feature engineering steps
1119
+ total_steps = base_steps + enhanced_steps
1120
+ self.progress_tracker = ProgressTracker(total_steps, f"{feature_type.title()} Training Progress")
1121
 
1122
  # Prepare data
1123
  X = df['text'].values
 
1152
  if len(X_test) < 1:
1153
  return False, "Cannot create test set. Dataset too small."
1154
 
1155
+ # Train and evaluate models with enhanced features
1156
  results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
1157
 
1158
  # Select best model
1159
  best_model_name, best_model, best_metrics = self.select_best_model(results)
1160
 
1161
+ # Save model artifacts with enhanced feature information
1162
  if not self.save_model_artifacts(best_model, best_model_name, best_metrics, results):
1163
  return False, "Failed to save model artifacts"
1164
 
1165
  # Finish progress tracking
1166
  self.progress_tracker.finish()
1167
 
1168
+ # Create success message with enhanced feature information
1169
  cv_results = best_metrics.get('cross_validation', {})
1170
  cv_info = ""
1171
  if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
 
1173
  cv_f1_std = cv_results['test_scores']['f1']['std']
1174
  cv_info = f", CV F1: {cv_f1_mean:.4f} (Β±{cv_f1_std:.4f})"
1175
 
1176
+ # Enhanced features summary
1177
+ feature_info = ""
1178
+ if self.use_enhanced_features:
1179
+ feature_metadata = best_metrics.get('feature_metadata', {})
1180
+ if feature_metadata:
1181
+ total_features = feature_metadata.get('total_features', 0)
1182
+ feature_info = f", Enhanced Features: {total_features}"
1183
+
1184
  success_message = (
1185
+ f"{feature_type.title()} model training completed successfully. "
1186
  f"Best model: {best_model_name} "
1187
+ f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info}{feature_info})"
1188
  )
1189
 
1190
  logger.info(success_message)
 
1199
 
1200
 
1201
  def main():
1202
+ """Main execution function with enhanced feature engineering support"""
1203
  import argparse
1204
 
1205
  # Parse command line arguments
1206
+ parser = argparse.ArgumentParser(description='Train fake news detection model with enhanced features')
1207
  parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
1208
  parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
1209
  parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
1210
+ parser.add_argument('--enhanced_features', action='store_true', help='Force use of enhanced features')
1211
+ parser.add_argument('--standard_features', action='store_true', help='Force use of standard TF-IDF features only')
1212
  args = parser.parse_args()
1213
 
1214
+ # Determine feature engineering mode
1215
+ use_enhanced = None
1216
+ if args.enhanced_features and args.standard_features:
1217
+ logger.warning("Both --enhanced_features and --standard_features specified. Using auto-detection.")
1218
+ elif args.enhanced_features:
1219
+ use_enhanced = True
1220
+ logger.info("Enhanced features explicitly requested")
1221
+ elif args.standard_features:
1222
+ use_enhanced = False
1223
+ logger.info("Standard features explicitly requested")
1224
+
1225
+ trainer = EnhancedModelTrainer(use_enhanced_features=use_enhanced)
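
The two flags above resolve to a three-state mode (force on, force off, auto-detect). The equivalent logic as a standalone sketch, with example invocations (script path assumed from the repo layout):

def resolve_mode(enhanced_flag, standard_flag):
    # Both flags set: fall back to auto-detection (None), matching the warning path above
    if enhanced_flag and standard_flag:
        return None
    if enhanced_flag:
        return True
    if standard_flag:
        return False
    return None

# Example invocations:
#   python model/train.py --enhanced_features --cv_folds 5
#   python model/train.py --standard_features --data_path data/combined_dataset.csv
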
1226
 
1227
  # Apply CV folds from command line
1228
  if args.cv_folds:
 
1242
  trainer.max_features = config.get('max_features', trainer.max_features)
1243
  trainer.ngram_range = tuple(config.get('ngram_range', trainer.ngram_range))
1244
 
1245
+ # Enhanced feature configuration
1246
+ if 'enhanced_features' in config and use_enhanced is None:
1247
+ trainer.use_enhanced_features = config['enhanced_features'] and ENHANCED_FEATURES_AVAILABLE
1248
+
1249
  # Filter models if specified
1250
  selected_models = config.get('selected_models')
1251
  if selected_models and len(selected_models) < len(trainer.models):
 
1255
  # Update feature selection based on max_features
1256
  trainer.feature_selection_k = min(trainer.feature_selection_k, trainer.max_features)
1257
 
1258
+ logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds")
1259
+ if trainer.use_enhanced_features:
1260
+ logger.info("Enhanced features enabled via configuration")
1261
 
1262
  except Exception as e:
1263
  logger.warning(f"Failed to load configuration: {e}, using defaults")
 
1266
 
1267
  if success:
1268
  print(f"βœ… {message}")
1269
+
1270
+ # Print feature engineering summary
1271
+ if trainer.use_enhanced_features and trainer.feature_engineer:
1272
+ try:
1273
+ metadata = trainer.feature_engineer.get_feature_metadata()
1274
+ print(f"\nπŸ“ˆ Enhanced Feature Engineering Summary:")
1275
+ print(f"Total features generated: {metadata['total_features']}")
1276
+ for feature_type, count in metadata['feature_types'].items():
1277
+ print(f" {feature_type}: {count}")
1278
+ except Exception as e:
1279
+ logger.warning(f"Could not display feature summary: {e}")
1280
  else:
1281
  print(f"❌ {message}")
1282
  exit(1)