Ahmedik95316 committed on
Commit ed2e413 · 1 Parent(s): dc74021

Update model/train.py


Restoring previous working version

Files changed (1)
  1. model/train.py +719 -951
model/train.py CHANGED
@@ -1,4 +1,4 @@
- # Enhanced model/train.py with LightGBM ensemble integration
 
  import seaborn as sns
  import matplotlib.pyplot as plt
@@ -14,7 +14,7 @@ from sklearn.model_selection import (
      train_test_split, cross_val_score, GridSearchCV,
      StratifiedKFold, validation_curve, cross_validate
  )
- from sklearn.ensemble import RandomForestClassifier, VotingClassifier
  from sklearn.linear_model import LogisticRegression
  from sklearn.feature_extraction.text import TfidfVectorizer
  import pandas as pd
@@ -31,16 +31,6 @@ from datetime import datetime, timedelta
  from typing import Dict, Tuple, Optional, Any, List
  import warnings
  import re
-
- # LightGBM import
- try:
-     import lightgbm as lgb
-     LIGHTGBM_AVAILABLE = True
-     logging.info("LightGBM available for ensemble training")
- except ImportError:
-     LIGHTGBM_AVAILABLE = False
-     logging.warning("LightGBM not available - ensemble training will use alternative algorithms")
-
  warnings.filterwarnings('ignore')
 
  # Import enhanced feature engineering components
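The hunk above drops the guarded LightGBM import. For reference, a minimal self-contained sketch of that optional-dependency pattern (module and flag names follow the removed code; routing the message through a named logger is my assumption, since the original logged at import time before basicConfig ran):

import logging

# Guarded import: degrade gracefully when the optional dependency is missing,
# instead of failing at import time.
try:
    import lightgbm as lgb  # optional; only needed for ensemble training
    LIGHTGBM_AVAILABLE = True
except ImportError:
    lgb = None
    LIGHTGBM_AVAILABLE = False
    logging.getLogger(__name__).warning(
        "LightGBM not available - ensemble training will use alternative algorithms")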
@@ -70,727 +60,454 @@ logging.basicConfig(
  logger = logging.getLogger(__name__)
 
 
- class EnsembleModelTrainer:
-     """Production-ready ensemble model trainer with LightGBM integration"""
-
-     def __init__(self, use_enhanced_features: bool = None, use_ensemble: bool = True):
-         # Auto-detect enhanced features if not specified
-         if use_enhanced_features is None:
-             self.use_enhanced_features = ENHANCED_FEATURES_AVAILABLE
-         else:
-             self.use_enhanced_features = use_enhanced_features and ENHANCED_FEATURES_AVAILABLE
-
-         self.use_ensemble = use_ensemble and LIGHTGBM_AVAILABLE
-
-         self.setup_paths()
-         self.setup_training_config()
-         self.setup_models()
-         self.progress_tracker = None
-         self.cv_manager = CrossValidationManager()
-
-         # Enhanced feature tracking
-         self.feature_engineer = None
-         self.feature_importance_results = {}
-
-         logger.info(f"Ensemble trainer initialized - Enhanced features: {self.use_enhanced_features}, "
-                     f"LightGBM ensemble: {self.use_ensemble}")
-
-     def setup_paths(self):
-         """Setup all necessary paths with proper permissions"""
-         self.base_dir = Path("/tmp")
-         self.data_dir = self.base_dir / "data"
-         self.model_dir = self.base_dir / "model"
-         self.results_dir = self.base_dir / "results"
-         self.features_dir = self.base_dir / "features"
-
-         # Create directories with proper permissions
-         for dir_path in [self.data_dir, self.model_dir, self.results_dir, self.features_dir]:
-             dir_path.mkdir(parents=True, exist_ok=True)
-             try:
-                 dir_path.chmod(0o755)
-             except:
-                 pass
-
-         # File paths
-         self.data_path = self.data_dir / "combined_dataset.csv"
-         self.model_path = Path("/tmp/model.pkl")
-         self.vectorizer_path = Path("/tmp/vectorizer.pkl")
-         self.pipeline_path = Path("/tmp/pipeline.pkl")
-         self.metadata_path = Path("/tmp/metadata.json")
-         self.evaluation_path = self.results_dir / "evaluation_results.json"
-
-         # Enhanced feature paths
-         self.feature_engineer_path = Path("/tmp/feature_engineer.pkl")
-         self.feature_importance_path = self.results_dir / "feature_importance.json"
-
-         # Ensemble-specific paths
-         self.ensemble_path = Path("/tmp/ensemble.pkl")
-         self.ensemble_metadata_path = Path("/tmp/ensemble_metadata.json")
-
-     def setup_training_config(self):
-         """Setup training configuration with ensemble parameters"""
-         self.test_size = 0.2
-         self.validation_size = 0.1
-         self.random_state = 42
-         self.cv_folds = 5
-
-         # Enhanced feature configuration
-         if self.use_enhanced_features:
-             self.max_features = 7500
-             self.feature_selection_k = 3000
-             logger.info("Using enhanced feature engineering pipeline")
          else:
-             self.max_features = 5000
-             self.feature_selection_k = 2000
-             logger.info("Using standard TF-IDF feature pipeline")
-
-         # Common parameters
-         self.min_df = 1
-         self.max_df = 0.95
-         self.ngram_range = (1, 2)
-         self.max_iter = 500
-         self.class_weight = 'balanced'
-
-         # LightGBM specific parameters
-         self.lgb_params = {
-             'objective': 'binary',
-             'metric': 'binary_logloss',
-             'boosting_type': 'gbdt',
-             'num_leaves': 31,
-             'learning_rate': 0.1,
-             'feature_fraction': 0.8,
-             'bagging_fraction': 0.8,
-             'bagging_freq': 5,
-             'verbose': -1,
-             'random_state': self.random_state,
-             'class_weight': 'balanced'
-         }
-
-     def setup_models(self):
-         """Setup model configurations including LightGBM ensemble"""
-         # Base models
-         self.models = {
-             'logistic_regression': {
-                 'model': LogisticRegression(
-                     max_iter=self.max_iter,
-                     class_weight=self.class_weight,
-                     random_state=self.random_state,
-                     n_jobs=-1
-                 ),
-                 'param_grid': {
-                     'model__C': [0.1, 1, 10],
-                     'model__penalty': ['l2']
-                 }
-             },
-             'random_forest': {
-                 'model': RandomForestClassifier(
-                     n_estimators=50,
-                     class_weight=self.class_weight,
-                     random_state=self.random_state,
-                     n_jobs=-1
-                 ),
-                 'param_grid': {
-                     'model__n_estimators': [50, 100],
-                     'model__max_depth': [10, None]
-                 }
-             }
          }
 
-         # Add LightGBM if available
-         if LIGHTGBM_AVAILABLE and self.use_ensemble:
-             self.models['lightgbm'] = {
-                 'model': lgb.LGBMClassifier(
-                     **self.lgb_params,
-                     n_estimators=100
-                 ),
-                 'param_grid': {
-                     'model__n_estimators': [50, 100],
-                     'model__learning_rate': [0.05, 0.1],
-                     'model__num_leaves': [31, 63]
-                 }
-             }
-
-     def create_lightgbm_ensemble(self, models_dict: Dict, X_train, y_train) -> VotingClassifier:
-         """Create ensemble with LightGBM and traditional models"""
-         if not LIGHTGBM_AVAILABLE:
-             logger.warning("LightGBM not available for ensemble creation")
-             return None
-
-         logger.info("Creating LightGBM ensemble model...")
-
-         # Prepare estimators for voting classifier
-         estimators = []
-
-         for model_name, model_info in models_dict.items():
-             if 'best_estimator' in model_info:
-                 model = model_info['best_estimator']
-                 # Extract the actual model from pipeline
-                 if hasattr(model, 'named_steps') and 'model' in model.named_steps:
-                     actual_model = model.named_steps['model']
-                 else:
-                     actual_model = model
-
-                 estimators.append((model_name, actual_model))
-
-         if len(estimators) < 2:
-             logger.warning("Not enough models for ensemble creation")
-             return None
-
-         # Create ensemble with soft voting for probability-based predictions
-         ensemble = VotingClassifier(
-             estimators=estimators,
-             voting='soft'
-         )
-
-         logger.info(f"Ensemble created with {len(estimators)} models: {[name for name, _ in estimators]}")
-         return ensemble
-
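For context, the removed create_lightgbm_ensemble reduces to scikit-learn's soft-voting API. A minimal sketch, with synthetic data and estimator settings that are illustrative rather than the project's:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=20, random_state=42)

# Soft voting averages predict_proba across estimators, which is why the
# removed code required probability-capable models.
ensemble = VotingClassifier(
    estimators=[
        ('logistic_regression', LogisticRegression(max_iter=500)),
        ('random_forest', RandomForestClassifier(n_estimators=50, random_state=42)),
    ],
    voting='soft',
)
# Note: VotingClassifier clones and refits every estimator inside fit(), so
# the "already trained" components extracted above were retrained anyway.
ensemble.fit(X, y)
print(ensemble.predict(X[:5]))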
-     def train_ensemble_model(self, X_train, X_test, y_train, y_test, individual_results: Dict) -> Dict:
-         """Train and evaluate ensemble model"""
-         if not self.use_ensemble or not LIGHTGBM_AVAILABLE:
-             logger.info("Ensemble training skipped - using best individual model")
-             return {}
-
-         logger.info("Training ensemble model with LightGBM integration...")
-
-         try:
-             # Create ensemble from individual models
-             ensemble = self.create_lightgbm_ensemble(individual_results, X_train, y_train)
-
-             if ensemble is None:
-                 return {'error': 'Failed to create ensemble'}
-
-             # Train ensemble (models are already trained, just fitting the voting mechanism)
-             logger.info("Training ensemble voting mechanism...")
-
-             # For voting classifier with already-fitted models, we need to fit on features
-             # First, we need to prepare features the same way
-             pipeline = self.create_preprocessing_pipeline()
-             X_train_processed = pipeline.fit_transform(X_train, y_train)
-             X_test_processed = pipeline.transform(X_test)
-
-             # Fit the ensemble
-             ensemble.fit(X_train_processed, y_train)
-
-             # Evaluate ensemble
-             ensemble_metrics = self.comprehensive_evaluation_ensemble(
-                 ensemble, X_test_processed, y_test, X_train_processed, y_train
-             )
-
-             # Create ensemble pipeline for consistency
-             ensemble_pipeline = Pipeline([
-                 ('preprocessing', pipeline.steps[0][1]),  # Use same preprocessing
-                 ('ensemble', ensemble)
-             ])
-
-             ensemble_results = {
-                 'ensemble': ensemble_pipeline,
-                 'evaluation_metrics': ensemble_metrics,
-                 'component_models': list(individual_results.keys()),
-                 'ensemble_type': 'voting_classifier_with_lightgbm' if 'lightgbm' in individual_results else 'voting_classifier',
-                 'training_time': datetime.now().isoformat(),
-                 'feature_type': 'enhanced' if self.use_enhanced_features else 'standard'
-             }
-
-             logger.info(f"Ensemble training completed - F1: {ensemble_metrics.get('f1', 'N/A'):.4f}")
-             return ensemble_results
-
-         except Exception as e:
-             logger.error(f"Ensemble training failed: {str(e)}")
-             return {'error': str(e)}
-
-     def comprehensive_evaluation_ensemble(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
-         """Comprehensive evaluation specifically for ensemble models"""
-
-         logger.info("Evaluating ensemble model...")
-
-         # Predictions
-         y_pred = model.predict(X_test)
-         y_pred_proba = model.predict_proba(X_test)[:, 1]
-
-         # Basic metrics
-         metrics = {
-             'accuracy': float(accuracy_score(y_test, y_pred)),
-             'precision': float(precision_score(y_test, y_pred, average='weighted')),
-             'recall': float(recall_score(y_test, y_pred, average='weighted')),
-             'f1': float(f1_score(y_test, y_pred, average='weighted')),
-             'roc_auc': float(roc_auc_score(y_test, y_pred_proba))
-         }
-
-         # Confusion matrix
-         cm = confusion_matrix(y_test, y_pred)
-         metrics['confusion_matrix'] = cm.tolist()
-
-         # Cross-validation on full dataset
-         if X_train is not None and y_train is not None:
-             X_full = np.concatenate([X_train, X_test])
-             y_full = np.concatenate([y_train, y_test])
-
-             logger.info("Performing cross-validation on ensemble...")
-             cv_results = self.cv_manager.perform_cross_validation(model, X_full, y_full)
-             metrics['cross_validation'] = cv_results
-
-             if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
-                 cv_f1_mean = cv_results['test_scores']['f1']['mean']
-                 cv_f1_std = cv_results['test_scores']['f1']['std']
-                 logger.info(f"Ensemble CV F1 Score: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})")
-
-         # Ensemble-specific metrics
-         metrics['ensemble_info'] = {
-             'model_type': 'ensemble',
-             'voting_type': getattr(model, 'voting', 'unknown'),
-             'n_estimators': len(getattr(model, 'estimators_', [])),
-             'estimator_names': [name for name, _ in getattr(model, 'estimators', [])]
-         }
-
-         return metrics
-
-     def select_best_model(self, results: Dict, ensemble_results: Dict = None) -> Tuple[str, Any, Dict]:
-         """Select the best performing model including ensemble option"""
-
-         logger.info("Selecting best model from individual models and ensemble...")
-
-         best_model_name = None
-         best_model = None
-         best_score = -1
-         best_metrics = None
-
-         # Evaluate individual models
-         for model_name, result in results.items():
-             if 'error' in result:
-                 continue
-
-             cv_results = result['evaluation_metrics'].get('cross_validation', {})
-             if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
-                 f1_score = cv_results['test_scores']['f1']['mean']
-                 score_type = "CV F1"
-             else:
-                 f1_score = result['evaluation_metrics']['f1']
-                 score_type = "Test F1"
-
-             logger.info(f"Model {model_name}: {score_type} = {f1_score:.4f}")
-
-             if f1_score > best_score:
-                 best_score = f1_score
-                 best_model_name = model_name
-                 best_model = result['model']
-                 best_metrics = result['evaluation_metrics']
-
-         # Evaluate ensemble if available
-         if ensemble_results and 'evaluation_metrics' in ensemble_results:
-             ensemble_metrics = ensemble_results['evaluation_metrics']
-
-             cv_results = ensemble_metrics.get('cross_validation', {})
-             if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
-                 ensemble_f1 = cv_results['test_scores']['f1']['mean']
-                 score_type = "CV F1"
-             else:
-                 ensemble_f1 = ensemble_metrics['f1']
-                 score_type = "Test F1"
-
-             logger.info(f"Ensemble model: {score_type} = {ensemble_f1:.4f}")
-
-             if ensemble_f1 > best_score:
-                 best_score = ensemble_f1
-                 best_model_name = "ensemble"
-                 best_model = ensemble_results['ensemble']
-                 best_metrics = ensemble_metrics
-
-         if best_model_name is None:
-             raise ValueError("No models trained successfully")
-
-         logger.info(f"Best model selected: {best_model_name} with F1 score: {best_score:.4f}")
-         return best_model_name, best_model, best_metrics
-
-     def save_model_artifacts(self, model, model_name: str, metrics: Dict, results: Dict,
-                              ensemble_results: Dict = None) -> bool:
-         """Enhanced model artifacts saving with ensemble support"""
-         try:
-             logger.info(f"Saving model artifacts for {model_name}...")
-
-             # Save the main pipeline/model
-             if model_name == "ensemble":
-                 # Save ensemble model
-                 joblib.dump(model, self.ensemble_path)
-                 logger.info(f"Saved ensemble model to {self.ensemble_path}")
-
-                 # Also save as main pipeline for API compatibility
-                 joblib.dump(model, self.pipeline_path)
-                 logger.info(f"Saved ensemble as main pipeline to {self.pipeline_path}")
-
-                 # Save ensemble metadata
-                 ensemble_metadata = {
-                     'model_type': 'ensemble',
-                     'ensemble_type': ensemble_results.get('ensemble_type', 'voting_classifier'),
-                     'component_models': ensemble_results.get('component_models', []),
-                     'ensemble_info': metrics.get('ensemble_info', {}),
-                     'timestamp': datetime.now().isoformat()
                  }
 
-                 with open(self.ensemble_metadata_path, 'w') as f:
-                     json.dump(ensemble_metadata, f, indent=2)
-                 logger.info(f"Saved ensemble metadata to {self.ensemble_metadata_path}")
-
-             else:
-                 # Save individual model pipeline
-                 joblib.dump(model, self.pipeline_path)
-                 logger.info(f"Saved {model_name} pipeline to {self.pipeline_path}")
-
-             # Save individual components for backward compatibility
-             try:
-                 if hasattr(model, 'named_steps'):
-                     if 'model' in model.named_steps:
-                         joblib.dump(model.named_steps['model'], self.model_path)
-                     elif 'ensemble' in model.named_steps:
-                         joblib.dump(model.named_steps['ensemble'], self.model_path)
-
-                     # Save vectorizer or enhanced features reference
-                     if 'vectorize' in model.named_steps:
-                         joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
-                     elif 'enhanced_features' in model.named_steps:
-                         enhanced_ref = {
-                             'type': 'enhanced_features',
-                             'feature_engineer_path': str(self.feature_engineer_path),
-                             'metadata': self.feature_engineer.get_feature_metadata() if self.feature_engineer else {}
-                         }
-                         joblib.dump(enhanced_ref, self.vectorizer_path)
-
-             except Exception as e:
-                 logger.warning(f"Could not save individual components: {e}")
-
-             # Generate enhanced metadata
-             metadata = self._create_enhanced_metadata(model_name, metrics, results, ensemble_results)
-
-             # Save metadata
-             with open(self.metadata_path, 'w') as f:
-                 json.dump(metadata, f, indent=2)
-             logger.info(f"Saved metadata to {self.metadata_path}")
-
-             return True
-
-         except Exception as e:
-             logger.error(f"Failed to save model artifacts: {str(e)}")
-             return False
-
-     def _create_enhanced_metadata(self, model_name: str, metrics: Dict, results: Dict,
-                                   ensemble_results: Dict = None) -> Dict:
-         """Create comprehensive metadata including ensemble information"""
-
-         # Generate data hash and version
-         data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
-         version_suffix = "ensemble" if model_name == "ensemble" else model_name
-
-         metadata = {
-             'model_version': f"v2.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{version_suffix}",
-             'model_type': model_name,
-             'is_ensemble': model_name == "ensemble",
-             'data_version': data_hash,
-             'test_accuracy': metrics['accuracy'],
-             'test_f1': metrics['f1'],
-             'test_precision': metrics['precision'],
-             'test_recall': metrics['recall'],
-             'test_roc_auc': metrics['roc_auc'],
-             'timestamp': datetime.now().isoformat(),
-             'training_method': 'enhanced_ensemble_training' if self.use_ensemble else 'enhanced_individual_training',
-             'lightgbm_available': LIGHTGBM_AVAILABLE,
-             'lightgbm_used': self.use_ensemble and LIGHTGBM_AVAILABLE
-         }
-
-         # Add feature engineering information
-         metadata['feature_engineering'] = {
-             'type': 'enhanced' if self.use_enhanced_features else 'standard',
-             'enhanced_features_available': ENHANCED_FEATURES_AVAILABLE,
-             'enhanced_features_used': self.use_enhanced_features
-         }
-
-         # Add ensemble-specific metadata
-         if model_name == "ensemble" and ensemble_results:
-             metadata['ensemble_details'] = {
-                 'ensemble_type': ensemble_results.get('ensemble_type', 'voting_classifier'),
-                 'component_models': ensemble_results.get('component_models', []),
-                 'ensemble_info': metrics.get('ensemble_info', {}),
-                 'voting_type': metrics.get('ensemble_info', {}).get('voting_type', 'soft')
-             }
-
-             # Add individual model performances for comparison
-             metadata['component_performance'] = {}
-             for comp_model_name in ensemble_results.get('component_models', []):
-                 if comp_model_name in results and 'evaluation_metrics' in results[comp_model_name]:
-                     comp_metrics = results[comp_model_name]['evaluation_metrics']
-                     metadata['component_performance'][comp_model_name] = {
-                         'f1': comp_metrics.get('f1', 0),
-                         'accuracy': comp_metrics.get('accuracy', 0)
-                     }
-
-         # Add CV results
-         cv_results = metrics.get('cross_validation', {})
-         if cv_results and 'test_scores' in cv_results:
-             metadata['cross_validation'] = {
-                 'n_splits': cv_results.get('n_splits', self.cv_folds),
-                 'test_scores': cv_results['test_scores'],
-                 'train_scores': cv_results.get('train_scores', {}),
-                 'overfitting_score': cv_results.get('overfitting_score', 'Unknown'),
-                 'stability_score': cv_results.get('stability_score', 'Unknown')
-             }
-
-             if 'f1' in cv_results['test_scores']:
-                 metadata.update({
-                     'cv_f1_mean': cv_results['test_scores']['f1']['mean'],
-                     'cv_f1_std': cv_results['test_scores']['f1']['std']
-                 })
-
-         # Add training configuration
-         metadata['training_config'] = {
-             'test_size': self.test_size,
-             'cv_folds': self.cv_folds,
-             'max_features': self.max_features,
-             'use_ensemble': self.use_ensemble,
-             'use_enhanced_features': self.use_enhanced_features
-         }
-
-         return metadata
-
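One caveat worth illustrating: data_hash above digests the current timestamp, so 'data_version' identifies the training run rather than the data. If a content-based version were wanted, hashing the dataset file itself would provide it; a minimal sketch under that assumption (the helper name is hypothetical, not part of the commit):

import hashlib
from pathlib import Path

def hash_dataset(path: Path, chunk_size: int = 1 << 20) -> str:
    """Hypothetical helper: MD5 of the file's bytes, read in 1 MiB chunks."""
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

# e.g. data_hash = hash_dataset(Path("/tmp/data/combined_dataset.csv"))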
-     def train_model(self, data_path: str = None, force_enhanced: bool = None,
-                     use_ensemble: bool = None) -> Tuple[bool, str]:
-         """Main training function with ensemble support"""
          try:
-             # Override settings if specified
-             if force_enhanced is not None:
-                 original_enhanced = self.use_enhanced_features
-                 self.use_enhanced_features = force_enhanced and ENHANCED_FEATURES_AVAILABLE
-
-             if use_ensemble is not None:
-                 self.use_ensemble = use_ensemble and LIGHTGBM_AVAILABLE
-
-             feature_type = "enhanced" if self.use_enhanced_features else "standard"
-             training_type = "ensemble" if self.use_ensemble else "individual"
-
-             logger.info(f"Starting {feature_type} {training_type} model training...")
-
-             # Override data path if provided
-             if data_path:
-                 self.data_path = Path(data_path)
-
-             # Load and validate data
-             success, df, message = self.load_and_validate_data()
-             if not success:
-                 return False, message
-
-             # Estimate training time
-             time_estimate = estimate_training_time(
-                 len(df),
-                 enable_tuning=True,
-                 cv_folds=self.cv_folds,
-                 use_enhanced_features=self.use_enhanced_features,
-                 use_ensemble=self.use_ensemble
-             )
-
-             model_count = len(self.models)
-             logger.info(f"Training Configuration:")
-             logger.info(f"  Dataset size: {len(df)} samples")
-             logger.info(f"  Feature engineering: {feature_type.title()}")
-             logger.info(f"  Training approach: {training_type.title()}")
-             logger.info(f"  Models to train: {model_count}")
-             logger.info(f"  LightGBM available: {LIGHTGBM_AVAILABLE}")
-             logger.info(f"  Estimated time: {time_estimate['total_formatted']}")
-
-             # Setup progress tracker
-             base_steps = 4 + (model_count * 3) + 2  # Base + model training + ensemble
-             enhanced_steps = 2 if self.use_enhanced_features else 0
-             ensemble_steps = 3 if self.use_ensemble else 0
-             total_steps = base_steps + enhanced_steps + ensemble_steps
-
-             self.progress_tracker = ProgressTracker(
-                 total_steps,
-                 f"{feature_type.title()} {training_type.title()} Training"
-             )
-
-             # Prepare data
-             X = df['text'].values
-             y = df['label'].values
-
-             # Train-test split
-             self.progress_tracker.update("Splitting data")
-
-             if len(X) < 10:
-                 test_size = max(0.1, 1/len(X))
-             else:
-                 test_size = self.test_size
-
-             label_counts = pd.Series(y).value_counts()
-             min_class_count = label_counts.min()
-             can_stratify = min_class_count >= 2 and len(y) >= 4
-
-             X_train, X_test, y_train, y_test = train_test_split(
-                 X, y,
-                 test_size=test_size,
-                 stratify=y if can_stratify else None,
-                 random_state=self.random_state
-             )
-
-             logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test")
-
-             # Train individual models
-             results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
-
-             # Train ensemble if enabled
-             ensemble_results = {}
-             if self.use_ensemble and len([r for r in results.values() if 'error' not in r]) >= 2:
-                 self.progress_tracker.update("Creating ensemble model")
-                 ensemble_results = self.train_ensemble_model(X_train, X_test, y_train, y_test, results)
-
-                 if ensemble_results and 'error' not in ensemble_results:
-                     logger.info("Ensemble model trained successfully")
-                 else:
-                     logger.warning("Ensemble training failed, using best individual model")
-
-             # Select best model (individual or ensemble)
-             best_model_name, best_model, best_metrics = self.select_best_model(results, ensemble_results)
-
-             # Save model artifacts
-             if not self.save_model_artifacts(best_model, best_model_name, best_metrics, results, ensemble_results):
-                 return False, "Failed to save model artifacts"
-
-             # Finish progress tracking
-             self.progress_tracker.finish()
-
-             # Create success message
-             cv_results = best_metrics.get('cross_validation', {})
-             cv_info = ""
-             if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
-                 cv_f1_mean = cv_results['test_scores']['f1']['mean']
-                 cv_f1_std = cv_results['test_scores']['f1']['std']
-                 cv_info = f", CV F1: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})"
-
-             # Enhanced features info
-             feature_info = f", {feature_type.title()} Features"
-             if self.use_enhanced_features:
-                 feature_metadata = best_metrics.get('feature_metadata', {})
-                 if feature_metadata:
-                     total_features = feature_metadata.get('total_features', 0)
-                     feature_info = f", Enhanced Features: {total_features}"
-
-             # Ensemble info
-             ensemble_info = ""
-             if best_model_name == "ensemble":
-                 ensemble_details = best_metrics.get('ensemble_info', {})
-                 n_models = ensemble_details.get('n_estimators', 0)
-                 ensemble_info = f", Ensemble: {n_models} models"
-
-             success_message = (
-                 f"{training_type.title()} model training completed successfully. "
-                 f"Best model: {best_model_name} "
-                 f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info}{feature_info}{ensemble_info})"
-             )
-
-             logger.info(success_message)
-             return True, success_message
-
-         except Exception as e:
-             if self.progress_tracker:
-                 print()
-             error_message = f"Enhanced ensemble model training failed: {str(e)}"
-             logger.error(error_message)
-             return False, error_message
-
-     # Include all other methods from the original trainer (load_and_validate_data,
-     # create_preprocessing_pipeline, comprehensive_evaluation, train_and_evaluate_models, etc.)
-     # These remain largely the same but with minor modifications for ensemble support
-
-
- def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5,
-                            use_enhanced_features: bool = False, use_ensemble: bool = False) -> Dict:
-     """Enhanced time estimation including ensemble training"""
-
-     # Base time estimates (in seconds)
-     base_times = {
-         'preprocessing': max(0.1, dataset_size * 0.001),
-         'vectorization': max(0.5, dataset_size * 0.01),
-         'feature_selection': max(0.2, dataset_size * 0.005),
-         'simple_training': max(1.0, dataset_size * 0.02),
-         'evaluation': max(0.5, dataset_size * 0.01),
-     }
-
-     # Enhanced feature engineering time multipliers
-     if use_enhanced_features:
-         base_times['preprocessing'] *= 2.5
-         base_times['vectorization'] *= 1.5
-         base_times['feature_selection'] *= 2.0
-         base_times['enhanced_feature_extraction'] = max(2.0, dataset_size * 0.05)
-
-     # LightGBM training time (typically faster than RF but slower than LogReg)
-     if use_ensemble and LIGHTGBM_AVAILABLE:
-         base_times['lightgbm_training'] = max(2.0, dataset_size * 0.03)
-         base_times['ensemble_creation'] = max(1.0, dataset_size * 0.005)
-         base_times['ensemble_evaluation'] = max(1.0, dataset_size * 0.015)
-
-     # Hyperparameter tuning multipliers
-     tuning_multipliers = {
-         'logistic_regression': 8 if enable_tuning else 1,
-         'random_forest': 12 if enable_tuning else 1,
-     }
-
-     if use_ensemble and LIGHTGBM_AVAILABLE:
-         tuning_multipliers['lightgbm'] = 10 if enable_tuning else 1
-
-     # Cross-validation multiplier
-     cv_multiplier = cv_folds if dataset_size > 100 else 1
-
-     # Calculate estimates
-     estimates = {}
-
-     # Preprocessing steps
-     estimates['data_loading'] = 0.5
-     estimates['preprocessing'] = base_times['preprocessing']
-     estimates['vectorization'] = base_times['vectorization']
-
-     if use_enhanced_features:
-         estimates['enhanced_feature_extraction'] = base_times['enhanced_feature_extraction']
-
-     estimates['feature_selection'] = base_times['feature_selection']
-
-     # Model training (includes CV)
-     for model_name, multiplier in tuning_multipliers.items():
-         model_time = base_times['simple_training'] * multiplier * cv_multiplier
-         estimates[f'{model_name}_training'] = model_time
-         estimates[f'{model_name}_evaluation'] = base_times['evaluation']
-
-     # Ensemble-specific steps
-     if use_ensemble and LIGHTGBM_AVAILABLE:
-         estimates['ensemble_creation'] = base_times['ensemble_creation']
-         estimates['ensemble_evaluation'] = base_times['ensemble_evaluation']
-         estimates['ensemble_cross_validation'] = base_times['simple_training'] * cv_folds * 0.3
-
-     # Cross-validation overhead
-     estimates['cross_validation'] = base_times['simple_training'] * cv_folds * 0.5
-
-     # Model saving
-     estimates['model_saving'] = 1.0
-
-     # Total estimate
-     total_estimate = sum(estimates.values())
-
-     # Add buffer for overhead
-     buffer_multiplier = 1.6 if use_ensemble else (1.4 if use_enhanced_features else 1.2)
-     total_estimate *= buffer_multiplier
-
-     return {
-         'detailed_estimates': estimates,
-         'total_seconds': total_estimate,
-         'total_formatted': str(timedelta(seconds=int(total_estimate))),
-         'dataset_size': dataset_size,
-         'enable_tuning': enable_tuning,
-         'cv_folds': cv_folds,
-         'use_enhanced_features': use_enhanced_features,
-         'use_ensemble': use_ensemble,
-         'lightgbm_available': LIGHTGBM_AVAILABLE
-     }
 
- # Import all remaining methods from original trainer class
- class EnhancedModelTrainer(EnsembleModelTrainer):
-     """Complete enhanced model trainer inheriting from ensemble trainer"""
-
      def load_and_validate_data(self) -> Tuple[bool, Optional[pd.DataFrame], str]:
          """Load and validate training data"""
          try:
@@ -1142,260 +859,356 @@ class EnhancedModelTrainer(EnsembleModelTrainer):
 
          return results
 
-
- # Continue with ProgressTracker and CrossValidationManager classes from original...
- class ProgressTracker:
-     """Progress tracking with time estimation"""
-
-     def __init__(self, total_steps: int, description: str = "Training"):
-         self.total_steps = total_steps
-         self.current_step = 0
-         self.start_time = time.time()
-         self.description = description
-         self.step_times = []
-
-     def update(self, step_name: str = ""):
-         """Update progress and print status"""
-         self.current_step += 1
-         current_time = time.time()
-         elapsed = current_time - self.start_time
-
-         # Calculate progress percentage
-         progress_pct = (self.current_step / self.total_steps) * 100
-
-         # Estimate remaining time
-         if self.current_step > 0:
-             avg_time_per_step = elapsed / self.current_step
-             remaining_steps = self.total_steps - self.current_step
-             eta_seconds = avg_time_per_step * remaining_steps
-             eta = timedelta(seconds=int(eta_seconds))
-         else:
-             eta = "calculating..."
-
-         # Create progress bar
-         bar_length = 30
-         filled_length = int(bar_length * self.current_step // self.total_steps)
-         bar = '█' * filled_length + '░' * (bar_length - filled_length)
-
-         # Print progress (this will be visible in Streamlit logs)
-         status_msg = f"\r{self.description}: [{bar}] {progress_pct:.1f}% | Step {self.current_step}/{self.total_steps}"
-         if step_name:
-             status_msg += f" | {step_name}"
-         if eta != "calculating...":
-             status_msg += f" | ETA: {eta}"
-
-         print(status_msg, end='', flush=True)
-
-         # Also output JSON for Streamlit parsing (if needed)
-         progress_json = {
-             "type": "progress",
-             "step": self.current_step,
-             "total": self.total_steps,
-             "percentage": progress_pct,
-             "eta": str(eta) if eta != "calculating..." else None,
-             "step_name": step_name,
-             "elapsed": elapsed
-         }
-         print(f"\nPROGRESS_JSON: {json.dumps(progress_json)}")
-
-         # Store step time for better estimation
-         if len(self.step_times) >= 3:
-             self.step_times.pop(0)
-         self.step_times.append(current_time - (self.start_time + sum(self.step_times)))
-
-     def finish(self):
-         """Complete progress tracking"""
-         total_time = time.time() - self.start_time
-         print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
-
-
- class CrossValidationManager:
-     """Advanced cross-validation management with comprehensive metrics"""
-
-     def __init__(self, cv_folds: int = 5, random_state: int = 42):
-         self.cv_folds = cv_folds
-         self.random_state = random_state
-         self.cv_results = {}
-
-     def create_cv_strategy(self, X, y) -> StratifiedKFold:
-         """Create appropriate CV strategy based on data characteristics"""
-         # Calculate appropriate CV folds for small datasets
-         n_samples = len(X)
-         min_samples_per_fold = 3  # Minimum samples per fold
-         max_folds = n_samples // min_samples_per_fold
-
-         # Adjust folds based on data size and class distribution
-         unique_classes = np.unique(y)
-         min_class_count = min([np.sum(y == cls) for cls in unique_classes])
-
-         # Ensure each fold has at least one sample from each class
-         max_folds_by_class = min_class_count
-
-         actual_folds = max(2, min(self.cv_folds, max_folds, max_folds_by_class))
-
-         logger.info(f"Using {actual_folds} CV folds (requested: {self.cv_folds})")
-
-         return StratifiedKFold(
-             n_splits=actual_folds,
-             shuffle=True,
-             random_state=self.random_state
-         )
-
-     def perform_cross_validation(self, pipeline, X, y, cv_strategy=None) -> Dict:
-         """Perform comprehensive cross-validation with multiple metrics"""
-
-         if cv_strategy is None:
-             cv_strategy = self.create_cv_strategy(X, y)
-
-         logger.info(f"Starting cross-validation with {cv_strategy.n_splits} folds...")
-
-         # Define scoring metrics
-         scoring_metrics = {
-             'accuracy': 'accuracy',
-             'precision': 'precision_weighted',
-             'recall': 'recall_weighted',
-             'f1': 'f1_weighted',
-             'roc_auc': 'roc_auc'
-         }
-
          try:
-             # Perform cross-validation
-             cv_scores = cross_validate(
-                 pipeline, X, y,
-                 cv=cv_strategy,
-                 scoring=scoring_metrics,
-                 return_train_score=True,
-                 n_jobs=1,  # Use single job for stability
-                 verbose=0
-             )
-
-             # Process results
-             cv_results = {
-                 'n_splits': cv_strategy.n_splits,
-                 'test_scores': {},
-                 'train_scores': {},
-                 'fold_results': []
              }
 
-             # Calculate statistics for each metric
-             for metric_name in scoring_metrics.keys():
-                 test_key = f'test_{metric_name}'
-                 train_key = f'train_{metric_name}'
-
-                 if test_key in cv_scores:
-                     test_scores = cv_scores[test_key]
-                     cv_results['test_scores'][metric_name] = {
-                         'mean': float(np.mean(test_scores)),
-                         'std': float(np.std(test_scores)),
-                         'min': float(np.min(test_scores)),
-                         'max': float(np.max(test_scores)),
-                         'scores': test_scores.tolist()
                      }
 
-                 if train_key in cv_scores:
-                     train_scores = cv_scores[train_key]
-                     cv_results['train_scores'][metric_name] = {
-                         'mean': float(np.mean(train_scores)),
-                         'std': float(np.std(train_scores)),
-                         'min': float(np.min(train_scores)),
-                         'max': float(np.max(train_scores)),
-                         'scores': train_scores.tolist()
-                     }
-
-             # Store individual fold results
-             for fold_idx in range(cv_strategy.n_splits):
-                 fold_result = {
-                     'fold': fold_idx + 1,
-                     'test_scores': {},
-                     'train_scores': {}
                  }
 
-                 for metric_name in scoring_metrics.keys():
-                     test_key = f'test_{metric_name}'
-                     train_key = f'train_{metric_name}'
-
-                     if test_key in cv_scores:
-                         fold_result['test_scores'][metric_name] = float(cv_scores[test_key][fold_idx])
-                     if train_key in cv_scores:
-                         fold_result['train_scores'][metric_name] = float(cv_scores[train_key][fold_idx])
-
-                 cv_results['fold_results'].append(fold_result)
-
-             # Calculate overfitting indicators
-             if 'accuracy' in cv_results['test_scores'] and 'accuracy' in cv_results['train_scores']:
-                 train_mean = cv_results['train_scores']['accuracy']['mean']
-                 test_mean = cv_results['test_scores']['accuracy']['mean']
-                 cv_results['overfitting_score'] = float(train_mean - test_mean)
-
-             # Calculate stability metrics
-             if 'accuracy' in cv_results['test_scores']:
-                 test_std = cv_results['test_scores']['accuracy']['std']
-                 test_mean = cv_results['test_scores']['accuracy']['mean']
-                 cv_results['stability_score'] = float(1 - (test_std / test_mean)) if test_mean > 0 else 0
-
-             logger.info(f"Cross-validation completed successfully")
-             logger.info(f"Mean test accuracy: {cv_results['test_scores'].get('accuracy', {}).get('mean', 'N/A'):.4f}")
-             logger.info(f"Mean test F1: {cv_results['test_scores'].get('f1', {}).get('mean', 'N/A'):.4f}")
-
-             return cv_results
-
-         except Exception as e:
-             logger.error(f"Cross-validation failed: {e}")
-             return {
-                 'error': str(e),
-                 'n_splits': cv_strategy.n_splits if cv_strategy else self.cv_folds,
-                 'fallback': True
-             }
-
-
- def preprocess_text_function(texts):
-     """
-     Standalone function for text preprocessing - pickle-safe
-     """
-     def clean_single_text(text):
-         # Convert to string
-         text = str(text)
-
-         # Remove URLs
-         text = re.sub(r'http\S+|www\S+|https\S+', '', text)
-
-         # Remove email addresses
-         text = re.sub(r'\S+@\S+', '', text)
-
-         # Remove excessive punctuation
-         text = re.sub(r'[!]{2,}', '!', text)
-         text = re.sub(r'[?]{2,}', '?', text)
-         text = re.sub(r'[.]{3,}', '...', text)
-
-         # Remove non-alphabetic characters except spaces and basic punctuation
-         text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
-
-         # Remove excessive whitespace
-         text = re.sub(r'\s+', ' ', text)
-
-         return text.strip().lower()
-
-     # Process all texts
-     processed = []
-     for text in texts:
-         processed.append(clean_single_text(text))
-
-     return processed
 
 
  def main():
-     """Main execution function with enhanced ensemble support"""
      import argparse
 
      # Parse command line arguments
-     parser = argparse.ArgumentParser(description='Train fake news detection model with LightGBM ensemble')
      parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
      parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
      parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
      parser.add_argument('--enhanced_features', action='store_true', help='Force use of enhanced features')
      parser.add_argument('--standard_features', action='store_true', help='Force use of standard TF-IDF features only')
-     parser.add_argument('--ensemble', action='store_true', help='Force use of LightGBM ensemble')
-     parser.add_argument('--no_ensemble', action='store_true', help='Disable ensemble training')
      args = parser.parse_args()
 
      # Determine feature engineering mode
@@ -1409,18 +1222,7 @@ def main():
          use_enhanced = False
          logger.info("Standard features explicitly requested")
 
-     # Determine ensemble mode
-     use_ensemble = None
-     if args.ensemble and args.no_ensemble:
-         logger.warning("Both --ensemble and --no_ensemble specified. Using auto-detection.")
-     elif args.ensemble:
-         use_ensemble = True
-         logger.info("LightGBM ensemble explicitly requested")
-     elif args.no_ensemble:
-         use_ensemble = False
-         logger.info("Ensemble training explicitly disabled")
-
-     trainer = EnhancedModelTrainer(use_enhanced_features=use_enhanced, use_ensemble=use_ensemble)
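The removed block resolves the --ensemble / --no_ensemble conflict by hand and falls back to auto-detection. argparse can enforce the same constraint up front; a small sketch of that alternative, reusing the removed flag names:

import argparse

parser = argparse.ArgumentParser()
# A mutually exclusive group rejects contradictory flags at parse time
# instead of warning about them at runtime.
group = parser.add_mutually_exclusive_group()
group.add_argument('--ensemble', dest='use_ensemble', action='store_true',
                   help='Force use of LightGBM ensemble')
group.add_argument('--no_ensemble', dest='use_ensemble', action='store_false',
                   help='Disable ensemble training')
parser.set_defaults(use_ensemble=None)  # None means auto-detect

args = parser.parse_args(['--ensemble'])
print(args.use_ensemble)  # True; passing both flags exits with a usage error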
 
      # Apply CV folds from command line
      if args.cv_folds:
@@ -1444,14 +1246,6 @@ def main():
              if 'enhanced_features' in config and use_enhanced is None:
                  trainer.use_enhanced_features = config['enhanced_features'] and ENHANCED_FEATURES_AVAILABLE
 
-             # Ensemble configuration
-             if 'use_ensemble' in config and use_ensemble is None:
-                 trainer.use_ensemble = config['use_ensemble'] and LIGHTGBM_AVAILABLE
-
-             # LightGBM specific parameters
-             if 'lightgbm_params' in config:
-                 trainer.lgb_params.update(config['lightgbm_params'])
-
              # Filter models if specified
              selected_models = config.get('selected_models')
              if selected_models and len(selected_models) < len(trainer.models):
@@ -1464,19 +1258,10 @@ def main():
              logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds")
              if trainer.use_enhanced_features:
                  logger.info("Enhanced features enabled via configuration")
-             if trainer.use_ensemble:
-                 logger.info("LightGBM ensemble enabled via configuration")
 
      except Exception as e:
          logger.warning(f"Failed to load configuration: {e}, using defaults")
 
-     # Log final configuration
-     logger.info("Final Training Configuration:")
-     logger.info(f"  Enhanced Features: {trainer.use_enhanced_features} (Available: {ENHANCED_FEATURES_AVAILABLE})")
-     logger.info(f"  LightGBM Ensemble: {trainer.use_ensemble} (Available: {LIGHTGBM_AVAILABLE})")
-     logger.info(f"  Models to train: {list(trainer.models.keys())}")
-     logger.info(f"  Cross-validation folds: {trainer.cv_folds}")
-
      success, message = trainer.train_model(data_path=args.data_path)
 
      if success:
@@ -1492,23 +1277,6 @@ def main():
                  print(f"  {feature_type}: {count}")
              except Exception as e:
                  logger.warning(f"Could not display feature summary: {e}")
-
-         # Print ensemble summary
-         if trainer.use_ensemble and LIGHTGBM_AVAILABLE:
-             try:
-                 ensemble_metadata_path = Path("/tmp/ensemble_metadata.json")
-                 if ensemble_metadata_path.exists():
-                     with open(ensemble_metadata_path, 'r') as f:
-                         ensemble_metadata = json.load(f)
-
-                     print(f"\n🎯 Ensemble Model Summary:")
-                     print(f"Ensemble type: {ensemble_metadata.get('ensemble_type', 'unknown')}")
-                     print(f"Component models: {', '.join(ensemble_metadata.get('component_models', []))}")
-                 else:
-                     print(f"\n🎯 Individual Model Selected (Ensemble not used)")
-             except Exception as e:
-                 logger.warning(f"Could not display ensemble summary: {e}")
-
      else:
          print(f"❌ {message}")
          exit(1)
 
+ # Enhanced version with comprehensive cross-validation and advanced feature engineering
 
  import seaborn as sns
  import matplotlib.pyplot as plt
      train_test_split, cross_val_score, GridSearchCV,
      StratifiedKFold, validation_curve, cross_validate
  )
+ from sklearn.ensemble import RandomForestClassifier
  from sklearn.linear_model import LogisticRegression
  from sklearn.feature_extraction.text import TfidfVectorizer
  import pandas as pd
  from typing import Dict, Tuple, Optional, Any, List
  import warnings
  import re
  warnings.filterwarnings('ignore')
 
  # Import enhanced feature engineering components
  logger = logging.getLogger(__name__)
 
 
+ def preprocess_text_function(texts):
+     """
+     Standalone function for text preprocessing - pickle-safe
+     """
+     def clean_single_text(text):
+         # Convert to string
+         text = str(text)
+
+         # Remove URLs
+         text = re.sub(r'http\S+|www\S+|https\S+', '', text)
+
+         # Remove email addresses
+         text = re.sub(r'\S+@\S+', '', text)
+
+         # Remove excessive punctuation
+         text = re.sub(r'[!]{2,}', '!', text)
+         text = re.sub(r'[?]{2,}', '?', text)
+         text = re.sub(r'[.]{3,}', '...', text)
+
+         # Remove non-alphabetic characters except spaces and basic punctuation
+         text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
+
+         # Remove excessive whitespace
+         text = re.sub(r'\s+', ' ', text)
+
+         return text.strip().lower()
+
+     # Process all texts
+     processed = []
+     for text in texts:
+         processed.append(clean_single_text(text))
+
+     return processed
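A quick illustration of what the restored preprocess_text_function produces; the sample strings are made up:

texts = [
    "BREAKING!!! Visit http://example.com now???",
    "Contact me at someone@example.com... 100% true!",
]
print(preprocess_text_function(texts))
# ['breaking! visit now?', 'contact me at true!']
# URLs and emails are stripped (the email regex also eats the trailing dots),
# repeated punctuation collapses, digits and '%' drop out, text is lowercased.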
 
+ class ProgressTracker:
+     """Progress tracking with time estimation"""
+
+     def __init__(self, total_steps: int, description: str = "Training"):
+         self.total_steps = total_steps
+         self.current_step = 0
+         self.start_time = time.time()
+         self.description = description
+         self.step_times = []
+
+     def update(self, step_name: str = ""):
+         """Update progress and print status"""
+         self.current_step += 1
+         current_time = time.time()
+         elapsed = current_time - self.start_time
+
+         # Calculate progress percentage
+         progress_pct = (self.current_step / self.total_steps) * 100
+
+         # Estimate remaining time
+         if self.current_step > 0:
+             avg_time_per_step = elapsed / self.current_step
+             remaining_steps = self.total_steps - self.current_step
+             eta_seconds = avg_time_per_step * remaining_steps
+             eta = timedelta(seconds=int(eta_seconds))
          else:
+             eta = "calculating..."
+
+         # Create progress bar
+         bar_length = 30
+         filled_length = int(bar_length * self.current_step // self.total_steps)
+         bar = '█' * filled_length + '▒' * (bar_length - filled_length)
+
+         # Print progress (this will be visible in Streamlit logs)
+         status_msg = f"\r{self.description}: [{bar}] {progress_pct:.1f}% | Step {self.current_step}/{self.total_steps}"
+         if step_name:
+             status_msg += f" | {step_name}"
+         if eta != "calculating...":
+             status_msg += f" | ETA: {eta}"
+
+         print(status_msg, end='', flush=True)
+
+         # Also output JSON for Streamlit parsing (if needed)
+         progress_json = {
+             "type": "progress",
+             "step": self.current_step,
+             "total": self.total_steps,
+             "percentage": progress_pct,
+             "eta": str(eta) if eta != "calculating..." else None,
+             "step_name": step_name,
+             "elapsed": elapsed
          }
+         print(f"\nPROGRESS_JSON: {json.dumps(progress_json)}")
 
+         # Store step time for better estimation
+         if len(self.step_times) >= 3:  # Keep last 3 step times for moving average
+             self.step_times.pop(0)
+         self.step_times.append(current_time - (self.start_time + sum(self.step_times)))
 
+     def finish(self):
+         """Complete progress tracking"""
+         total_time = time.time() - self.start_time
+         print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
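A short usage sketch for the restored ProgressTracker; the step names are illustrative:

import time

tracker = ProgressTracker(total_steps=3, description="Demo training")
for step in ["Splitting data", "Training model", "Saving artifacts"]:
    time.sleep(0.1)  # stand-in for real work
    tracker.update(step)  # prints the bar plus a PROGRESS_JSON line
tracker.finish()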
+
+
+ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5,
+                            use_enhanced_features: bool = False) -> Dict:
+     """Estimate training time based on dataset characteristics and feature complexity"""
+
+     # Base time estimates (in seconds) based on empirical testing
+     base_times = {
+         'preprocessing': max(0.1, dataset_size * 0.001),  # ~1ms per sample
+         'vectorization': max(0.5, dataset_size * 0.01),  # ~10ms per sample
+         'feature_selection': max(0.2, dataset_size * 0.005),  # ~5ms per sample
+         'simple_training': max(1.0, dataset_size * 0.02),  # ~20ms per sample
+         'evaluation': max(0.5, dataset_size * 0.01),  # ~10ms per sample
+     }
+
+     # Enhanced feature engineering time multipliers
+     if use_enhanced_features:
+         base_times['preprocessing'] *= 2.5  # More complex preprocessing
+         base_times['vectorization'] *= 1.5  # Additional feature extraction
+         base_times['feature_selection'] *= 2.0  # More features to select from
+         base_times['enhanced_feature_extraction'] = max(2.0, dataset_size * 0.05)  # New step
+
+     # Hyperparameter tuning multipliers
+     tuning_multipliers = {
+         'logistic_regression': 8 if enable_tuning else 1,  # 8 param combinations
+         'random_forest': 12 if enable_tuning else 1,  # 12 param combinations
+     }
+
+     # Cross-validation multiplier
+     cv_multiplier = cv_folds if dataset_size > 100 else 1
+
+     # Calculate estimates
+     estimates = {}
+
+     # Preprocessing steps
+     estimates['data_loading'] = 0.5
+     estimates['preprocessing'] = base_times['preprocessing']
+     estimates['vectorization'] = base_times['vectorization']
+
+     if use_enhanced_features:
+         estimates['enhanced_feature_extraction'] = base_times['enhanced_feature_extraction']
+
+     estimates['feature_selection'] = base_times['feature_selection']
+
+     # Model training (now includes CV)
+     for model_name, multiplier in tuning_multipliers.items():
+         model_time = base_times['simple_training'] * multiplier * cv_multiplier
+         estimates[f'{model_name}_training'] = model_time
+         estimates[f'{model_name}_evaluation'] = base_times['evaluation']
+
+     # Cross-validation overhead
+     estimates['cross_validation'] = base_times['simple_training'] * cv_folds * 0.5
+
+     # Model saving
+     estimates['model_saving'] = 1.0
+
+     # Total estimate
+     total_estimate = sum(estimates.values())
+
+     # Add buffer for overhead (more for enhanced features)
+     buffer_multiplier = 1.4 if use_enhanced_features else 1.2
+     total_estimate *= buffer_multiplier
+
+     return {
+         'detailed_estimates': estimates,
+         'total_seconds': total_estimate,
+         'total_formatted': str(timedelta(seconds=int(total_estimate))),
+         'dataset_size': dataset_size,
+         'enable_tuning': enable_tuning,
+         'cv_folds': cv_folds,
+         'use_enhanced_features': use_enhanced_features
+     }
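An illustrative call to the restored estimator; with these inputs, the constants above sum to 2087.5 s of detailed estimates, and the 1.2x buffer yields 2505 s:

estimate = estimate_training_time(
    dataset_size=1000,
    enable_tuning=True,
    cv_folds=5,
    use_enhanced_features=False,
)
print(estimate['total_formatted'])  # 0:41:45
print(sorted(estimate['detailed_estimates']))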
+
234
+
235
+ class CrossValidationManager:
236
+ """Advanced cross-validation management with comprehensive metrics"""
237
+
238
+ def __init__(self, cv_folds: int = 5, random_state: int = 42):
239
+ self.cv_folds = cv_folds
240
+ self.random_state = random_state
241
+ self.cv_results = {}
242
 
243
+ def create_cv_strategy(self, X, y) -> StratifiedKFold:
244
+ """Create appropriate CV strategy based on data characteristics"""
245
+ # Calculate appropriate CV folds for small datasets
246
+ n_samples = len(X)
247
+ min_samples_per_fold = 3 # Minimum samples per fold
248
+ max_folds = n_samples // min_samples_per_fold
 
 
 
 
249
 
250
+ # Adjust folds based on data size and class distribution
251
+ unique_classes = np.unique(y)
252
+ min_class_count = min([np.sum(y == cls) for cls in unique_classes])
253
 
254
+ # Ensure each fold has at least one sample from each class
255
+ max_folds_by_class = min_class_count
 
 
 
256
 
257
+ actual_folds = max(2, min(self.cv_folds, max_folds, max_folds_by_class))
 
 
 
 
 
 
 
258
 
259
+ logger.info(f"Using {actual_folds} CV folds (requested: {self.cv_folds})")
260
 
261
+ return StratifiedKFold(
262
+ n_splits=actual_folds,
263
+ shuffle=True,
264
+ random_state=self.random_state
265
+ )
266
+
267
+ def perform_cross_validation(self, pipeline, X, y, cv_strategy=None) -> Dict:
268
+ """Perform comprehensive cross-validation with multiple metrics"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
+ if cv_strategy is None:
271
+ cv_strategy = self.create_cv_strategy(X, y)
 
 
272
 
273
+ logger.info(f"Starting cross-validation with {cv_strategy.n_splits} folds...")
274
 
275
+ # Define scoring metrics
276
+ scoring_metrics = {
277
+ 'accuracy': 'accuracy',
278
+ 'precision': 'precision_weighted',
279
+ 'recall': 'recall_weighted',
280
+ 'f1': 'f1_weighted',
281
+ 'roc_auc': 'roc_auc'
282
+ }
283
 
284
+ try:
285
+ # Perform cross-validation
286
+ cv_scores = cross_validate(
287
+ pipeline, X, y,
288
+ cv=cv_strategy,
289
+ scoring=scoring_metrics,
290
+ return_train_score=True,
291
+ n_jobs=1, # Use single job for stability
292
+ verbose=0
293
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
+ # Process results
296
+ cv_results = {
297
+ 'n_splits': cv_strategy.n_splits,
298
+ 'test_scores': {},
299
+ 'train_scores': {},
300
+ 'fold_results': []
301
+ }
302
 
303
+ # Calculate statistics for each metric
304
+ for metric_name in scoring_metrics.keys():
305
+ test_key = f'test_{metric_name}'
306
+ train_key = f'train_{metric_name}'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
 
308
+ if test_key in cv_scores:
309
+ test_scores = cv_scores[test_key]
310
+ cv_results['test_scores'][metric_name] = {
311
+ 'mean': float(np.mean(test_scores)),
312
+ 'std': float(np.std(test_scores)),
313
+ 'min': float(np.min(test_scores)),
314
+ 'max': float(np.max(test_scores)),
315
+ 'scores': test_scores.tolist()
316
+ }
317
 
318
+ if train_key in cv_scores:
319
+ train_scores = cv_scores[train_key]
320
+ cv_results['train_scores'][metric_name] = {
321
+ 'mean': float(np.mean(train_scores)),
322
+ 'std': float(np.std(train_scores)),
323
+ 'min': float(np.min(train_scores)),
324
+ 'max': float(np.max(train_scores)),
325
+ 'scores': train_scores.tolist()
326
+ }
327
+
328
+ # Store individual fold results
329
+ for fold_idx in range(cv_strategy.n_splits):
330
+ fold_result = {
331
+ 'fold': fold_idx + 1,
332
+ 'test_scores': {},
333
+ 'train_scores': {}
334
  }
335
 
336
+ for metric_name in scoring_metrics.keys():
337
+ test_key = f'test_{metric_name}'
338
+ train_key = f'train_{metric_name}'
 
 
 
 
 
 
 
 
 
 
 
 
 
339
 
340
+ if test_key in cv_scores:
341
+ fold_result['test_scores'][metric_name] = float(cv_scores[test_key][fold_idx])
342
+ if train_key in cv_scores:
343
+ fold_result['train_scores'][metric_name] = float(cv_scores[train_key][fold_idx])
344
+
345
+ cv_results['fold_results'].append(fold_result)
 
 
 
 
 
 
 
 
 
 
346
 
347
+ # Calculate overfitting indicators
348
+ if 'accuracy' in cv_results['test_scores'] and 'accuracy' in cv_results['train_scores']:
349
+ train_mean = cv_results['train_scores']['accuracy']['mean']
350
+ test_mean = cv_results['test_scores']['accuracy']['mean']
351
+ cv_results['overfitting_score'] = float(train_mean - test_mean)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
+ # Calculate stability metrics
354
+ if 'accuracy' in cv_results['test_scores']:
355
+ test_std = cv_results['test_scores']['accuracy']['std']
356
+ test_mean = cv_results['test_scores']['accuracy']['mean']
357
+ cv_results['stability_score'] = float(1 - (test_std / test_mean)) if test_mean > 0 else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
 
359
+ logger.info(f"Cross-validation completed successfully")
360
+ logger.info(f"Mean test accuracy: {cv_results['test_scores'].get('accuracy', {}).get('mean', 'N/A'):.4f}")
361
+ logger.info(f"Mean test F1: {cv_results['test_scores'].get('f1', {}).get('mean', 'N/A'):.4f}")
362
+
363
+ return cv_results
364
+
365
+ except Exception as e:
366
+ logger.error(f"Cross-validation failed: {e}")
367
+ return {
368
+ 'error': str(e),
369
+ 'n_splits': cv_strategy.n_splits if cv_strategy else self.cv_folds,
370
+ 'fallback': True
371
+ }
372
+
373
+ def compare_cv_results(self, results1: Dict, results2: Dict, metric: str = 'f1') -> Dict:
374
+ """Compare cross-validation results between two models"""
375
 
 
 
 
 
 
376
  try:
377
+ if 'error' in results1 or 'error' in results2:
378
+ return {'error': 'Cannot compare results with errors'}
 
 
 
 
 
 
 
 
379
 
380
+ scores1 = results1['test_scores'][metric]['scores']
381
+ scores2 = results2['test_scores'][metric]['scores']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
 
383
+ # Paired t-test
384
+ from scipy import stats
385
+ t_stat, p_value = stats.ttest_rel(scores1, scores2)
 
 
 
 
 
 
 
 
 
 
 
386
 
387
+            comparison = {
+                'metric': metric,
+                'model1_mean': results1['test_scores'][metric]['mean'],
+                'model2_mean': results2['test_scores'][metric]['mean'],
+                'model1_std': results1['test_scores'][metric]['std'],
+                'model2_std': results2['test_scores'][metric]['std'],
+                'difference': results2['test_scores'][metric]['mean'] - results1['test_scores'][metric]['mean'],
+                'paired_ttest': {
+                    't_statistic': float(t_stat),
+                    'p_value': float(p_value),
+                    'significant': p_value < 0.05
+                },
+                'effect_size': float(abs(t_stat) / np.sqrt(len(scores1))) if len(scores1) > 0 else 0
+            }
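+            # 'effect_size' here is Cohen's d for paired samples (|t| / sqrt(n)):
+            # the mean per-fold difference in standard-deviation units.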
 
+            return comparison
+
+        except Exception as e:
+            logger.error(f"CV comparison failed: {e}")
+            return {'error': str(e)}
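+    # Hypothetical usage sketch: compare two models' CV runs from this manager,
+    # e.g. cmp = cv_manager.compare_cv_results(lr_cv, rf_cv, metric='f1'), then
+    # check cmp['paired_ttest']['significant'] before preferring either model.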
+
+
+class EnhancedModelTrainer:
+    """Production-ready model trainer with enhanced feature engineering and comprehensive CV"""
+
+    def __init__(self, use_enhanced_features: bool = None):
+        # Auto-detect enhanced features if not specified
+        if use_enhanced_features is None:
+            self.use_enhanced_features = ENHANCED_FEATURES_AVAILABLE
+        else:
+            self.use_enhanced_features = use_enhanced_features and ENHANCED_FEATURES_AVAILABLE
+
+        self.setup_paths()
+        self.setup_training_config()
+        self.setup_models()
+        self.progress_tracker = None
+        self.cv_manager = CrossValidationManager()
+
+        # Enhanced feature tracking
+        self.feature_engineer = None
+        self.feature_importance_results = {}
 
+    def setup_paths(self):
+        """Setup all necessary paths with proper permissions"""
+        self.base_dir = Path("/tmp")
+        self.data_dir = self.base_dir / "data"
+        self.model_dir = self.base_dir / "model"
+        self.results_dir = self.base_dir / "results"
+        self.features_dir = self.base_dir / "features"  # New for enhanced features
+
+        # Create directories with proper permissions
+        for dir_path in [self.data_dir, self.model_dir, self.results_dir, self.features_dir]:
+            dir_path.mkdir(parents=True, exist_ok=True)
+            # Ensure write permissions (catch OSError explicitly rather than a
+            # bare except, which would also swallow KeyboardInterrupt)
+            try:
+                dir_path.chmod(0o755)
+            except OSError:
+                pass
+
+        # File paths
+        self.data_path = self.data_dir / "combined_dataset.csv"
+        self.model_path = Path("/tmp/model.pkl")
+        self.vectorizer_path = Path("/tmp/vectorizer.pkl")
+        self.pipeline_path = Path("/tmp/pipeline.pkl")
+        self.metadata_path = Path("/tmp/metadata.json")
+        self.evaluation_path = self.results_dir / "evaluation_results.json"
+
+        # Enhanced feature paths
+        self.feature_engineer_path = Path("/tmp/feature_engineer.pkl")
+        self.feature_importance_path = self.results_dir / "feature_importance.json"
+
+    def setup_training_config(self):
+        """Setup training configuration with enhanced feature parameters"""
+        self.test_size = 0.2
+        self.validation_size = 0.1
+        self.random_state = 42
+        self.cv_folds = 5
+
+        # Enhanced feature configuration
+        if self.use_enhanced_features:
+            self.max_features = 7500  # Increased for enhanced features
+            self.feature_selection_k = 3000  # More features to select from
+            logger.info("Using enhanced feature engineering pipeline")
+        else:
+            self.max_features = 5000  # Standard TF-IDF
+            self.feature_selection_k = 2000
+            logger.info("Using standard TF-IDF feature pipeline")
+
+        # Common parameters
+        self.min_df = 1
+        self.max_df = 0.95
+        self.ngram_range = (1, 2)
+        self.max_iter = 500
+        self.class_weight = 'balanced'
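+        # min_df=1 keeps terms appearing in at least one document, max_df=0.95
+        # drops near-ubiquitous terms, and ngram_range=(1, 2) adds bigrams on
+        # top of unigrams in the TF-IDF vocabulary.
+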
+    def setup_models(self):
+        """Setup model configurations for comparison"""
+        self.models = {
+            'logistic_regression': {
+                'model': LogisticRegression(
+                    max_iter=self.max_iter,
+                    class_weight=self.class_weight,
+                    random_state=self.random_state,
+                    n_jobs=-1
+                ),
+                'param_grid': {
+                    'model__C': [0.1, 1, 10],
+                    'model__penalty': ['l2']
+                }
+            },
+            'random_forest': {
+                'model': RandomForestClassifier(
+                    n_estimators=50,
+                    class_weight=self.class_weight,
+                    random_state=self.random_state,
+                    n_jobs=-1
+                ),
+                'param_grid': {
+                    'model__n_estimators': [50, 100],
+                    'model__max_depth': [10, None]
+                }
+            }
+        }
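+        # The 'model__' prefix in each param_grid addresses the pipeline step
+        # named 'model': this is how GridSearchCV reaches parameters of an
+        # estimator nested inside a sklearn Pipeline.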
 
     def load_and_validate_data(self) -> Tuple[bool, Optional[pd.DataFrame], str]:
         """Load and validate training data"""
         try:
@@ ... @@
         return results
 
+    def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
+        """Select the best performing model based on CV results"""
+
+        if self.progress_tracker:
+            self.progress_tracker.update("Selecting best model")
+
+        best_model_name = None
+        best_model = None
+        best_score = -1
+        best_score_type = None  # records how the winning score was obtained
+        best_metrics = None
+
+        for model_name, result in results.items():
+            if 'error' in result:
+                continue
+
+            # Prioritize CV F1 score if available, fallback to test F1
+            cv_results = result['evaluation_metrics'].get('cross_validation', {})
+            if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
+                f1_score = cv_results['test_scores']['f1']['mean']
+                score_type = "CV F1"
+            else:
+                f1_score = result['evaluation_metrics']['f1']
+                score_type = "Test F1"
+
+            if f1_score > best_score:
+                best_score = f1_score
+                best_score_type = score_type
+                best_model_name = model_name
+                best_model = result['model']
+                best_metrics = result['evaluation_metrics']
+
+        if best_model_name is None:
+            raise ValueError("No models trained successfully")
+
+        logger.info(f"Best model: {best_model_name} with {best_score_type} score: {best_score:.4f}")
+        return best_model_name, best_model, best_metrics
+
+    def save_model_artifacts(self, model, model_name: str, metrics: Dict, results: Dict) -> bool:
+        """Save model artifacts and enhanced metadata with feature engineering results"""
         try:
+            if self.progress_tracker:
+                self.progress_tracker.update("Saving model")
+
+            # Save the full pipeline with error handling
+            try:
+                joblib.dump(model, self.pipeline_path)
+                logger.info(f"✅ Saved pipeline to {self.pipeline_path}")
+            except Exception as e:
+                logger.error(f"Failed to save pipeline: {e}")
+                # Try alternative path
+                alt_pipeline_path = Path("/tmp") / "pipeline.pkl"
+                joblib.dump(model, alt_pipeline_path)
+                logger.info(f"✅ Saved pipeline to {alt_pipeline_path}")
+
+            # Save enhanced feature engineer if available
+            if self.use_enhanced_features and self.feature_engineer is not None:
+                try:
+                    self.feature_engineer.save_pipeline(self.feature_engineer_path)
+                    logger.info(f"✅ Saved feature engineer to {self.feature_engineer_path}")
+                except Exception as e:
+                    logger.warning(f"Could not save feature engineer: {e}")
+
+            # Save individual components for backward compatibility
+            try:
+                if hasattr(model, 'named_steps'):
+                    if 'model' in model.named_steps:
+                        joblib.dump(model.named_steps['model'], self.model_path)
+                        logger.info(f"✅ Saved model component to {self.model_path}")
+
+                    # Save vectorizer (standard pipeline) or enhanced features reference
+                    if 'vectorize' in model.named_steps:
+                        joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
+                        logger.info(f"✅ Saved vectorizer to {self.vectorizer_path}")
+                    elif 'enhanced_features' in model.named_steps:
+                        # Save reference to enhanced features
+                        enhanced_ref = {
+                            'type': 'enhanced_features',
+                            'feature_engineer_path': str(self.feature_engineer_path),
+                            'metadata': self.feature_engineer.get_feature_metadata() if self.feature_engineer else {}
+                        }
+                        joblib.dump(enhanced_ref, self.vectorizer_path)
+                        logger.info(f"✅ Saved enhanced features reference to {self.vectorizer_path}")
+
+            except Exception as e:
+                logger.warning(f"Could not save individual components: {e}")
+
+            # Version token derived from the current timestamp (not a content hash of the data)
+            data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
+
+            # Extract CV results
+            cv_results = metrics.get('cross_validation', {})
+
+            # Create enhanced metadata with feature engineering information
+            metadata = {
+                'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+                'model_type': model_name,
+                'feature_engineering': {
+                    'type': 'enhanced' if self.use_enhanced_features else 'standard',
+                    'enhanced_features_available': ENHANCED_FEATURES_AVAILABLE,
+                    'enhanced_features_used': self.use_enhanced_features
+                },
+                'data_version': data_hash,
+                'test_accuracy': metrics['accuracy'],
+                'test_f1': metrics['f1'],
+                'test_precision': metrics['precision'],
+                'test_recall': metrics['recall'],
+                'test_roc_auc': metrics['roc_auc'],
+                'overfitting_score': metrics.get('overfitting_score', 'Unknown'),
+                'timestamp': datetime.now().isoformat(),
+                'training_config': {
+                    'test_size': self.test_size,
+                    'cv_folds': self.cv_folds,
+                    'max_features': self.max_features,
+                    'ngram_range': self.ngram_range,
+                    'feature_selection_k': self.feature_selection_k,
+                    'use_enhanced_features': self.use_enhanced_features
+                }
             }
 
+            # Add enhanced feature metadata
+            if self.use_enhanced_features:
+                feature_metadata = metrics.get('feature_metadata', {})
+                if feature_metadata:
+                    metadata['enhanced_features'] = {
+                        'total_features': feature_metadata.get('total_features', 0),
+                        'feature_types': feature_metadata.get('feature_types', {}),
+                        'configuration': feature_metadata.get('configuration', {})
                     }
 
+                # Add top features if available
+                top_features = metrics.get('top_features', {})
+                if top_features:
+                    metadata['top_features'] = dict(list(top_features.items())[:10])  # Top 10 features
+
+                # Save detailed feature importance
+                try:
+                    feature_analysis = {
+                        'top_features': top_features,
+                        'feature_metadata': feature_metadata,
+                        'timestamp': datetime.now().isoformat(),
+                        'model_version': metadata['model_version']
+                    }
+
+                    with open(self.feature_importance_path, 'w') as f:
+                        json.dump(feature_analysis, f, indent=2)
+                    logger.info(f"✅ Saved feature importance analysis to {self.feature_importance_path}")
+
+                except Exception as e:
+                    logger.warning(f"Could not save feature importance: {e}")
+
+            # Add comprehensive CV results to metadata
+            if cv_results and 'test_scores' in cv_results:
+                metadata['cross_validation'] = {
+                    'n_splits': cv_results.get('n_splits', self.cv_folds),
+                    'test_scores': cv_results['test_scores'],
+                    'train_scores': cv_results.get('train_scores', {}),
+                    'overfitting_score': cv_results.get('overfitting_score', 'Unknown'),
+                    'stability_score': cv_results.get('stability_score', 'Unknown'),
+                    'individual_fold_results': cv_results.get('fold_results', [])
                 }
 
+                # Add summary statistics
+                if 'f1' in cv_results['test_scores']:
+                    metadata['cv_f1_mean'] = cv_results['test_scores']['f1']['mean']
+                    metadata['cv_f1_std'] = cv_results['test_scores']['f1']['std']
+                    metadata['cv_f1_min'] = cv_results['test_scores']['f1']['min']
+                    metadata['cv_f1_max'] = cv_results['test_scores']['f1']['max']
+
+                if 'accuracy' in cv_results['test_scores']:
+                    metadata['cv_accuracy_mean'] = cv_results['test_scores']['accuracy']['mean']
+                    metadata['cv_accuracy_std'] = cv_results['test_scores']['accuracy']['std']
+
+            # Add model comparison results if available
+            if len(results) > 1:
+                model_comparison = {}
+                for other_model_name, other_result in results.items():
+                    if other_model_name != model_name and 'error' not in other_result:
+                        other_cv = other_result['evaluation_metrics'].get('cross_validation', {})
+                        if cv_results and other_cv:
+                            comparison = self.cv_manager.compare_cv_results(cv_results, other_cv)
+                            model_comparison[other_model_name] = comparison
+
+                if model_comparison:
+                    metadata['model_comparison'] = model_comparison
+
+            # Save metadata with error handling
+            try:
+                with open(self.metadata_path, 'w') as f:
+                    json.dump(metadata, f, indent=2)
+                logger.info(f"✅ Saved enhanced metadata to {self.metadata_path}")
+            except Exception as e:
+                logger.warning(f"Could not save metadata: {e}")
+
+            # Log feature engineering summary
+            if self.use_enhanced_features and feature_metadata:
+                logger.info("✅ Enhanced features summary:")
+                logger.info(f"   Total features: {feature_metadata.get('total_features', 0)}")
+                for feature_type, count in feature_metadata.get('feature_types', {}).items():
+                    logger.info(f"   {feature_type}: {count}")
+
+            logger.info(f"✅ Model artifacts saved successfully with {'enhanced' if self.use_enhanced_features else 'standard'} features")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to save model artifacts: {str(e)}")
+            # Try to save at least the core pipeline
+            try:
+                joblib.dump(model, Path("/tmp/pipeline_backup.pkl"))
+                logger.info("✅ Saved backup pipeline")
+                return True
+            except Exception as e2:
+                logger.error(f"Failed to save backup pipeline: {str(e2)}")
+                return False
+
+    def train_model(self, data_path: str = None, force_enhanced: bool = None) -> Tuple[bool, str]:
+        """Main training function with enhanced feature engineering pipeline"""
+        try:
+            # Override enhanced features setting if specified
+            if force_enhanced is not None:
+                original_setting = self.use_enhanced_features
+                self.use_enhanced_features = force_enhanced and ENHANCED_FEATURES_AVAILABLE
+                if force_enhanced and not ENHANCED_FEATURES_AVAILABLE:
+                    logger.warning("Enhanced features requested but not available, using standard features")
+
+            feature_type = "enhanced" if self.use_enhanced_features else "standard"
+            logger.info(f"Starting {feature_type} model training with cross-validation...")
+
+            # Override data path if provided
+            if data_path:
+                self.data_path = Path(data_path)
+
+            # Load and validate data
+            success, df, message = self.load_and_validate_data()
+            if not success:
+                return False, message
+
+            # Estimate training time and setup progress tracker
+            time_estimate = estimate_training_time(
+                len(df),
+                enable_tuning=True,
+                cv_folds=self.cv_folds,
+                use_enhanced_features=self.use_enhanced_features
+            )
+
+            print("\n📊 Enhanced Training Configuration:")
+            print(f"Dataset size: {len(df)} samples")
+            print(f"Feature engineering: {feature_type.title()}")
+            print(f"Cross-validation folds: {self.cv_folds}")
+            print(f"Estimated time: {time_estimate['total_formatted']}")
+            print(f"Models to train: {len(self.models)}")
+            print("Hyperparameter tuning: Enabled")
+            if self.use_enhanced_features:
+                print("Enhanced features: Sentiment, Readability, Entities, Linguistic")
+            print()
+
+            # Setup progress tracker (adjusted for enhanced features)
+            base_steps = 4 + (len(self.models) * 3) + 1  # Basic steps
+            enhanced_steps = 2 if self.use_enhanced_features else 0  # Feature engineering steps
+            total_steps = base_steps + enhanced_steps
+            self.progress_tracker = ProgressTracker(total_steps, f"{feature_type.title()} Training Progress")
+
+            # Prepare data
+            X = df['text'].values
+            y = df['label'].values
+
+            # Train-test split with smart handling for small datasets
+            self.progress_tracker.update("Splitting data")
+
+            # Ensure minimum test size for very small datasets
+            if len(X) < 10:
+                test_size = max(0.1, 1/len(X))  # At least 1 sample for test
+            else:
+                test_size = self.test_size
+
+            # Check if stratification is possible
+            label_counts = pd.Series(y).value_counts()
+            min_class_count = label_counts.min()
+            can_stratify = min_class_count >= 2 and len(y) >= 4
+
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y,
+                test_size=test_size,
+                stratify=y if can_stratify else None,
+                random_state=self.random_state
+            )
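+            # Stratified splitting needs at least two samples per class so each
+            # side of the split can receive one; with fewer, the call above
+            # falls back to a plain random split.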
+ logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test")
1148
+
1149
+ # Additional validation for very small datasets
1150
+ if len(X_train) < 3:
1151
+ logger.warning(f"Very small training set: {len(X_train)} samples. CV results may be unreliable.")
1152
+ if len(X_test) < 1:
1153
+ return False, "Cannot create test set. Dataset too small."
1154
 
1155
+            # Train and evaluate models with enhanced features
+            results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
+
+            # Select best model
+            best_model_name, best_model, best_metrics = self.select_best_model(results)
+
+            # Save model artifacts with enhanced feature information
+            if not self.save_model_artifacts(best_model, best_model_name, best_metrics, results):
+                return False, "Failed to save model artifacts"
+
+            # Finish progress tracking
+            self.progress_tracker.finish()
+
+            # Create success message with enhanced feature information
+            cv_results = best_metrics.get('cross_validation', {})
+            cv_info = ""
+            if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
+                cv_f1_mean = cv_results['test_scores']['f1']['mean']
+                cv_f1_std = cv_results['test_scores']['f1']['std']
+                cv_info = f", CV F1: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})"
+
+            # Enhanced features summary
+            feature_info = ""
+            if self.use_enhanced_features:
+                feature_metadata = best_metrics.get('feature_metadata', {})
+                if feature_metadata:
+                    total_features = feature_metadata.get('total_features', 0)
+                    feature_info = f", Enhanced Features: {total_features}"
+
+            success_message = (
+                f"{feature_type.title()} model training completed successfully. "
+                f"Best model: {best_model_name} "
+                f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info}{feature_info})"
+            )
+
+            logger.info(success_message)
+            return True, success_message
+
+        except Exception as e:
+            if self.progress_tracker:
+                print()  # New line after progress bar
+            error_message = f"Enhanced model training failed: {str(e)}"
+            logger.error(error_message)
+            return False, error_message
 
 
 def main():
+    """Main execution function with enhanced feature engineering support"""
     import argparse
 
     # Parse command line arguments
+    parser = argparse.ArgumentParser(description='Train fake news detection model with enhanced features')
     parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
     parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
     parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
     parser.add_argument('--enhanced_features', action='store_true', help='Force use of enhanced features')
     parser.add_argument('--standard_features', action='store_true', help='Force use of standard TF-IDF features only')
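+    # Example invocations (flags as defined above):
+    #   python model/train.py --data_path /tmp/data/combined_dataset.csv
+    #   python model/train.py --enhanced_features --cv_folds 10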
 
     args = parser.parse_args()
 
     # Determine feature engineering mode
@@ ... @@
         use_enhanced = False
         logger.info("Standard features explicitly requested")
 
+    trainer = EnhancedModelTrainer(use_enhanced_features=use_enhanced)
 
     # Apply CV folds from command line
     if args.cv_folds:
@@ ... @@
             if 'enhanced_features' in config and use_enhanced is None:
                 trainer.use_enhanced_features = config['enhanced_features'] and ENHANCED_FEATURES_AVAILABLE
 
             # Filter models if specified
             selected_models = config.get('selected_models')
             if selected_models and len(selected_models) < len(trainer.models):
@@ ... @@
             logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds")
             if trainer.use_enhanced_features:
                 logger.info("Enhanced features enabled via configuration")
 
         except Exception as e:
             logger.warning(f"Failed to load configuration: {e}, using defaults")
 
     success, message = trainer.train_model(data_path=args.data_path)
 
     if success:
@@ ... @@
  print(f" {feature_type}: {count}")
1278
  except Exception as e:
1279
  logger.warning(f"Could not display feature summary: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1280
  else:
1281
  print(f"❌ {message}")
1282
  exit(1)