Ahmedik95316 committed
Commit 63682de · 1 Parent(s): b44772d

Update model/train.py


Adding LightGBM for Ensemble Model

Files changed (1)
  1. model/train.py +949 -717
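The core of this change is routing LightGBM into the existing scikit-learn model set through a soft-voting ensemble. As orientation before reading the diff, here is a minimal, self-contained sketch of that pattern; the toy data and estimator settings below are illustrative assumptions, not taken from the commit:

import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

# Toy data standing in for the vectorized article text used by train.py
X, y = make_classification(n_samples=200, n_features=20, random_state=42)

# Soft voting averages predict_proba outputs across the base models,
# which is why every estimator must expose predict_proba
ensemble = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=500)),
        ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
        ('lgbm', lgb.LGBMClassifier(n_estimators=100, random_state=42)),
    ],
    voting='soft',
)
ensemble.fit(X, y)
print(ensemble.predict_proba(X[:3]))  # averaged class probabilities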
model/train.py CHANGED
@@ -1,4 +1,4 @@
- # Enhanced version with comprehensive cross-validation and advanced feature engineering

  import seaborn as sns
  import matplotlib.pyplot as plt
@@ -14,7 +14,7 @@ from sklearn.model_selection import (
      train_test_split, cross_val_score, GridSearchCV,
      StratifiedKFold, validation_curve, cross_validate
  )
- from sklearn.ensemble import RandomForestClassifier
  from sklearn.linear_model import LogisticRegression
  from sklearn.feature_extraction.text import TfidfVectorizer
  import pandas as pd
@@ -31,6 +31,16 @@ from datetime import datetime, timedelta
  from typing import Dict, Tuple, Optional, Any, List
  import warnings
  import re
  warnings.filterwarnings('ignore')

  # Import enhanced feature engineering components
@@ -60,454 +70,727 @@ logging.basicConfig(
  logger = logging.getLogger(__name__)


- def preprocess_text_function(texts):
-     """
-     Standalone function for text preprocessing - pickle-safe
-     """
-     def clean_single_text(text):
-         # Convert to string
-         text = str(text)
-
-         # Remove URLs
-         text = re.sub(r'http\S+|www\S+|https\S+', '', text)
-
-         # Remove email addresses
-         text = re.sub(r'\S+@\S+', '', text)
-
-         # Remove excessive punctuation
-         text = re.sub(r'[!]{2,}', '!', text)
-         text = re.sub(r'[?]{2,}', '?', text)
-         text = re.sub(r'[.]{3,}', '...', text)
-
-         # Remove non-alphabetic characters except spaces and basic punctuation
-         text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
-
-         # Remove excessive whitespace
-         text = re.sub(r'\s+', ' ', text)
-
-         return text.strip().lower()
-
-     # Process all texts
-     processed = []
-     for text in texts:
-         processed.append(clean_single_text(text))
-
-     return processed


- class ProgressTracker:
-     """Progress tracking with time estimation"""
-
-     def __init__(self, total_steps: int, description: str = "Training"):
-         self.total_steps = total_steps
-         self.current_step = 0
-         self.start_time = time.time()
-         self.description = description
-         self.step_times = []
-
-     def update(self, step_name: str = ""):
-         """Update progress and print status"""
-         self.current_step += 1
-         current_time = time.time()
-         elapsed = current_time - self.start_time
-
-         # Calculate progress percentage
-         progress_pct = (self.current_step / self.total_steps) * 100
-
-         # Estimate remaining time
-         if self.current_step > 0:
-             avg_time_per_step = elapsed / self.current_step
-             remaining_steps = self.total_steps - self.current_step
-             eta_seconds = avg_time_per_step * remaining_steps
-             eta = timedelta(seconds=int(eta_seconds))
          else:
-             eta = "calculating..."
-
-         # Create progress bar
-         bar_length = 30
-         filled_length = int(bar_length * self.current_step // self.total_steps)
-         bar = '█' * filled_length + '▒' * (bar_length - filled_length)
-
-         # Print progress (this will be visible in Streamlit logs)
-         status_msg = f"\r{self.description}: [{bar}] {progress_pct:.1f}% | Step {self.current_step}/{self.total_steps}"
-         if step_name:
-             status_msg += f" | {step_name}"
-         if eta != "calculating...":
-             status_msg += f" | ETA: {eta}"
-
-         print(status_msg, end='', flush=True)
-
-         # Also output JSON for Streamlit parsing (if needed)
-         progress_json = {
-             "type": "progress",
-             "step": self.current_step,
-             "total": self.total_steps,
-             "percentage": progress_pct,
-             "eta": str(eta) if eta != "calculating..." else None,
-             "step_name": step_name,
-             "elapsed": elapsed
          }
-         print(f"\nPROGRESS_JSON: {json.dumps(progress_json)}")
-
-         # Store step time for better estimation
-         if len(self.step_times) >= 3:  # Keep last 3 step times for moving average
-             self.step_times.pop(0)
-         self.step_times.append(current_time - (self.start_time + sum(self.step_times)))
-
-     def finish(self):
-         """Complete progress tracking"""
-         total_time = time.time() - self.start_time
-         print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
-
-
- def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5,
-                            use_enhanced_features: bool = False) -> Dict:
-     """Estimate training time based on dataset characteristics and feature complexity"""
-     # Base time estimates (in seconds) based on empirical testing
-     base_times = {
-         'preprocessing': max(0.1, dataset_size * 0.001),      # ~1ms per sample
-         'vectorization': max(0.5, dataset_size * 0.01),       # ~10ms per sample
-         'feature_selection': max(0.2, dataset_size * 0.005),  # ~5ms per sample
-         'simple_training': max(1.0, dataset_size * 0.02),     # ~20ms per sample
-         'evaluation': max(0.5, dataset_size * 0.01),          # ~10ms per sample
-     }
-
-     # Enhanced feature engineering time multipliers
-     if use_enhanced_features:
-         base_times['preprocessing'] *= 2.5      # More complex preprocessing
-         base_times['vectorization'] *= 1.5      # Additional feature extraction
-         base_times['feature_selection'] *= 2.0  # More features to select from
-         base_times['enhanced_feature_extraction'] = max(2.0, dataset_size * 0.05)  # New step
-
-     # Hyperparameter tuning multipliers
-     tuning_multipliers = {
-         'logistic_regression': 8 if enable_tuning else 1,  # 8 param combinations
-         'random_forest': 12 if enable_tuning else 1,       # 12 param combinations
-     }
-
-     # Cross-validation multiplier
-     cv_multiplier = cv_folds if dataset_size > 100 else 1
-
-     # Calculate estimates
-     estimates = {}
-
-     # Preprocessing steps
-     estimates['data_loading'] = 0.5
-     estimates['preprocessing'] = base_times['preprocessing']
-     estimates['vectorization'] = base_times['vectorization']
-
-     if use_enhanced_features:
-         estimates['enhanced_feature_extraction'] = base_times['enhanced_feature_extraction']
-
-     estimates['feature_selection'] = base_times['feature_selection']
-
-     # Model training (now includes CV)
-     for model_name, multiplier in tuning_multipliers.items():
-         model_time = base_times['simple_training'] * multiplier * cv_multiplier
-         estimates[f'{model_name}_training'] = model_time
-         estimates[f'{model_name}_evaluation'] = base_times['evaluation']
-
-     # Cross-validation overhead
-     estimates['cross_validation'] = base_times['simple_training'] * cv_folds * 0.5
-
-     # Model saving
-     estimates['model_saving'] = 1.0
-
-     # Total estimate
-     total_estimate = sum(estimates.values())
-
-     # Add buffer for overhead (more for enhanced features)
-     buffer_multiplier = 1.4 if use_enhanced_features else 1.2
-     total_estimate *= buffer_multiplier
-
-     return {
-         'detailed_estimates': estimates,
-         'total_seconds': total_estimate,
-         'total_formatted': str(timedelta(seconds=int(total_estimate))),
-         'dataset_size': dataset_size,
-         'enable_tuning': enable_tuning,
-         'cv_folds': cv_folds,
-         'use_enhanced_features': use_enhanced_features
-     }
-
- class CrossValidationManager:
-     """Advanced cross-validation management with comprehensive metrics"""
-
-     def __init__(self, cv_folds: int = 5, random_state: int = 42):
-         self.cv_folds = cv_folds
-         self.random_state = random_state
-         self.cv_results = {}
-
-     def create_cv_strategy(self, X, y) -> StratifiedKFold:
-         """Create appropriate CV strategy based on data characteristics"""
-         # Calculate appropriate CV folds for small datasets
-         n_samples = len(X)
-         min_samples_per_fold = 3  # Minimum samples per fold
-         max_folds = n_samples // min_samples_per_fold
-
-         # Adjust folds based on data size and class distribution
-         unique_classes = np.unique(y)
-         min_class_count = min([np.sum(y == cls) for cls in unique_classes])
-
-         # Ensure each fold has at least one sample from each class
-         max_folds_by_class = min_class_count
-
-         actual_folds = max(2, min(self.cv_folds, max_folds, max_folds_by_class))
-
-         logger.info(f"Using {actual_folds} CV folds (requested: {self.cv_folds})")
-
-         return StratifiedKFold(
-             n_splits=actual_folds,
-             shuffle=True,
-             random_state=self.random_state
          )
-
-     def perform_cross_validation(self, pipeline, X, y, cv_strategy=None) -> Dict:
-         """Perform comprehensive cross-validation with multiple metrics"""
-         if cv_strategy is None:
-             cv_strategy = self.create_cv_strategy(X, y)
-
-         logger.info(f"Starting cross-validation with {cv_strategy.n_splits} folds...")
-
-         # Define scoring metrics
-         scoring_metrics = {
-             'accuracy': 'accuracy',
-             'precision': 'precision_weighted',
-             'recall': 'recall_weighted',
-             'f1': 'f1_weighted',
-             'roc_auc': 'roc_auc'
-         }
          try:
-             # Perform cross-validation
-             cv_scores = cross_validate(
-                 pipeline, X, y,
-                 cv=cv_strategy,
-                 scoring=scoring_metrics,
-                 return_train_score=True,
-                 n_jobs=1,  # Use single job for stability
-                 verbose=0
              )
-             # Process results
-             cv_results = {
-                 'n_splits': cv_strategy.n_splits,
-                 'test_scores': {},
-                 'train_scores': {},
-                 'fold_results': []
              }
-             # Calculate statistics for each metric
-             for metric_name in scoring_metrics.keys():
-                 test_key = f'test_{metric_name}'
-                 train_key = f'train_{metric_name}'
-
-                 if test_key in cv_scores:
-                     test_scores = cv_scores[test_key]
-                     cv_results['test_scores'][metric_name] = {
-                         'mean': float(np.mean(test_scores)),
-                         'std': float(np.std(test_scores)),
-                         'min': float(np.min(test_scores)),
-                         'max': float(np.max(test_scores)),
-                         'scores': test_scores.tolist()
-                     }
-
-                 if train_key in cv_scores:
-                     train_scores = cv_scores[train_key]
-                     cv_results['train_scores'][metric_name] = {
-                         'mean': float(np.mean(train_scores)),
-                         'std': float(np.std(train_scores)),
-                         'min': float(np.min(train_scores)),
-                         'max': float(np.max(train_scores)),
-                         'scores': train_scores.tolist()
-                     }
-
-             # Store individual fold results
-             for fold_idx in range(cv_strategy.n_splits):
-                 fold_result = {
-                     'fold': fold_idx + 1,
-                     'test_scores': {},
-                     'train_scores': {}
-                 }
-
-                 for metric_name in scoring_metrics.keys():
-                     test_key = f'test_{metric_name}'
-                     train_key = f'train_{metric_name}'
-
-                     if test_key in cv_scores:
-                         fold_result['test_scores'][metric_name] = float(cv_scores[test_key][fold_idx])
-                     if train_key in cv_scores:
-                         fold_result['train_scores'][metric_name] = float(cv_scores[train_key][fold_idx])
-
-                 cv_results['fold_results'].append(fold_result)
-
-             # Calculate overfitting indicators
-             if 'accuracy' in cv_results['test_scores'] and 'accuracy' in cv_results['train_scores']:
-                 train_mean = cv_results['train_scores']['accuracy']['mean']
-                 test_mean = cv_results['test_scores']['accuracy']['mean']
-                 cv_results['overfitting_score'] = float(train_mean - test_mean)
-
-             # Calculate stability metrics
-             if 'accuracy' in cv_results['test_scores']:
-                 test_std = cv_results['test_scores']['accuracy']['std']
-                 test_mean = cv_results['test_scores']['accuracy']['mean']
-                 cv_results['stability_score'] = float(1 - (test_std / test_mean)) if test_mean > 0 else 0
-
-             logger.info(f"Cross-validation completed successfully")
-             logger.info(f"Mean test accuracy: {cv_results['test_scores'].get('accuracy', {}).get('mean', 'N/A'):.4f}")
-             logger.info(f"Mean test F1: {cv_results['test_scores'].get('f1', {}).get('mean', 'N/A'):.4f}")
-
-             return cv_results
          except Exception as e:
-             logger.error(f"Cross-validation failed: {e}")
-             return {
-                 'error': str(e),
-                 'n_splits': cv_strategy.n_splits if cv_strategy else self.cv_folds,
-                 'fallback': True
              }
-
-     def compare_cv_results(self, results1: Dict, results2: Dict, metric: str = 'f1') -> Dict:
-         """Compare cross-validation results between two models"""
          try:
-             if 'error' in results1 or 'error' in results2:
-                 return {'error': 'Cannot compare results with errors'}
-
-             scores1 = results1['test_scores'][metric]['scores']
-             scores2 = results2['test_scores'][metric]['scores']
-
-             # Paired t-test
-             from scipy import stats
-             t_stat, p_value = stats.ttest_rel(scores1, scores2)
-
-             comparison = {
-                 'metric': metric,
-                 'model1_mean': results1['test_scores'][metric]['mean'],
-                 'model2_mean': results2['test_scores'][metric]['mean'],
-                 'model1_std': results1['test_scores'][metric]['std'],
-                 'model2_std': results2['test_scores'][metric]['std'],
-                 'difference': results2['test_scores'][metric]['mean'] - results1['test_scores'][metric]['mean'],
-                 'paired_ttest': {
-                     't_statistic': float(t_stat),
-                     'p_value': float(p_value),
-                     'significant': p_value < 0.05
-                 },
-                 'effect_size': float(abs(t_stat) / np.sqrt(len(scores1))) if len(scores1) > 0 else 0
-             }
-
-             return comparison
-
-         except Exception as e:
-             logger.error(f"CV comparison failed: {e}")
-             return {'error': str(e)}

- class EnhancedModelTrainer:
-     """Production-ready model trainer with enhanced feature engineering and comprehensive CV"""
-
-     def __init__(self, use_enhanced_features: bool = None):
-         # Auto-detect enhanced features if not specified
-         if use_enhanced_features is None:
-             self.use_enhanced_features = ENHANCED_FEATURES_AVAILABLE
-         else:
-             self.use_enhanced_features = use_enhanced_features and ENHANCED_FEATURES_AVAILABLE
-
-         self.setup_paths()
-         self.setup_training_config()
-         self.setup_models()
-         self.progress_tracker = None
-         self.cv_manager = CrossValidationManager()
-
-         # Enhanced feature tracking
-         self.feature_engineer = None
-         self.feature_importance_results = {}
-
-     def setup_paths(self):
-         """Setup all necessary paths with proper permissions"""
-         self.base_dir = Path("/tmp")
-         self.data_dir = self.base_dir / "data"
-         self.model_dir = self.base_dir / "model"
-         self.results_dir = self.base_dir / "results"
-         self.features_dir = self.base_dir / "features"  # New for enhanced features
-
-         # Create directories with proper permissions
-         for dir_path in [self.data_dir, self.model_dir, self.results_dir, self.features_dir]:
-             dir_path.mkdir(parents=True, exist_ok=True)
-             # Ensure write permissions
-             try:
-                 dir_path.chmod(0o755)
-             except:
-                 pass
-
-         # File paths
-         self.data_path = self.data_dir / "combined_dataset.csv"
-         self.model_path = Path("/tmp/model.pkl")
-         self.vectorizer_path = Path("/tmp/vectorizer.pkl")
-         self.pipeline_path = Path("/tmp/pipeline.pkl")
-         self.metadata_path = Path("/tmp/metadata.json")
-         self.evaluation_path = self.results_dir / "evaluation_results.json"
-
-         # Enhanced feature paths
-         self.feature_engineer_path = Path("/tmp/feature_engineer.pkl")
-         self.feature_importance_path = self.results_dir / "feature_importance.json"
-
-     def setup_training_config(self):
-         """Setup training configuration with enhanced feature parameters"""
-         self.test_size = 0.2
-         self.validation_size = 0.1
-         self.random_state = 42
-         self.cv_folds = 5
-
-         # Enhanced feature configuration
-         if self.use_enhanced_features:
-             self.max_features = 7500  # Increased for enhanced features
-             self.feature_selection_k = 3000  # More features to select from
-             logger.info("Using enhanced feature engineering pipeline")
-         else:
-             self.max_features = 5000  # Standard TF-IDF
-             self.feature_selection_k = 2000
-             logger.info("Using standard TF-IDF feature pipeline")
-
-         # Common parameters
-         self.min_df = 1
-         self.max_df = 0.95
-         self.ngram_range = (1, 2)
-         self.max_iter = 500
-         self.class_weight = 'balanced'
-
-     def setup_models(self):
-         """Setup model configurations for comparison"""
-         self.models = {
-             'logistic_regression': {
-                 'model': LogisticRegression(
-                     max_iter=self.max_iter,
-                     class_weight=self.class_weight,
-                     random_state=self.random_state,
-                     n_jobs=-1
-                 ),
-                 'param_grid': {
-                     'model__C': [0.1, 1, 10],
-                     'model__penalty': ['l2']
-                 }
-             },
-             'random_forest': {
-                 'model': RandomForestClassifier(
-                     n_estimators=50,
-                     class_weight=self.class_weight,
-                     random_state=self.random_state,
-                     n_jobs=-1
-                 ),
-                 'param_grid': {
-                     'model__n_estimators': [50, 100],
-                     'model__max_depth': [10, None]
-                 }
-             }
-         }

      def load_and_validate_data(self) -> Tuple[bool, Optional[pd.DataFrame], str]:
          """Load and validate training data"""
          try:
@@ -859,356 +1142,260 @@ class EnhancedModelTrainer:

          return results

-     def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
-         """Select the best performing model based on CV results"""
-         if self.progress_tracker:
-             self.progress_tracker.update("Selecting best model")
-
-         best_model_name = None
-         best_model = None
-         best_score = -1
-         best_metrics = None
-
-         for model_name, result in results.items():
-             if 'error' in result:
-                 continue
-
-             # Prioritize CV F1 score if available, fallback to test F1
-             cv_results = result['evaluation_metrics'].get('cross_validation', {})
-             if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
-                 f1_score = cv_results['test_scores']['f1']['mean']
-                 score_type = "CV F1"
-             else:
-                 f1_score = result['evaluation_metrics']['f1']
-                 score_type = "Test F1"
-
-             if f1_score > best_score:
-                 best_score = f1_score
-                 best_model_name = model_name
-                 best_model = result['model']
-                 best_metrics = result['evaluation_metrics']
-
-         if best_model_name is None:
-             raise ValueError("No models trained successfully")
-
-         logger.info(f"Best model: {best_model_name} with {score_type} score: {best_score:.4f}")
-         return best_model_name, best_model, best_metrics

-     def save_model_artifacts(self, model, model_name: str, metrics: Dict, results: Dict) -> bool:
-         """Save model artifacts and enhanced metadata with feature engineering results"""
          try:
-             if self.progress_tracker:
-                 self.progress_tracker.update("Saving model")
-
-             # Save the full pipeline with error handling
-             try:
-                 joblib.dump(model, self.pipeline_path)
-                 logger.info(f"✅ Saved pipeline to {self.pipeline_path}")
-             except Exception as e:
-                 logger.error(f"Failed to save pipeline: {e}")
-                 # Try alternative path
-                 alt_pipeline_path = Path("/tmp") / "pipeline.pkl"
-                 joblib.dump(model, alt_pipeline_path)
-                 logger.info(f"✅ Saved pipeline to {alt_pipeline_path}")
-
-             # Save enhanced feature engineer if available
-             if self.use_enhanced_features and self.feature_engineer is not None:
-                 try:
-                     self.feature_engineer.save_pipeline(self.feature_engineer_path)
-                     logger.info(f"✅ Saved feature engineer to {self.feature_engineer_path}")
-                 except Exception as e:
-                     logger.warning(f"Could not save feature engineer: {e}")
-
-             # Save individual components for backward compatibility
-             try:
-                 if hasattr(model, 'named_steps'):
-                     if 'model' in model.named_steps:
-                         joblib.dump(model.named_steps['model'], self.model_path)
-                         logger.info(f"✅ Saved model component to {self.model_path}")
-
-                     # Save vectorizer (standard pipeline) or enhanced features reference
-                     if 'vectorize' in model.named_steps:
-                         joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
-                         logger.info(f"✅ Saved vectorizer to {self.vectorizer_path}")
-                     elif 'enhanced_features' in model.named_steps:
-                         # Save reference to enhanced features
-                         enhanced_ref = {
-                             'type': 'enhanced_features',
-                             'feature_engineer_path': str(self.feature_engineer_path),
-                             'metadata': self.feature_engineer.get_feature_metadata() if self.feature_engineer else {}
-                         }
-                         joblib.dump(enhanced_ref, self.vectorizer_path)
-                         logger.info(f"✅ Saved enhanced features reference to {self.vectorizer_path}")
-
-             except Exception as e:
-                 logger.warning(f"Could not save individual components: {e}")
-
-             # Generate data hash
-             data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
-
-             # Extract CV results
-             cv_results = metrics.get('cross_validation', {})
-
-             # Create enhanced metadata with feature engineering information
-             metadata = {
-                 'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
-                 'model_type': model_name,
-                 'feature_engineering': {
-                     'type': 'enhanced' if self.use_enhanced_features else 'standard',
-                     'enhanced_features_available': ENHANCED_FEATURES_AVAILABLE,
-                     'enhanced_features_used': self.use_enhanced_features
-                 },
-                 'data_version': data_hash,
-                 'test_accuracy': metrics['accuracy'],
-                 'test_f1': metrics['f1'],
-                 'test_precision': metrics['precision'],
-                 'test_recall': metrics['recall'],
-                 'test_roc_auc': metrics['roc_auc'],
-                 'overfitting_score': metrics.get('overfitting_score', 'Unknown'),
-                 'timestamp': datetime.now().isoformat(),
-                 'training_config': {
-                     'test_size': self.test_size,
-                     'cv_folds': self.cv_folds,
-                     'max_features': self.max_features,
-                     'ngram_range': self.ngram_range,
-                     'feature_selection_k': self.feature_selection_k,
-                     'use_enhanced_features': self.use_enhanced_features
-                 }
              }
-             # Add enhanced feature metadata
-             if self.use_enhanced_features:
-                 feature_metadata = metrics.get('feature_metadata', {})
-                 if feature_metadata:
-                     metadata['enhanced_features'] = {
-                         'total_features': feature_metadata.get('total_features', 0),
-                         'feature_types': feature_metadata.get('feature_types', {}),
-                         'configuration': feature_metadata.get('configuration', {})
                      }
-             # Add top features if available
-             top_features = metrics.get('top_features', {})
-             if top_features:
-                 metadata['top_features'] = dict(list(top_features.items())[:10])  # Top 10 features
-
-             # Save detailed feature importance
-             try:
-                 feature_analysis = {
-                     'top_features': top_features,
-                     'feature_metadata': feature_metadata,
-                     'timestamp': datetime.now().isoformat(),
-                     'model_version': metadata['model_version']
-                 }
-
-                 with open(self.feature_importance_path, 'w') as f:
-                     json.dump(feature_analysis, f, indent=2)
-                 logger.info(f"✅ Saved feature importance analysis to {self.feature_importance_path}")
-
-             except Exception as e:
-                 logger.warning(f"Could not save feature importance: {e}")
-
-             # Add comprehensive CV results to metadata
-             if cv_results and 'test_scores' in cv_results:
-                 metadata['cross_validation'] = {
-                     'n_splits': cv_results.get('n_splits', self.cv_folds),
-                     'test_scores': cv_results['test_scores'],
-                     'train_scores': cv_results.get('train_scores', {}),
-                     'overfitting_score': cv_results.get('overfitting_score', 'Unknown'),
-                     'stability_score': cv_results.get('stability_score', 'Unknown'),
-                     'individual_fold_results': cv_results.get('fold_results', [])
                  }
-                 # Add summary statistics
-                 if 'f1' in cv_results['test_scores']:
-                     metadata['cv_f1_mean'] = cv_results['test_scores']['f1']['mean']
-                     metadata['cv_f1_std'] = cv_results['test_scores']['f1']['std']
-                     metadata['cv_f1_min'] = cv_results['test_scores']['f1']['min']
-                     metadata['cv_f1_max'] = cv_results['test_scores']['f1']['max']
-
-                 if 'accuracy' in cv_results['test_scores']:
-                     metadata['cv_accuracy_mean'] = cv_results['test_scores']['accuracy']['mean']
-                     metadata['cv_accuracy_std'] = cv_results['test_scores']['accuracy']['std']
-
-             # Add model comparison results if available
-             if len(results) > 1:
-                 model_comparison = {}
-                 for other_model_name, other_result in results.items():
-                     if other_model_name != model_name and 'error' not in other_result:
-                         other_cv = other_result['evaluation_metrics'].get('cross_validation', {})
-                         if cv_results and other_cv:
-                             comparison = self.cv_manager.compare_cv_results(cv_results, other_cv)
-                             model_comparison[other_model_name] = comparison
-
-                 if model_comparison:
-                     metadata['model_comparison'] = model_comparison
-
-             # Save metadata with error handling
-             try:
-                 with open(self.metadata_path, 'w') as f:
-                     json.dump(metadata, f, indent=2)
-                 logger.info(f"✅ Saved enhanced metadata to {self.metadata_path}")
-             except Exception as e:
-                 logger.warning(f"Could not save metadata: {e}")
-
-             # Log feature engineering summary
-             if self.use_enhanced_features and feature_metadata:
-                 logger.info(f"✅ Enhanced features summary:")
-                 logger.info(f"  Total features: {feature_metadata.get('total_features', 0)}")
-                 for feature_type, count in feature_metadata.get('feature_types', {}).items():
-                     logger.info(f"  {feature_type}: {count}")
-
-             logger.info(f"✅ Model artifacts saved successfully with {'enhanced' if self.use_enhanced_features else 'standard'} features")
-             return True
-
-         except Exception as e:
-             logger.error(f"Failed to save model artifacts: {str(e)}")
-             # Try to save at least the core pipeline
-             try:
-                 joblib.dump(model, Path("/tmp/pipeline_backup.pkl"))
-                 logger.info("✅ Saved backup pipeline")
-                 return True
-             except Exception as e2:
-                 logger.error(f"Failed to save backup pipeline: {str(e2)}")
-                 return False
-
-     def train_model(self, data_path: str = None, force_enhanced: bool = None) -> Tuple[bool, str]:
-         """Main training function with enhanced feature engineering pipeline"""
-         try:
-             # Override enhanced features setting if specified
-             if force_enhanced is not None:
-                 original_setting = self.use_enhanced_features
-                 self.use_enhanced_features = force_enhanced and ENHANCED_FEATURES_AVAILABLE
-                 if force_enhanced and not ENHANCED_FEATURES_AVAILABLE:
-                     logger.warning("Enhanced features requested but not available, using standard features")
-
-             feature_type = "enhanced" if self.use_enhanced_features else "standard"
-             logger.info(f"Starting {feature_type} model training with cross-validation...")
-
-             # Override data path if provided
-             if data_path:
-                 self.data_path = Path(data_path)
-
-             # Load and validate data
-             success, df, message = self.load_and_validate_data()
-             if not success:
-                 return False, message
-
-             # Estimate training time and setup progress tracker
-             time_estimate = estimate_training_time(
-                 len(df),
-                 enable_tuning=True,
-                 cv_folds=self.cv_folds,
-                 use_enhanced_features=self.use_enhanced_features
-             )
-
-             print(f"\n📊 Enhanced Training Configuration:")
-             print(f"Dataset size: {len(df)} samples")
-             print(f"Feature engineering: {feature_type.title()}")
-             print(f"Cross-validation folds: {self.cv_folds}")
-             print(f"Estimated time: {time_estimate['total_formatted']}")
-             print(f"Models to train: {len(self.models)}")
-             print(f"Hyperparameter tuning: Enabled")
-             if self.use_enhanced_features:
-                 print(f"Enhanced features: Sentiment, Readability, Entities, Linguistic")
-             print()
-
-             # Setup progress tracker (adjusted for enhanced features)
-             base_steps = 4 + (len(self.models) * 3) + 1  # Basic steps
-             enhanced_steps = 2 if self.use_enhanced_features else 0  # Feature engineering steps
-             total_steps = base_steps + enhanced_steps
-             self.progress_tracker = ProgressTracker(total_steps, f"{feature_type.title()} Training Progress")
-
-             # Prepare data
-             X = df['text'].values
-             y = df['label'].values
-
-             # Train-test split with smart handling for small datasets
-             self.progress_tracker.update("Splitting data")
-
-             # Ensure minimum test size for very small datasets
-             if len(X) < 10:
-                 test_size = max(0.1, 1/len(X))  # At least 1 sample for test
-             else:
-                 test_size = self.test_size
-
-             # Check if stratification is possible
-             label_counts = pd.Series(y).value_counts()
-             min_class_count = label_counts.min()
-             can_stratify = min_class_count >= 2 and len(y) >= 4
-
-             X_train, X_test, y_train, y_test = train_test_split(
-                 X, y,
-                 test_size=test_size,
-                 stratify=y if can_stratify else None,
-                 random_state=self.random_state
-             )
-
-             logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test")
-
-             # Additional validation for very small datasets
-             if len(X_train) < 3:
-                 logger.warning(f"Very small training set: {len(X_train)} samples. CV results may be unreliable.")
-             if len(X_test) < 1:
-                 return False, "Cannot create test set. Dataset too small."
-
-             # Train and evaluate models with enhanced features
-             results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
-
-             # Select best model
-             best_model_name, best_model, best_metrics = self.select_best_model(results)
-
-             # Save model artifacts with enhanced feature information
-             if not self.save_model_artifacts(best_model, best_model_name, best_metrics, results):
-                 return False, "Failed to save model artifacts"
-
-             # Finish progress tracking
-             self.progress_tracker.finish()
-
-             # Create success message with enhanced feature information
-             cv_results = best_metrics.get('cross_validation', {})
-             cv_info = ""
-             if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
-                 cv_f1_mean = cv_results['test_scores']['f1']['mean']
-                 cv_f1_std = cv_results['test_scores']['f1']['std']
-                 cv_info = f", CV F1: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})"
-
-             # Enhanced features summary
-             feature_info = ""
-             if self.use_enhanced_features:
-                 feature_metadata = best_metrics.get('feature_metadata', {})
-                 if feature_metadata:
-                     total_features = feature_metadata.get('total_features', 0)
-                     feature_info = f", Enhanced Features: {total_features}"
-
-             success_message = (
-                 f"{feature_type.title()} model training completed successfully. "
-                 f"Best model: {best_model_name} "
-                 f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info}{feature_info})"
-             )
-
-             logger.info(success_message)
-             return True, success_message
-
-         except Exception as e:
-             if self.progress_tracker:
-                 print()  # New line after progress bar
-             error_message = f"Enhanced model training failed: {str(e)}"
-             logger.error(error_message)
-             return False, error_message

  def main():
-     """Main execution function with enhanced feature engineering support"""
      import argparse

      # Parse command line arguments
-     parser = argparse.ArgumentParser(description='Train fake news detection model with enhanced features')
      parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
      parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
      parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
      parser.add_argument('--enhanced_features', action='store_true', help='Force use of enhanced features')
      parser.add_argument('--standard_features', action='store_true', help='Force use of standard TF-IDF features only')
      args = parser.parse_args()

      # Determine feature engineering mode
@@ -1222,7 +1409,18 @@ def main():
          use_enhanced = False
          logger.info("Standard features explicitly requested")

-     trainer = EnhancedModelTrainer(use_enhanced_features=use_enhanced)

      # Apply CV folds from command line
      if args.cv_folds:
@@ -1246,6 +1444,14 @@ def main():
          if 'enhanced_features' in config and use_enhanced is None:
              trainer.use_enhanced_features = config['enhanced_features'] and ENHANCED_FEATURES_AVAILABLE

          # Filter models if specified
          selected_models = config.get('selected_models')
          if selected_models and len(selected_models) < len(trainer.models):
@@ -1258,10 +1464,19 @@ def main():
          logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds")
          if trainer.use_enhanced_features:
              logger.info("Enhanced features enabled via configuration")

      except Exception as e:
          logger.warning(f"Failed to load configuration: {e}, using defaults")

      success, message = trainer.train_model(data_path=args.data_path)

      if success:
@@ -1277,6 +1492,23 @@ def main():
              print(f"  {feature_type}: {count}")
          except Exception as e:
              logger.warning(f"Could not display feature summary: {e}")
      else:
          print(f"❌ {message}")
          exit(1)

+ # Enhanced model/train.py with LightGBM ensemble integration

  import seaborn as sns
  import matplotlib.pyplot as plt
      train_test_split, cross_val_score, GridSearchCV,
      StratifiedKFold, validation_curve, cross_validate
  )
+ from sklearn.ensemble import RandomForestClassifier, VotingClassifier
  from sklearn.linear_model import LogisticRegression
  from sklearn.feature_extraction.text import TfidfVectorizer
  import pandas as pd
  from typing import Dict, Tuple, Optional, Any, List
  import warnings
  import re
+
+ # LightGBM import
+ try:
+     import lightgbm as lgb
+     LIGHTGBM_AVAILABLE = True
+     logging.info("LightGBM available for ensemble training")
+ except ImportError:
+     LIGHTGBM_AVAILABLE = False
+     logging.warning("LightGBM not available - ensemble training will use alternative algorithms")
+
  warnings.filterwarnings('ignore')

  # Import enhanced feature engineering components
  logger = logging.getLogger(__name__)

+ class EnsembleModelTrainer:
+     """Production-ready ensemble model trainer with LightGBM integration"""
+
+     def __init__(self, use_enhanced_features: bool = None, use_ensemble: bool = True):
+         # Auto-detect enhanced features if not specified
+         if use_enhanced_features is None:
+             self.use_enhanced_features = ENHANCED_FEATURES_AVAILABLE
+         else:
+             self.use_enhanced_features = use_enhanced_features and ENHANCED_FEATURES_AVAILABLE
+
+         self.use_ensemble = use_ensemble and LIGHTGBM_AVAILABLE
+
+         self.setup_paths()
+         self.setup_training_config()
+         self.setup_models()
+         self.progress_tracker = None
+         self.cv_manager = CrossValidationManager()
+
+         # Enhanced feature tracking
+         self.feature_engineer = None
+         self.feature_importance_results = {}
+
+         logger.info(f"Ensemble trainer initialized - Enhanced features: {self.use_enhanced_features}, "
+                     f"LightGBM ensemble: {self.use_ensemble}")
+
+     def setup_paths(self):
+         """Setup all necessary paths with proper permissions"""
+         self.base_dir = Path("/tmp")
+         self.data_dir = self.base_dir / "data"
+         self.model_dir = self.base_dir / "model"
+         self.results_dir = self.base_dir / "results"
+         self.features_dir = self.base_dir / "features"
+
+         # Create directories with proper permissions
+         for dir_path in [self.data_dir, self.model_dir, self.results_dir, self.features_dir]:
+             dir_path.mkdir(parents=True, exist_ok=True)
+             try:
+                 dir_path.chmod(0o755)
+             except:
+                 pass
+
+         # File paths
+         self.data_path = self.data_dir / "combined_dataset.csv"
+         self.model_path = Path("/tmp/model.pkl")
+         self.vectorizer_path = Path("/tmp/vectorizer.pkl")
+         self.pipeline_path = Path("/tmp/pipeline.pkl")
+         self.metadata_path = Path("/tmp/metadata.json")
+         self.evaluation_path = self.results_dir / "evaluation_results.json"
+
+         # Enhanced feature paths
+         self.feature_engineer_path = Path("/tmp/feature_engineer.pkl")
+         self.feature_importance_path = self.results_dir / "feature_importance.json"
+
+         # Ensemble-specific paths
+         self.ensemble_path = Path("/tmp/ensemble.pkl")
+         self.ensemble_metadata_path = Path("/tmp/ensemble_metadata.json")
+
+     def setup_training_config(self):
+         """Setup training configuration with ensemble parameters"""
+         self.test_size = 0.2
+         self.validation_size = 0.1
+         self.random_state = 42
+         self.cv_folds = 5
+
+         # Enhanced feature configuration
+         if self.use_enhanced_features:
+             self.max_features = 7500
+             self.feature_selection_k = 3000
+             logger.info("Using enhanced feature engineering pipeline")
          else:
+             self.max_features = 5000
+             self.feature_selection_k = 2000
+             logger.info("Using standard TF-IDF feature pipeline")
+
+         # Common parameters
+         self.min_df = 1
+         self.max_df = 0.95
+         self.ngram_range = (1, 2)
+         self.max_iter = 500
+         self.class_weight = 'balanced'
+
+         # LightGBM specific parameters
+         self.lgb_params = {
+             'objective': 'binary',
+             'metric': 'binary_logloss',
+             'boosting_type': 'gbdt',
+             'num_leaves': 31,
+             'learning_rate': 0.1,
+             'feature_fraction': 0.8,
+             'bagging_fraction': 0.8,
+             'bagging_freq': 5,
+             'verbose': -1,
+             'random_state': self.random_state,
+             'class_weight': 'balanced'
          }

+     def setup_models(self):
+         """Setup model configurations including LightGBM ensemble"""
+         # Base models
+         self.models = {
+             'logistic_regression': {
+                 'model': LogisticRegression(
+                     max_iter=self.max_iter,
+                     class_weight=self.class_weight,
+                     random_state=self.random_state,
+                     n_jobs=-1
+                 ),
+                 'param_grid': {
+                     'model__C': [0.1, 1, 10],
+                     'model__penalty': ['l2']
+                 }
+             },
+             'random_forest': {
+                 'model': RandomForestClassifier(
+                     n_estimators=50,
+                     class_weight=self.class_weight,
+                     random_state=self.random_state,
+                     n_jobs=-1
+                 ),
+                 'param_grid': {
+                     'model__n_estimators': [50, 100],
+                     'model__max_depth': [10, None]
+                 }
+             }
+         }
+
+         # Add LightGBM if available
+         if LIGHTGBM_AVAILABLE and self.use_ensemble:
+             self.models['lightgbm'] = {
+                 'model': lgb.LGBMClassifier(
+                     **self.lgb_params,
+                     n_estimators=100
+                 ),
+                 'param_grid': {
+                     'model__n_estimators': [50, 100],
+                     'model__learning_rate': [0.05, 0.1],
+                     'model__num_leaves': [31, 63]
+                 }
+             }
+
+     def create_lightgbm_ensemble(self, models_dict: Dict, X_train, y_train) -> VotingClassifier:
+         """Create ensemble with LightGBM and traditional models"""
+         if not LIGHTGBM_AVAILABLE:
+             logger.warning("LightGBM not available for ensemble creation")
+             return None
+
+         logger.info("Creating LightGBM ensemble model...")
+
+         # Prepare estimators for voting classifier
+         estimators = []
+
+         for model_name, model_info in models_dict.items():
+             if 'best_estimator' in model_info:
+                 model = model_info['best_estimator']
+                 # Extract the actual model from pipeline
+                 if hasattr(model, 'named_steps') and 'model' in model.named_steps:
+                     actual_model = model.named_steps['model']
+                 else:
+                     actual_model = model
+
+                 estimators.append((model_name, actual_model))
+
+         if len(estimators) < 2:
+             logger.warning("Not enough models for ensemble creation")
+             return None
+
+         # Create ensemble with soft voting for probability-based predictions
+         ensemble = VotingClassifier(
+             estimators=estimators,
+             voting='soft'
          )
+
+         logger.info(f"Ensemble created with {len(estimators)} models: {[name for name, _ in estimators]}")
+         return ensemble
+
+     def train_ensemble_model(self, X_train, X_test, y_train, y_test, individual_results: Dict) -> Dict:
+         """Train and evaluate ensemble model"""
+         if not self.use_ensemble or not LIGHTGBM_AVAILABLE:
+             logger.info("Ensemble training skipped - using best individual model")
+             return {}
+
+         logger.info("Training ensemble model with LightGBM integration...")
          try:
+             # Create ensemble from individual models
+             ensemble = self.create_lightgbm_ensemble(individual_results, X_train, y_train)
+
+             if ensemble is None:
+                 return {'error': 'Failed to create ensemble'}
+
+             # Train ensemble (models are already trained, just fitting the voting mechanism)
+             logger.info("Training ensemble voting mechanism...")
+
+             # For voting classifier with already-fitted models, we need to fit on features
+             # First, we need to prepare features the same way
+             pipeline = self.create_preprocessing_pipeline()
+             X_train_processed = pipeline.fit_transform(X_train, y_train)
+             X_test_processed = pipeline.transform(X_test)
+
+             # Fit the ensemble
+             ensemble.fit(X_train_processed, y_train)
+
+             # Evaluate ensemble
+             ensemble_metrics = self.comprehensive_evaluation_ensemble(
+                 ensemble, X_test_processed, y_test, X_train_processed, y_train
              )
+
+             # Create ensemble pipeline for consistency
+             ensemble_pipeline = Pipeline([
+                 ('preprocessing', pipeline.steps[0][1]),  # Use same preprocessing
+                 ('ensemble', ensemble)
+             ])
+
+             ensemble_results = {
+                 'ensemble': ensemble_pipeline,
+                 'evaluation_metrics': ensemble_metrics,
+                 'component_models': list(individual_results.keys()),
+                 'ensemble_type': 'voting_classifier_with_lightgbm' if 'lightgbm' in individual_results else 'voting_classifier',
+                 'training_time': datetime.now().isoformat(),
+                 'feature_type': 'enhanced' if self.use_enhanced_features else 'standard'
              }
+
+             logger.info(f"Ensemble training completed - F1: {ensemble_metrics.get('f1', 'N/A'):.4f}")
+             return ensemble_results
+
+         except Exception as e:
+             logger.error(f"Ensemble training failed: {str(e)}")
+             return {'error': str(e)}
+
+     def comprehensive_evaluation_ensemble(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
+         """Comprehensive evaluation specifically for ensemble models"""
+         logger.info("Evaluating ensemble model...")
+
+         # Predictions
+         y_pred = model.predict(X_test)
+         y_pred_proba = model.predict_proba(X_test)[:, 1]
+
+         # Basic metrics
+         metrics = {
+             'accuracy': float(accuracy_score(y_test, y_pred)),
+             'precision': float(precision_score(y_test, y_pred, average='weighted')),
+             'recall': float(recall_score(y_test, y_pred, average='weighted')),
+             'f1': float(f1_score(y_test, y_pred, average='weighted')),
+             'roc_auc': float(roc_auc_score(y_test, y_pred_proba))
+         }
+
+         # Confusion matrix
+         cm = confusion_matrix(y_test, y_pred)
+         metrics['confusion_matrix'] = cm.tolist()
+
+         # Cross-validation on full dataset
+         if X_train is not None and y_train is not None:
+             X_full = np.concatenate([X_train, X_test])
+             y_full = np.concatenate([y_train, y_test])
+
+             logger.info("Performing cross-validation on ensemble...")
+             cv_results = self.cv_manager.perform_cross_validation(model, X_full, y_full)
+             metrics['cross_validation'] = cv_results
+
+             if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
+                 cv_f1_mean = cv_results['test_scores']['f1']['mean']
+                 cv_f1_std = cv_results['test_scores']['f1']['std']
+                 logger.info(f"Ensemble CV F1 Score: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})")
+
+         # Ensemble-specific metrics
+         metrics['ensemble_info'] = {
+             'model_type': 'ensemble',
+             'voting_type': getattr(model, 'voting', 'unknown'),
+             'n_estimators': len(getattr(model, 'estimators_', [])),
+             'estimator_names': [name for name, _ in getattr(model, 'estimators', [])]
+         }
+
+         return metrics
+
+     def select_best_model(self, results: Dict, ensemble_results: Dict = None) -> Tuple[str, Any, Dict]:
+         """Select the best performing model including ensemble option"""
+         logger.info("Selecting best model from individual models and ensemble...")
+
+         best_model_name = None
+         best_model = None
+         best_score = -1
+         best_metrics = None
+
+         # Evaluate individual models
+         for model_name, result in results.items():
+             if 'error' in result:
+                 continue
+
+             cv_results = result['evaluation_metrics'].get('cross_validation', {})
+             if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
+                 f1_score = cv_results['test_scores']['f1']['mean']
+                 score_type = "CV F1"
+             else:
+                 f1_score = result['evaluation_metrics']['f1']
+                 score_type = "Test F1"
+
+             logger.info(f"Model {model_name}: {score_type} = {f1_score:.4f}")
+
+             if f1_score > best_score:
+                 best_score = f1_score
+                 best_model_name = model_name
+                 best_model = result['model']
+                 best_metrics = result['evaluation_metrics']
+
+         # Evaluate ensemble if available
+         if ensemble_results and 'evaluation_metrics' in ensemble_results:
+             ensemble_metrics = ensemble_results['evaluation_metrics']
+
+             cv_results = ensemble_metrics.get('cross_validation', {})
+             if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
+                 ensemble_f1 = cv_results['test_scores']['f1']['mean']
+                 score_type = "CV F1"
+             else:
+                 ensemble_f1 = ensemble_metrics['f1']
+                 score_type = "Test F1"
+
+             logger.info(f"Ensemble model: {score_type} = {ensemble_f1:.4f}")
+
+             if ensemble_f1 > best_score:
+                 best_score = ensemble_f1
+                 best_model_name = "ensemble"
+                 best_model = ensemble_results['ensemble']
+                 best_metrics = ensemble_metrics
+
+         if best_model_name is None:
+             raise ValueError("No models trained successfully")
+
+         logger.info(f"Best model selected: {best_model_name} with F1 score: {best_score:.4f}")
+         return best_model_name, best_model, best_metrics
+
+     def save_model_artifacts(self, model, model_name: str, metrics: Dict, results: Dict,
+                              ensemble_results: Dict = None) -> bool:
+         """Enhanced model artifacts saving with ensemble support"""
+         try:
+             logger.info(f"Saving model artifacts for {model_name}...")
+
+             # Save the main pipeline/model
+             if model_name == "ensemble":
+                 # Save ensemble model
+                 joblib.dump(model, self.ensemble_path)
+                 logger.info(f"Saved ensemble model to {self.ensemble_path}")
+
+                 # Also save as main pipeline for API compatibility
+                 joblib.dump(model, self.pipeline_path)
+                 logger.info(f"Saved ensemble as main pipeline to {self.pipeline_path}")
+
+                 # Save ensemble metadata
+                 ensemble_metadata = {
+                     'model_type': 'ensemble',
+                     'ensemble_type': ensemble_results.get('ensemble_type', 'voting_classifier'),
+                     'component_models': ensemble_results.get('component_models', []),
+                     'ensemble_info': metrics.get('ensemble_info', {}),
+                     'timestamp': datetime.now().isoformat()
+                 }
+
+                 with open(self.ensemble_metadata_path, 'w') as f:
+                     json.dump(ensemble_metadata, f, indent=2)
+                 logger.info(f"Saved ensemble metadata to {self.ensemble_metadata_path}")
+
+             else:
+                 # Save individual model pipeline
+                 joblib.dump(model, self.pipeline_path)
+                 logger.info(f"Saved {model_name} pipeline to {self.pipeline_path}")
+
+             # Save individual components for backward compatibility
+             try:
+                 if hasattr(model, 'named_steps'):
+                     if 'model' in model.named_steps:
+                         joblib.dump(model.named_steps['model'], self.model_path)
+                     elif 'ensemble' in model.named_steps:
+                         joblib.dump(model.named_steps['ensemble'], self.model_path)
+
+                     # Save vectorizer or enhanced features reference
+                     if 'vectorize' in model.named_steps:
+                         joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
+                     elif 'enhanced_features' in model.named_steps:
+                         enhanced_ref = {
+                             'type': 'enhanced_features',
+                             'feature_engineer_path': str(self.feature_engineer_path),
+                             'metadata': self.feature_engineer.get_feature_metadata() if self.feature_engineer else {}
+                         }
+                         joblib.dump(enhanced_ref, self.vectorizer_path)
+
+             except Exception as e:
+                 logger.warning(f"Could not save individual components: {e}")
+
+             # Generate enhanced metadata
+             metadata = self._create_enhanced_metadata(model_name, metrics, results, ensemble_results)
+
+             # Save metadata
+             with open(self.metadata_path, 'w') as f:
+                 json.dump(metadata, f, indent=2)
+             logger.info(f"Saved metadata to {self.metadata_path}")
+
+             return True
+
          except Exception as e:
+             logger.error(f"Failed to save model artifacts: {str(e)}")
+             return False
+
+     def _create_enhanced_metadata(self, model_name: str, metrics: Dict, results: Dict,
+                                   ensemble_results: Dict = None) -> Dict:
+         """Create comprehensive metadata including ensemble information"""
+         # Generate data hash and version
+         data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
+         version_suffix = "ensemble" if model_name == "ensemble" else model_name
+
+         metadata = {
+             'model_version': f"v2.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{version_suffix}",
+             'model_type': model_name,
+             'is_ensemble': model_name == "ensemble",
+             'data_version': data_hash,
+             'test_accuracy': metrics['accuracy'],
+             'test_f1': metrics['f1'],
+             'test_precision': metrics['precision'],
+             'test_recall': metrics['recall'],
+             'test_roc_auc': metrics['roc_auc'],
+             'timestamp': datetime.now().isoformat(),
+             'training_method': 'enhanced_ensemble_training' if self.use_ensemble else 'enhanced_individual_training',
+             'lightgbm_available': LIGHTGBM_AVAILABLE,
+             'lightgbm_used': self.use_ensemble and LIGHTGBM_AVAILABLE
+         }
+
+         # Add feature engineering information
+         metadata['feature_engineering'] = {
+             'type': 'enhanced' if self.use_enhanced_features else 'standard',
+             'enhanced_features_available': ENHANCED_FEATURES_AVAILABLE,
+             'enhanced_features_used': self.use_enhanced_features
+         }
+
+         # Add ensemble-specific metadata
+         if model_name == "ensemble" and ensemble_results:
+             metadata['ensemble_details'] = {
+                 'ensemble_type': ensemble_results.get('ensemble_type', 'voting_classifier'),
+                 'component_models': ensemble_results.get('component_models', []),
+                 'ensemble_info': metrics.get('ensemble_info', {}),
+                 'voting_type': metrics.get('ensemble_info', {}).get('voting_type', 'soft')
              }
+
+             # Add individual model performances for comparison
+             metadata['component_performance'] = {}
+             for comp_model_name in ensemble_results.get('component_models', []):
+                 if comp_model_name in results and 'evaluation_metrics' in results[comp_model_name]:
+                     comp_metrics = results[comp_model_name]['evaluation_metrics']
+                     metadata['component_performance'][comp_model_name] = {
+                         'f1': comp_metrics.get('f1', 0),
+                         'accuracy': comp_metrics.get('accuracy', 0)
+                     }
+
+         # Add CV results
+         cv_results = metrics.get('cross_validation', {})
+         if cv_results and 'test_scores' in cv_results:
+             metadata['cross_validation'] = {
+                 'n_splits': cv_results.get('n_splits', self.cv_folds),
+                 'test_scores': cv_results['test_scores'],
+                 'train_scores': cv_results.get('train_scores', {}),
+                 'overfitting_score': cv_results.get('overfitting_score', 'Unknown'),
+                 'stability_score': cv_results.get('stability_score', 'Unknown')
+             }
+
+             if 'f1' in cv_results['test_scores']:
+                 metadata.update({
+                     'cv_f1_mean': cv_results['test_scores']['f1']['mean'],
+                     'cv_f1_std': cv_results['test_scores']['f1']['std']
+                 })
+
+         # Add training configuration
+         metadata['training_config'] = {
+             'test_size': self.test_size,
+             'cv_folds': self.cv_folds,
+             'max_features': self.max_features,
+             'use_ensemble': self.use_ensemble,
+             'use_enhanced_features': self.use_enhanced_features
+         }
+
+         return metadata
+
+ def train_model(self, data_path: str = None, force_enhanced: bool = None,
554
+ use_ensemble: bool = None) -> Tuple[bool, str]:
555
+ """Main training function with ensemble support"""
556
  try:
557
+ # Override settings if specified
558
+ if force_enhanced is not None:
559
+ original_enhanced = self.use_enhanced_features
560
+ self.use_enhanced_features = force_enhanced and ENHANCED_FEATURES_AVAILABLE
561
+
562
+ if use_ensemble is not None:
563
+ self.use_ensemble = use_ensemble and LIGHTGBM_AVAILABLE
564
+
565
+ feature_type = "enhanced" if self.use_enhanced_features else "standard"
566
+ training_type = "ensemble" if self.use_ensemble else "individual"
567
 
568
+ logger.info(f"Starting {feature_type} {training_type} model training...")
569
+
570
+ # Override data path if provided
571
+ if data_path:
572
+ self.data_path = Path(data_path)
573
+
574
+ # Load and validate data
575
+ success, df, message = self.load_and_validate_data()
576
+ if not success:
577
+ return False, message
578
+
579
+ # Estimate training time
580
+ time_estimate = estimate_training_time(
581
+ len(df),
582
+ enable_tuning=True,
583
+ cv_folds=self.cv_folds,
584
+ use_enhanced_features=self.use_enhanced_features,
585
+ use_ensemble=self.use_ensemble
586
+ )
587
 
588
+ model_count = len(self.models)
589
+ logger.info(f"Training Configuration:")
590
+ logger.info(f" Dataset size: {len(df)} samples")
591
+ logger.info(f" Feature engineering: {feature_type.title()}")
592
+ logger.info(f" Training approach: {training_type.title()}")
593
+ logger.info(f" Models to train: {model_count}")
594
+ logger.info(f" LightGBM available: {LIGHTGBM_AVAILABLE}")
595
+ logger.info(f" Estimated time: {time_estimate['total_formatted']}")
596
+
597
+ # Setup progress tracker
598
+ base_steps = 4 + (model_count * 3) + 2 # Base + model training + ensemble
599
+ enhanced_steps = 2 if self.use_enhanced_features else 0
600
+ ensemble_steps = 3 if self.use_ensemble else 0
601
+ total_steps = base_steps + enhanced_steps + ensemble_steps
602
 
603
+ self.progress_tracker = ProgressTracker(
604
+ total_steps,
605
+ f"{feature_type.title()} {training_type.title()} Training"
606
+ )
607
+
608
+ # Prepare data
609
+ X = df['text'].values
610
+ y = df['label'].values
611
+
612
+ # Train-test split
613
+ self.progress_tracker.update("Splitting data")
 
 
 
614
 
615
+ if len(X) < 10:
616
+ test_size = max(0.1, 1/len(X))
617
+ else:
618
+ test_size = self.test_size
619
+
620
+ label_counts = pd.Series(y).value_counts()
621
+ min_class_count = label_counts.min()
622
+ can_stratify = min_class_count >= 2 and len(y) >= 4
623
 
624
+ X_train, X_test, y_train, y_test = train_test_split(
625
+ X, y,
626
+ test_size=test_size,
627
+ stratify=y if can_stratify else None,
628
+ random_state=self.random_state
629
+ )
630
 
631
+ logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test")
632
 
633
+ # Train individual models
634
+ results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
635
+
636
+ # Train ensemble if enabled
637
+ ensemble_results = {}
638
+ if self.use_ensemble and len([r for r in results.values() if 'error' not in r]) >= 2:
639
+ self.progress_tracker.update("Creating ensemble model")
640
+ ensemble_results = self.train_ensemble_model(X_train, X_test, y_train, y_test, results)
641
+
642
+ if ensemble_results and 'error' not in ensemble_results:
643
+ logger.info("Ensemble model trained successfully")
644
+ else:
645
+ logger.warning("Ensemble training failed, using best individual model")
646
 
647
+ # Select best model (individual or ensemble)
648
+ best_model_name, best_model, best_metrics = self.select_best_model(results, ensemble_results)
649
+
650
+ # Save model artifacts
651
+ if not self.save_model_artifacts(best_model, best_model_name, best_metrics, results, ensemble_results):
652
+ return False, "Failed to save model artifacts"
653
+
654
+ # Finish progress tracking
655
+ self.progress_tracker.finish()
656
+
657
+ # Create success message
658
+ cv_results = best_metrics.get('cross_validation', {})
659
+ cv_info = ""
660
+ if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
661
+ cv_f1_mean = cv_results['test_scores']['f1']['mean']
662
+ cv_f1_std = cv_results['test_scores']['f1']['std']
663
+ cv_info = f", CV F1: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})"
664
+
665
+ # Enhanced features info
666
+ feature_info = f", {feature_type.title()} Features"
667
+ if self.use_enhanced_features:
668
+ feature_metadata = best_metrics.get('feature_metadata', {})
669
+ if feature_metadata:
670
+ total_features = feature_metadata.get('total_features', 0)
671
+ feature_info = f", Enhanced Features: {total_features}"
672
 
673
+ # Ensemble info
674
+ ensemble_info = ""
675
+ if best_model_name == "ensemble":
676
+ ensemble_details = best_metrics.get('ensemble_info', {})
677
+ n_models = ensemble_details.get('n_estimators', 0)
678
+ ensemble_info = f", Ensemble: {n_models} models"
 
679
 
680
+ success_message = (
681
+ f"{training_type.title()} model training completed successfully. "
682
+ f"Best model: {best_model_name} "
683
+ f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info}{feature_info}{ensemble_info})"
684
+ )
 
 
 
685
 
686
+ logger.info(success_message)
687
+ return True, success_message
 
 
 
 
 
 
 
 
 
 
+        except Exception as e:
+            if self.progress_tracker:
+                print()  # Terminate the in-place progress line before logging
+            error_message = f"Enhanced ensemble model training failed: {str(e)}"
+            logger.error(error_message)
+            return False, error_message
+
+    # The remaining trainer methods (load_and_validate_data, create_preprocessing_pipeline,
+    # comprehensive_evaluation, train_and_evaluate_models, etc.) are carried over from the
+    # original trainer, largely unchanged apart from minor modifications for ensemble support.
+
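A minimal sketch of driving this method directly, assuming the constructor signature used in main() below; the path and flag values are illustrative:

trainer = EnhancedModelTrainer(use_enhanced_features=True, use_ensemble=True)
success, message = trainer.train_model(
    data_path="data/train.csv",  # hypothetical path
    force_enhanced=True,         # silently downgraded if enhanced features are unavailable
    use_ensemble=True,           # silently downgraded if LightGBM is unavailable
)
print(message)
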
+
+def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5,
+                           use_enhanced_features: bool = False, use_ensemble: bool = False) -> Dict:
+    """Enhanced time estimation including ensemble training."""
+
+    # Base time estimates (in seconds)
+    base_times = {
+        'preprocessing': max(0.1, dataset_size * 0.001),
+        'vectorization': max(0.5, dataset_size * 0.01),
+        'feature_selection': max(0.2, dataset_size * 0.005),
+        'simple_training': max(1.0, dataset_size * 0.02),
+        'evaluation': max(0.5, dataset_size * 0.01),
+    }
+
+    # Enhanced feature engineering time multipliers
+    if use_enhanced_features:
+        base_times['preprocessing'] *= 2.5
+        base_times['vectorization'] *= 1.5
+        base_times['feature_selection'] *= 2.0
+        base_times['enhanced_feature_extraction'] = max(2.0, dataset_size * 0.05)
+
+    # LightGBM training time (typically faster than random forest, slower than logistic regression)
+    if use_ensemble and LIGHTGBM_AVAILABLE:
+        base_times['lightgbm_training'] = max(2.0, dataset_size * 0.03)
+        base_times['ensemble_creation'] = max(1.0, dataset_size * 0.005)
+        base_times['ensemble_evaluation'] = max(1.0, dataset_size * 0.015)
+
+    # Hyperparameter tuning multipliers
+    tuning_multipliers = {
+        'logistic_regression': 8 if enable_tuning else 1,
+        'random_forest': 12 if enable_tuning else 1,
+    }
+
+    if use_ensemble and LIGHTGBM_AVAILABLE:
+        tuning_multipliers['lightgbm'] = 10 if enable_tuning else 1
+
+    # Cross-validation multiplier
+    cv_multiplier = cv_folds if dataset_size > 100 else 1
+
+    # Calculate estimates
+    estimates = {}
+
+    # Preprocessing steps
+    estimates['data_loading'] = 0.5
+    estimates['preprocessing'] = base_times['preprocessing']
+    estimates['vectorization'] = base_times['vectorization']
+
+    if use_enhanced_features:
+        estimates['enhanced_feature_extraction'] = base_times['enhanced_feature_extraction']
+
+    estimates['feature_selection'] = base_times['feature_selection']
+
+    # Model training (includes CV)
+    for model_name, multiplier in tuning_multipliers.items():
+        model_time = base_times['simple_training'] * multiplier * cv_multiplier
+        estimates[f'{model_name}_training'] = model_time
+        estimates[f'{model_name}_evaluation'] = base_times['evaluation']
+
+    # Ensemble-specific steps
+    if use_ensemble and LIGHTGBM_AVAILABLE:
+        estimates['ensemble_creation'] = base_times['ensemble_creation']
+        estimates['ensemble_evaluation'] = base_times['ensemble_evaluation']
+        estimates['ensemble_cross_validation'] = base_times['simple_training'] * cv_folds * 0.3
+
+    # Cross-validation overhead
+    estimates['cross_validation'] = base_times['simple_training'] * cv_folds * 0.5
+
+    # Model saving
+    estimates['model_saving'] = 1.0
+
+    # Total estimate
+    total_estimate = sum(estimates.values())
+
+    # Add buffer for overhead
+    buffer_multiplier = 1.6 if use_ensemble else (1.4 if use_enhanced_features else 1.2)
+    total_estimate *= buffer_multiplier
+
+    return {
+        'detailed_estimates': estimates,
+        'total_seconds': total_estimate,
+        'total_formatted': str(timedelta(seconds=int(total_estimate))),
+        'dataset_size': dataset_size,
+        'enable_tuning': enable_tuning,
+        'cv_folds': cv_folds,
+        'use_enhanced_features': use_enhanced_features,
+        'use_ensemble': use_ensemble,
+        'lightgbm_available': LIGHTGBM_AVAILABLE
+    }
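As a rough worked example of the constants above: for 5,000 samples with tuning and 5-fold CV, 'simple_training' is 100 s, so the LightGBM term alone contributes 100 × 10 × 5 = 5,000 s before the 1.6× ensemble buffer. A quick way to inspect an estimate:

estimate = estimate_training_time(5000, enable_tuning=True, cv_folds=5,
                                  use_enhanced_features=True, use_ensemble=True)
print(estimate['total_formatted'])  # roughly seven hours for these inputs
for step, seconds in sorted(estimate['detailed_estimates'].items()):
    print(f"{step}: {seconds:.1f}s")
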
+
+# Inherit all remaining methods from the original trainer class
+class EnhancedModelTrainer(EnsembleModelTrainer):
+    """Complete enhanced model trainer inheriting from the ensemble trainer."""
+
     def load_and_validate_data(self) -> Tuple[bool, Optional[pd.DataFrame], str]:
         """Load and validate training data"""
         try:
 
 [... unchanged trainer methods omitted in the diff view ...]
 
         return results
 
+
+# Continue with the ProgressTracker and CrossValidationManager classes from the original module
+class ProgressTracker:
+    """Progress tracking with time estimation."""
+
+    def __init__(self, total_steps: int, description: str = "Training"):
+        self.total_steps = total_steps
+        self.current_step = 0
+        self.start_time = time.time()
+        self.description = description
+        self.step_times = []
+
+    def update(self, step_name: str = ""):
+        """Update progress and print status."""
+        self.current_step += 1
+        current_time = time.time()
+        elapsed = current_time - self.start_time
+
+        # Calculate progress percentage
+        progress_pct = (self.current_step / self.total_steps) * 100
+
+        # Estimate remaining time
+        if self.current_step > 0:
+            avg_time_per_step = elapsed / self.current_step
+            remaining_steps = self.total_steps - self.current_step
+            eta_seconds = avg_time_per_step * remaining_steps
+            eta = timedelta(seconds=int(eta_seconds))
+        else:
+            eta = "calculating..."
+
+        # Create progress bar
+        bar_length = 30
+        filled_length = int(bar_length * self.current_step // self.total_steps)
+        bar = '█' * filled_length + '░' * (bar_length - filled_length)
+
+        # Print progress (this will be visible in Streamlit logs)
+        status_msg = f"\r{self.description}: [{bar}] {progress_pct:.1f}% | Step {self.current_step}/{self.total_steps}"
+        if step_name:
+            status_msg += f" | {step_name}"
+        if eta != "calculating...":
+            status_msg += f" | ETA: {eta}"
+
+        print(status_msg, end='', flush=True)
+
+        # Also emit JSON for Streamlit parsing (if needed)
+        progress_json = {
+            "type": "progress",
+            "step": self.current_step,
+            "total": self.total_steps,
+            "percentage": progress_pct,
+            "eta": str(eta) if eta != "calculating..." else None,
+            "step_name": step_name,
+            "elapsed": elapsed
+        }
+        print(f"\nPROGRESS_JSON: {json.dumps(progress_json)}")
+
+        # Keep a rolling window of the three most recent step durations for
+        # smoother estimation, measured against the previous update (the
+        # original expression drifted once old entries were popped).
+        step_duration = current_time - getattr(self, '_last_update_time', self.start_time)
+        self._last_update_time = current_time
+        if len(self.step_times) >= 3:
+            self.step_times.pop(0)
+        self.step_times.append(step_duration)
+
+    def finish(self):
+        """Complete progress tracking."""
+        total_time = time.time() - self.start_time
+        print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
 
 
+class CrossValidationManager:
+    """Advanced cross-validation management with comprehensive metrics."""
+
+    def __init__(self, cv_folds: int = 5, random_state: int = 42):
+        self.cv_folds = cv_folds
+        self.random_state = random_state
+        self.cv_results = {}
+
+    def create_cv_strategy(self, X, y) -> StratifiedKFold:
+        """Create an appropriate CV strategy based on data characteristics."""
+        # Cap the number of CV folds for small datasets
+        n_samples = len(X)
+        min_samples_per_fold = 3  # Minimum samples per fold
+        max_folds = n_samples // min_samples_per_fold
+
+        # Adjust folds based on data size and class distribution
+        unique_classes = np.unique(y)
+        min_class_count = min([np.sum(y == cls) for cls in unique_classes])
+
+        # Ensure each fold has at least one sample from each class
+        max_folds_by_class = min_class_count
+
+        actual_folds = max(2, min(self.cv_folds, max_folds, max_folds_by_class))
+
+        logger.info(f"Using {actual_folds} CV folds (requested: {self.cv_folds})")
+
+        return StratifiedKFold(
+            n_splits=actual_folds,
+            shuffle=True,
+            random_state=self.random_state
+        )
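A concrete check of the fold-capping arithmetic above: with 10 samples split 7/3, max_folds = 10 // 3 = 3 and max_folds_by_class = 3, so a requested 5-fold CV is reduced to min(5, 3, 3) = 3 folds.

cv_manager = CrossValidationManager(cv_folds=5)
X_demo = np.arange(10).reshape(-1, 1)
y_demo = np.array([0] * 7 + [1] * 3)
print(cv_manager.create_cv_strategy(X_demo, y_demo).n_splits)  # 3
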
+
+    def perform_cross_validation(self, pipeline, X, y, cv_strategy=None) -> Dict:
+        """Perform comprehensive cross-validation with multiple metrics."""
+
+        if cv_strategy is None:
+            cv_strategy = self.create_cv_strategy(X, y)
+
+        logger.info(f"Starting cross-validation with {cv_strategy.n_splits} folds...")
+
+        # Define scoring metrics
+        scoring_metrics = {
+            'accuracy': 'accuracy',
+            'precision': 'precision_weighted',
+            'recall': 'recall_weighted',
+            'f1': 'f1_weighted',
+            'roc_auc': 'roc_auc'
+        }
+
         try:
+            # Perform cross-validation
+            cv_scores = cross_validate(
+                pipeline, X, y,
+                cv=cv_strategy,
+                scoring=scoring_metrics,
+                return_train_score=True,
+                n_jobs=1,  # Single job for stability
+                verbose=0
+            )
 
+            # Process results
+            cv_results = {
+                'n_splits': cv_strategy.n_splits,
+                'test_scores': {},
+                'train_scores': {},
+                'fold_results': []
             }
 
+            # Calculate statistics for each metric
+            for metric_name in scoring_metrics.keys():
+                test_key = f'test_{metric_name}'
+                train_key = f'train_{metric_name}'
+
+                if test_key in cv_scores:
+                    test_scores = cv_scores[test_key]
+                    cv_results['test_scores'][metric_name] = {
+                        'mean': float(np.mean(test_scores)),
+                        'std': float(np.std(test_scores)),
+                        'min': float(np.min(test_scores)),
+                        'max': float(np.max(test_scores)),
+                        'scores': test_scores.tolist()
                     }
 
+                if train_key in cv_scores:
+                    train_scores = cv_scores[train_key]
+                    cv_results['train_scores'][metric_name] = {
+                        'mean': float(np.mean(train_scores)),
+                        'std': float(np.std(train_scores)),
+                        'min': float(np.min(train_scores)),
+                        'max': float(np.max(train_scores)),
+                        'scores': train_scores.tolist()
+                    }
+
+            # Store individual fold results
+            for fold_idx in range(cv_strategy.n_splits):
+                fold_result = {
+                    'fold': fold_idx + 1,
+                    'test_scores': {},
+                    'train_scores': {}
                 }
 
+                for metric_name in scoring_metrics.keys():
+                    test_key = f'test_{metric_name}'
+                    train_key = f'train_{metric_name}'
+
+                    if test_key in cv_scores:
+                        fold_result['test_scores'][metric_name] = float(cv_scores[test_key][fold_idx])
+                    if train_key in cv_scores:
+                        fold_result['train_scores'][metric_name] = float(cv_scores[train_key][fold_idx])
 
+                cv_results['fold_results'].append(fold_result)
 
+            # Calculate overfitting indicators
+            if 'accuracy' in cv_results['test_scores'] and 'accuracy' in cv_results['train_scores']:
+                train_mean = cv_results['train_scores']['accuracy']['mean']
+                test_mean = cv_results['test_scores']['accuracy']['mean']
+                cv_results['overfitting_score'] = float(train_mean - test_mean)
 
+            # Calculate stability metrics
+            if 'accuracy' in cv_results['test_scores']:
+                test_std = cv_results['test_scores']['accuracy']['std']
+                test_mean = cv_results['test_scores']['accuracy']['mean']
+                cv_results['stability_score'] = float(1 - (test_std / test_mean)) if test_mean > 0 else 0
 
+            logger.info("Cross-validation completed successfully")
+            # Guard the summary formatting: applying ':.4f' to the 'N/A' fallback would raise.
+            mean_acc = cv_results['test_scores'].get('accuracy', {}).get('mean')
+            mean_f1 = cv_results['test_scores'].get('f1', {}).get('mean')
+            logger.info(f"Mean test accuracy: {mean_acc:.4f}" if mean_acc is not None else "Mean test accuracy: N/A")
+            logger.info(f"Mean test F1: {mean_f1:.4f}" if mean_f1 is not None else "Mean test F1: N/A")
 
+            return cv_results
 
+        except Exception as e:
+            logger.error(f"Cross-validation failed: {e}")
+            return {
+                'error': str(e),
+                'n_splits': cv_strategy.n_splits if cv_strategy else self.cv_folds,
+                'fallback': True
+            }
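A minimal end-to-end sketch of the manager with a toy text pipeline; the data are illustrative, and sklearn's Pipeline import is an assumption of the sketch:

from sklearn.pipeline import Pipeline

demo_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=500)),
    ('clf', LogisticRegression(max_iter=1000)),
])
texts = ["breaking news shocks nation"] * 10 + ["quarterly report released today"] * 10
labels = np.array([1] * 10 + [0] * 10)
manager = CrossValidationManager(cv_folds=5)
report = manager.perform_cross_validation(demo_pipeline, texts, labels)
print(report['test_scores']['f1']['mean'])
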
+
+
+def preprocess_text_function(texts):
+    """Standalone function for text preprocessing - pickle-safe."""
+    def clean_single_text(text):
+        # Convert to string
+        text = str(text)
+
+        # Remove URLs
+        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
+
+        # Remove email addresses
+        text = re.sub(r'\S+@\S+', '', text)
+
+        # Collapse excessive punctuation
+        text = re.sub(r'[!]{2,}', '!', text)
+        text = re.sub(r'[?]{2,}', '?', text)
+        text = re.sub(r'[.]{3,}', '...', text)
+
+        # Remove non-alphabetic characters except spaces and basic punctuation
+        text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
+
+        # Collapse excessive whitespace
+        text = re.sub(r'\s+', ' ', text)
+
+        return text.strip().lower()
+
+    # Process all texts
+    return [clean_single_text(text) for text in texts]
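A quick illustration of the cleaning behaviour (the input string is made up):

sample = ["BREAKING!!! Read more at http://example.com or mail tips@example.com ..."]
print(preprocess_text_function(sample))
# ['breaking! read more at or mail ...']
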
 
 
 def main():
+    """Main execution function with enhanced ensemble support."""
     import argparse
 
     # Parse command line arguments
+    parser = argparse.ArgumentParser(description='Train fake news detection model with LightGBM ensemble')
     parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
     parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
     parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
     parser.add_argument('--enhanced_features', action='store_true', help='Force use of enhanced features')
     parser.add_argument('--standard_features', action='store_true', help='Force use of standard TF-IDF features only')
+    parser.add_argument('--ensemble', action='store_true', help='Force use of LightGBM ensemble')
+    parser.add_argument('--no_ensemble', action='store_true', help='Disable ensemble training')
     args = parser.parse_args()
 
     # Determine feature engineering mode
 
 [... unchanged lines omitted in the diff view ...]
         use_enhanced = False
         logger.info("Standard features explicitly requested")
 
+    # Determine ensemble mode
+    use_ensemble = None
+    if args.ensemble and args.no_ensemble:
+        logger.warning("Both --ensemble and --no_ensemble specified. Using auto-detection.")
+    elif args.ensemble:
+        use_ensemble = True
+        logger.info("LightGBM ensemble explicitly requested")
+    elif args.no_ensemble:
+        use_ensemble = False
+        logger.info("Ensemble training explicitly disabled")
+
+    trainer = EnhancedModelTrainer(use_enhanced_features=use_enhanced, use_ensemble=use_ensemble)
 
     # Apply CV folds from command line
     if args.cv_folds:
 
 [... unchanged lines omitted in the diff view ...]
             if 'enhanced_features' in config and use_enhanced is None:
                 trainer.use_enhanced_features = config['enhanced_features'] and ENHANCED_FEATURES_AVAILABLE
 
+            # Ensemble configuration
+            if 'use_ensemble' in config and use_ensemble is None:
+                trainer.use_ensemble = config['use_ensemble'] and LIGHTGBM_AVAILABLE
+
+            # LightGBM-specific parameters
+            if 'lightgbm_params' in config:
+                trainer.lgb_params.update(config['lightgbm_params'])
+
             # Filter models if specified
             selected_models = config.get('selected_models')
             if selected_models and len(selected_models) < len(trainer.models):
 [... unchanged lines omitted in the diff view ...]
             logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds")
             if trainer.use_enhanced_features:
                 logger.info("Enhanced features enabled via configuration")
+            if trainer.use_ensemble:
+                logger.info("LightGBM ensemble enabled via configuration")
 
     except Exception as e:
         logger.warning(f"Failed to load configuration: {e}, using defaults")
 
1473
+ # Log final configuration
1474
+ logger.info("Final Training Configuration:")
1475
+ logger.info(f" Enhanced Features: {trainer.use_enhanced_features} (Available: {ENHANCED_FEATURES_AVAILABLE})")
1476
+ logger.info(f" LightGBM Ensemble: {trainer.use_ensemble} (Available: {LIGHTGBM_AVAILABLE})")
1477
+ logger.info(f" Models to train: {list(trainer.models.keys())}")
1478
+ logger.info(f" Cross-validation folds: {trainer.cv_folds}")
1479
+
1480
  success, message = trainer.train_model(data_path=args.data_path)
1481
 
1482
  if success:
 
 [... unchanged lines omitted in the diff view ...]
                 print(f"  {feature_type}: {count}")
         except Exception as e:
             logger.warning(f"Could not display feature summary: {e}")
+
+        # Print ensemble summary
+        if trainer.use_ensemble and LIGHTGBM_AVAILABLE:
+            try:
+                ensemble_metadata_path = Path("/tmp/ensemble_metadata.json")
+                if ensemble_metadata_path.exists():
+                    with open(ensemble_metadata_path, 'r') as f:
+                        ensemble_metadata = json.load(f)
+
+                    print("\n🎯 Ensemble Model Summary:")
+                    print(f"Ensemble type: {ensemble_metadata.get('ensemble_type', 'unknown')}")
+                    print(f"Component models: {', '.join(ensemble_metadata.get('component_models', []))}")
+                else:
+                    print("\n🎯 Individual Model Selected (Ensemble not used)")
+            except Exception as e:
+                logger.warning(f"Could not display ensemble summary: {e}")
+
     else:
         print(f"❌ {message}")
         exit(1)
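For completeness, typical invocations of the script might look like the following (paths are illustrative):

# python model/train.py --data_path data/train.csv --enhanced_features --ensemble
# python model/train.py --config_path training_config.json --cv_folds 3 --no_ensemble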