Ahmedik95316 committed on
Commit ead9c37 · 1 Parent(s): 9a1ffc0

Update model/train.py


Cross Validation Implementation

Files changed (1)
  1. model/train.py +355 -116
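
For context, the cross-validation settings introduced in this commit are exposed both on the command line (the new --cv_folds argument) and through the optional JSON config read in main(). A minimal invocation sketch follows; the file paths are illustrative examples, not part of this commit:

    # 5-fold CV on a custom dataset (example path)
    python model/train.py --data_path data/training_data.csv --cv_folds 5

    # equivalent settings via a config file (example path); recognised keys include
    # test_size, cv_folds, max_features and ngram_range
    python model/train.py --config_path config/training_config.json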
model/train.py CHANGED
@@ -1,3 +1,6 @@
1
  import seaborn as sns
2
  import matplotlib.pyplot as plt
3
  from sklearn.feature_selection import SelectKBest, chi2
@@ -10,7 +13,7 @@ from sklearn.metrics import (
10
  )
11
  from sklearn.model_selection import (
12
  train_test_split, cross_val_score, GridSearchCV,
13
- StratifiedKFold, validation_curve
14
  )
15
  from sklearn.ensemble import RandomForestClassifier
16
  from sklearn.linear_model import LogisticRegression
@@ -26,7 +29,7 @@ import sys
26
  import os
27
  import time
28
  from datetime import datetime, timedelta
29
- from typing import Dict, Tuple, Optional, Any
30
  import warnings
31
  import re
32
  warnings.filterwarnings('ignore')
@@ -143,7 +146,7 @@ class ProgressTracker:
143
  print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
144
 
145
 
146
- def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 3) -> Dict:
147
  """Estimate training time based on dataset characteristics"""
148
 
149
  # Base time estimates (in seconds) based on empirical testing
@@ -173,12 +176,15 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
173
  estimates['vectorization'] = base_times['vectorization']
174
  estimates['feature_selection'] = base_times['feature_selection']
175
 
176
- # Model training
177
  for model_name, multiplier in tuning_multipliers.items():
178
  model_time = base_times['simple_training'] * multiplier * cv_multiplier
179
  estimates[f'{model_name}_training'] = model_time
180
  estimates[f'{model_name}_evaluation'] = base_times['evaluation']
181
 
182
  # Model saving
183
  estimates['model_saving'] = 1.0
184
 
@@ -198,14 +204,189 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
198
  }
199
 
200
 
201
  class RobustModelTrainer:
202
- """Production-ready model trainer with comprehensive evaluation and validation"""
203
 
204
  def __init__(self):
205
  self.setup_paths()
206
  self.setup_training_config()
207
  self.setup_models()
208
  self.progress_tracker = None
 
209
 
210
  def setup_paths(self):
211
  """Setup all necessary paths with proper permissions"""
@@ -232,11 +413,11 @@ class RobustModelTrainer:
232
  self.evaluation_path = self.results_dir / "evaluation_results.json"
233
 
234
  def setup_training_config(self):
235
- """Setup training configuration"""
236
  self.test_size = 0.2
237
  self.validation_size = 0.1
238
  self.random_state = 42
239
- self.cv_folds = 3
240
  self.max_features = 5000 # Reduced for speed
241
  self.min_df = 1 # More lenient for small datasets
242
  self.max_df = 0.95
@@ -312,13 +493,13 @@ class RobustModelTrainer:
312
  if len(unique_labels) < 2:
313
  return False, None, f"Need at least 2 classes, found: {unique_labels}"
314
 
315
- # Check minimum sample size - more lenient
316
- if len(df) < 6:
317
- return False, None, f"Insufficient samples for training: {len(df)} (minimum: 6)"
318
-
319
- # Warning for small datasets
320
- if len(df) < 50:
321
- logger.warning(f"Small dataset detected: {len(df)} samples. Results may be unreliable.")
322
 
323
  # Check class balance
324
  label_counts = df['label'].value_counts()
@@ -378,7 +559,7 @@ class RobustModelTrainer:
378
  return pipeline
379
 
380
  def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
381
- """Comprehensive model evaluation with multiple metrics"""
382
 
383
  if self.progress_tracker:
384
  self.progress_tracker.update("Evaluating model")
@@ -400,40 +581,22 @@ class RobustModelTrainer:
400
  cm = confusion_matrix(y_test, y_pred)
401
  metrics['confusion_matrix'] = cm.tolist()
402
 
403
- # Smart cross-validation based on dataset size
404
- if X_train is not None and y_train is not None and len(X_train) >= 20:
405
- try:
406
- # Calculate appropriate CV folds for small datasets
407
- n_samples = len(X_train)
408
- min_samples_per_fold = 3 # Minimum samples per fold
409
- max_folds = n_samples // min_samples_per_fold
410
- cv_folds = max(2, min(self.cv_folds, max_folds))
411
-
412
- if cv_folds >= 2:
413
- cv_scores = cross_val_score(
414
- model, X_train, y_train,
415
- cv=StratifiedKFold(
416
- n_splits=cv_folds,
417
- shuffle=True,
418
- random_state=self.random_state
419
- ),
420
- scoring='f1_weighted',
421
- n_jobs=1 # Single job for small datasets
422
- )
423
- metrics['cv_scores'] = {
424
- 'mean': float(cv_scores.mean()),
425
- 'std': float(cv_scores.std()),
426
- 'scores': cv_scores.tolist(),
427
- 'folds_used': cv_folds
428
- }
429
- else:
430
- metrics['cv_scores'] = {'note': 'Dataset too small for reliable CV'}
431
- except Exception as e:
432
- logger.warning(f"Cross-validation failed: {e}")
433
- metrics['cv_scores'] = {'note': f'CV failed: {str(e)}'}
434
- else:
435
- metrics['cv_scores'] = {'note': 'Skipped for very small dataset'}
436
-
437
  # Training accuracy for overfitting detection
438
  try:
439
  if X_train is not None and y_train is not None:
@@ -447,11 +610,11 @@ class RobustModelTrainer:
447
 
448
  return metrics
449
 
450
- def hyperparameter_tuning(self, pipeline, X_train, y_train, model_name: str) -> Tuple[Any, Dict]:
451
- """Perform hyperparameter tuning with cross-validation"""
452
 
453
  if self.progress_tracker:
454
- self.progress_tracker.update(f"Tuning {model_name}")
455
 
456
  try:
457
  # Set the model in the pipeline
@@ -461,63 +624,68 @@ class RobustModelTrainer:
461
  if len(X_train) < 20:
462
  logger.info(f"Skipping hyperparameter tuning for {model_name} due to small dataset")
463
  pipeline.fit(X_train, y_train)
464
  return pipeline, {
465
  'best_params': 'default_parameters',
466
- 'best_score': 'not_calculated',
467
  'best_estimator': pipeline,
 
468
  'note': 'Hyperparameter tuning skipped for small dataset'
469
  }
470
 
471
  # Get parameter grid
472
  param_grid = self.models[model_name]['param_grid']
473
 
474
- # Calculate appropriate CV folds for small datasets
475
- n_samples = len(X_train)
476
- min_samples_per_fold = 3
477
- max_folds = n_samples // min_samples_per_fold
478
- cv_folds = max(2, min(self.cv_folds, max_folds))
479
 
480
- if cv_folds < 2:
481
- # Fallback to simple training
482
- logger.info(f"Dataset too small for CV, using simple training for {model_name}")
483
- pipeline.fit(X_train, y_train)
484
- return pipeline, {
485
- 'best_params': 'default_parameters',
486
- 'best_score': 'not_calculated',
487
- 'best_estimator': pipeline,
488
- 'note': 'Simple training used due to very small dataset'
489
- }
490
-
491
- # Create GridSearchCV
492
  grid_search = GridSearchCV(
493
  pipeline,
494
  param_grid,
495
- cv=StratifiedKFold(n_splits=cv_folds,
496
- shuffle=True, random_state=self.random_state),
497
  scoring='f1_weighted',
498
- n_jobs=1, # Single job for small datasets
499
- verbose=0 # Reduce verbosity for speed
 
500
  )
501
 
502
  # Fit grid search
 
503
  grid_search.fit(X_train, y_train)
504
 
505
  # Extract results
506
  tuning_results = {
507
  'best_params': grid_search.best_params_,
508
  'best_score': float(grid_search.best_score_),
509
  'best_estimator': grid_search.best_estimator_,
510
- 'cv_folds_used': cv_folds,
511
- 'cv_results': {
 
512
  'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(),
513
  'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(),
 
514
  'params': grid_search.cv_results_['params']
515
  }
516
  }
517
 
518
  logger.info(f"Hyperparameter tuning completed for {model_name}")
519
- logger.info(f"Best score: {grid_search.best_score_:.4f}")
520
  logger.info(f"Best params: {grid_search.best_params_}")
521
 
522
  return grid_search.best_estimator_, tuning_results
523
 
@@ -527,29 +695,37 @@ class RobustModelTrainer:
527
  try:
528
  pipeline.set_params(model=self.models[model_name]['model'])
529
  pipeline.fit(X_train, y_train)
530
- return pipeline, {'error': str(e), 'fallback': 'simple_training'}
531
  except Exception as e2:
532
  logger.error(f"Fallback training also failed for {model_name}: {str(e2)}")
533
  raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
534
 
535
  def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
536
- """Train and evaluate multiple models"""
537
 
538
  results = {}
539
 
540
  for model_name in self.models.keys():
541
- logger.info(f"Training {model_name}...")
542
 
543
  try:
544
  # Create pipeline
545
  pipeline = self.create_preprocessing_pipeline()
546
 
547
- # Hyperparameter tuning
548
- best_model, tuning_results = self.hyperparameter_tuning(
549
  pipeline, X_train, y_train, model_name
550
  )
551
 
552
- # Comprehensive evaluation
553
  evaluation_metrics = self.comprehensive_evaluation(
554
  best_model, X_test, y_test, X_train, y_train
555
  )
@@ -562,8 +738,15 @@ class RobustModelTrainer:
562
  'training_time': datetime.now().isoformat()
563
  }
564
 
565
- logger.info(f"Model {model_name} - F1: {evaluation_metrics['f1']:.4f}, "
566
- f"Accuracy: {evaluation_metrics['accuracy']:.4f}")
567
 
568
  except Exception as e:
569
  logger.error(f"Training failed for {model_name}: {str(e)}")
@@ -572,7 +755,7 @@ class RobustModelTrainer:
572
  return results
573
 
574
  def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
575
- """Select the best performing model"""
576
 
577
  if self.progress_tracker:
578
  self.progress_tracker.update("Selecting best model")
@@ -586,8 +769,14 @@ class RobustModelTrainer:
586
  if 'error' in result:
587
  continue
588
 
589
- # Use F1 score as primary metric
590
- f1_score = result['evaluation_metrics']['f1']
591
 
592
  if f1_score > best_score:
593
  best_score = f1_score
@@ -598,12 +787,11 @@ class RobustModelTrainer:
598
  if best_model_name is None:
599
  raise ValueError("No models trained successfully")
600
 
601
- logger.info(
602
- f"Best model: {best_model_name} with F1 score: {best_score:.4f}")
603
  return best_model_name, best_model, best_metrics
604
 
605
- def save_model_artifacts(self, model, model_name: str, metrics: Dict) -> bool:
606
- """Save model artifacts and metadata with robust error handling"""
607
  try:
608
  if self.progress_tracker:
609
  self.progress_tracker.update("Saving model")
@@ -637,7 +825,10 @@ class RobustModelTrainer:
637
  # Generate data hash
638
  data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
639
 
640
- # Create metadata
641
  metadata = {
642
  'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
643
  'model_type': model_name,
@@ -648,8 +839,6 @@ class RobustModelTrainer:
648
  'test_recall': metrics['recall'],
649
  'test_roc_auc': metrics['roc_auc'],
650
  'overfitting_score': metrics.get('overfitting_score', 'Unknown'),
651
- 'cv_score_mean': metrics.get('cv_scores', {}).get('mean', 'Unknown'),
652
- 'cv_score_std': metrics.get('cv_scores', {}).get('std', 'Unknown'),
653
  'timestamp': datetime.now().isoformat(),
654
  'training_config': {
655
  'test_size': self.test_size,
@@ -659,16 +848,51 @@ class RobustModelTrainer:
659
  'feature_selection_k': self.feature_selection_k
660
  }
661
  }
662
 
663
  # Save metadata with error handling
664
  try:
665
  with open(self.metadata_path, 'w') as f:
666
  json.dump(metadata, f, indent=2)
667
- logger.info(f"✅ Saved metadata to {self.metadata_path}")
668
  except Exception as e:
669
  logger.warning(f"Could not save metadata: {e}")
670
 
671
- logger.info(f"✅ Model artifacts saved successfully")
672
  return True
673
 
674
  except Exception as e:
@@ -683,9 +907,9 @@ class RobustModelTrainer:
683
  return False
684
 
685
  def train_model(self, data_path: str = None) -> Tuple[bool, str]:
686
- """Main training function with comprehensive pipeline"""
687
  try:
688
- logger.info("Starting model training pipeline...")
689
 
690
  # Override data path if provided
691
  if data_path:
@@ -703,16 +927,17 @@ class RobustModelTrainer:
703
  cv_folds=self.cv_folds
704
  )
705
 
706
- print(f"\n📊 Training Configuration:")
707
  print(f"Dataset size: {len(df)} samples")
 
708
  print(f"Estimated time: {time_estimate['total_formatted']}")
709
  print(f"Models to train: {len(self.models)}")
710
- print(f"Cross-validation folds: {self.cv_folds}")
711
  print()
712
 
713
- # Setup progress tracker
714
- total_steps = 4 + (len(self.models) * 2) + 1 # Load, split, 2*models, select, save
715
- self.progress_tracker = ProgressTracker(total_steps, "Training Progress")
716
 
717
  # Prepare data
718
  X = df['text'].values
@@ -743,28 +968,35 @@ class RobustModelTrainer:
743
 
744
  # Additional validation for very small datasets
745
  if len(X_train) < 3:
746
- logger.warning(f"Very small training set: {len(X_train)} samples. Results may be unreliable.")
747
  if len(X_test) < 1:
748
  return False, "Cannot create test set. Dataset too small."
749
 
750
- # Train and evaluate models
751
- results = self.train_and_evaluate_models(
752
- X_train, X_test, y_train, y_test)
753
 
754
  # Select best model
755
  best_model_name, best_model, best_metrics = self.select_best_model(results)
756
 
757
- # Save model artifacts
758
- if not self.save_model_artifacts(best_model, best_model_name, best_metrics):
759
  return False, "Failed to save model artifacts"
760
 
761
  # Finish progress tracking
762
  self.progress_tracker.finish()
763
 
 
764
  success_message = (
765
- f"Model training completed successfully. "
766
  f"Best model: {best_model_name} "
767
- f"(F1: {best_metrics['f1']:.4f}, Accuracy: {best_metrics['accuracy']:.4f})"
768
  )
769
 
770
  logger.info(success_message)
@@ -773,23 +1005,29 @@ class RobustModelTrainer:
773
  except Exception as e:
774
  if self.progress_tracker:
775
  print() # New line after progress bar
776
- error_message = f"Model training failed: {str(e)}"
777
  logger.error(error_message)
778
  return False, error_message
779
 
780
 
781
  def main():
782
- """Main execution function"""
783
  import argparse
784
 
785
  # Parse command line arguments
786
- parser = argparse.ArgumentParser(description='Train fake news detection model')
787
  parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
788
  parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
 
789
  args = parser.parse_args()
790
 
791
  trainer = RobustModelTrainer()
792
 
793
  # Load custom configuration if provided
794
  if args.config_path and Path(args.config_path).exists():
795
  try:
@@ -799,6 +1037,7 @@ def main():
799
  # Apply configuration
800
  trainer.test_size = config.get('test_size', trainer.test_size)
801
  trainer.cv_folds = config.get('cv_folds', trainer.cv_folds)
 
802
  trainer.max_features = config.get('max_features', trainer.max_features)
803
  trainer.ngram_range = tuple(config.get('ngram_range', trainer.ngram_range))
804
 
@@ -811,7 +1050,7 @@ def main():
811
  # Update feature selection based on max_features
812
  trainer.feature_selection_k = min(trainer.feature_selection_k, trainer.max_features)
813
 
814
- logger.info(f"Applied custom configuration: {config}")
815
 
816
  except Exception as e:
817
  logger.warning(f"Failed to load configuration: {e}, using defaults")
 
1
+ # File: model/train.py (MODIFIED)
2
+ # Enhanced version with comprehensive cross-validation implementation
3
+
4
  import seaborn as sns
5
  import matplotlib.pyplot as plt
6
  from sklearn.feature_selection import SelectKBest, chi2
 
13
  )
14
  from sklearn.model_selection import (
15
  train_test_split, cross_val_score, GridSearchCV,
16
+ StratifiedKFold, validation_curve, cross_validate
17
  )
18
  from sklearn.ensemble import RandomForestClassifier
19
  from sklearn.linear_model import LogisticRegression
 
29
  import os
30
  import time
31
  from datetime import datetime, timedelta
32
+ from typing import Dict, Tuple, Optional, Any, List
33
  import warnings
34
  import re
35
  warnings.filterwarnings('ignore')
 
146
  print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
147
 
148
 
149
+ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5) -> Dict:
150
  """Estimate training time based on dataset characteristics"""
151
 
152
  # Base time estimates (in seconds) based on empirical testing
 
176
  estimates['vectorization'] = base_times['vectorization']
177
  estimates['feature_selection'] = base_times['feature_selection']
178
 
179
+ # Model training (now includes CV)
180
  for model_name, multiplier in tuning_multipliers.items():
181
  model_time = base_times['simple_training'] * multiplier * cv_multiplier
182
  estimates[f'{model_name}_training'] = model_time
183
  estimates[f'{model_name}_evaluation'] = base_times['evaluation']
184
 
185
+ # Cross-validation overhead
186
+ estimates['cross_validation'] = base_times['simple_training'] * cv_folds * 0.5
187
+
188
  # Model saving
189
  estimates['model_saving'] = 1.0
190
 
 
204
  }
205
 
206
 
207
+ class CrossValidationManager:
208
+ """Advanced cross-validation management with comprehensive metrics"""
209
+
210
+ def __init__(self, cv_folds: int = 5, random_state: int = 42):
211
+ self.cv_folds = cv_folds
212
+ self.random_state = random_state
213
+ self.cv_results = {}
214
+
215
+ def create_cv_strategy(self, X, y) -> StratifiedKFold:
216
+ """Create appropriate CV strategy based on data characteristics"""
217
+ # Calculate appropriate CV folds for small datasets
218
+ n_samples = len(X)
219
+ min_samples_per_fold = 3 # Minimum samples per fold
220
+ max_folds = n_samples // min_samples_per_fold
221
+
222
+ # Adjust folds based on data size and class distribution
223
+ unique_classes = np.unique(y)
224
+ min_class_count = min([np.sum(y == cls) for cls in unique_classes])
225
+
226
+ # Ensure each fold has at least one sample from each class
227
+ max_folds_by_class = min_class_count
228
+
229
+ actual_folds = max(2, min(self.cv_folds, max_folds, max_folds_by_class))
230
+
231
+ logger.info(f"Using {actual_folds} CV folds (requested: {self.cv_folds})")
232
+
233
+ return StratifiedKFold(
234
+ n_splits=actual_folds,
235
+ shuffle=True,
236
+ random_state=self.random_state
237
+ )
238
+
239
+ def perform_cross_validation(self, pipeline, X, y, cv_strategy=None) -> Dict:
240
+ """Perform comprehensive cross-validation with multiple metrics"""
241
+
242
+ if cv_strategy is None:
243
+ cv_strategy = self.create_cv_strategy(X, y)
244
+
245
+ logger.info(f"Starting cross-validation with {cv_strategy.n_splits} folds...")
246
+
247
+ # Define scoring metrics
248
+ scoring_metrics = {
249
+ 'accuracy': 'accuracy',
250
+ 'precision': 'precision_weighted',
251
+ 'recall': 'recall_weighted',
252
+ 'f1': 'f1_weighted',
253
+ 'roc_auc': 'roc_auc'
254
+ }
255
+
256
+ try:
257
+ # Perform cross-validation
258
+ cv_scores = cross_validate(
259
+ pipeline, X, y,
260
+ cv=cv_strategy,
261
+ scoring=scoring_metrics,
262
+ return_train_score=True,
263
+ n_jobs=1, # Use single job for stability
264
+ verbose=0
265
+ )
266
+
267
+ # Process results
268
+ cv_results = {
269
+ 'n_splits': cv_strategy.n_splits,
270
+ 'test_scores': {},
271
+ 'train_scores': {},
272
+ 'fold_results': []
273
+ }
274
+
275
+ # Calculate statistics for each metric
276
+ for metric_name in scoring_metrics.keys():
277
+ test_key = f'test_{metric_name}'
278
+ train_key = f'train_{metric_name}'
279
+
280
+ if test_key in cv_scores:
281
+ test_scores = cv_scores[test_key]
282
+ cv_results['test_scores'][metric_name] = {
283
+ 'mean': float(np.mean(test_scores)),
284
+ 'std': float(np.std(test_scores)),
285
+ 'min': float(np.min(test_scores)),
286
+ 'max': float(np.max(test_scores)),
287
+ 'scores': test_scores.tolist()
288
+ }
289
+
290
+ if train_key in cv_scores:
291
+ train_scores = cv_scores[train_key]
292
+ cv_results['train_scores'][metric_name] = {
293
+ 'mean': float(np.mean(train_scores)),
294
+ 'std': float(np.std(train_scores)),
295
+ 'min': float(np.min(train_scores)),
296
+ 'max': float(np.max(train_scores)),
297
+ 'scores': train_scores.tolist()
298
+ }
299
+
300
+ # Store individual fold results
301
+ for fold_idx in range(cv_strategy.n_splits):
302
+ fold_result = {
303
+ 'fold': fold_idx + 1,
304
+ 'test_scores': {},
305
+ 'train_scores': {}
306
+ }
307
+
308
+ for metric_name in scoring_metrics.keys():
309
+ test_key = f'test_{metric_name}'
310
+ train_key = f'train_{metric_name}'
311
+
312
+ if test_key in cv_scores:
313
+ fold_result['test_scores'][metric_name] = float(cv_scores[test_key][fold_idx])
314
+ if train_key in cv_scores:
315
+ fold_result['train_scores'][metric_name] = float(cv_scores[train_key][fold_idx])
316
+
317
+ cv_results['fold_results'].append(fold_result)
318
+
319
+ # Calculate overfitting indicators
320
+ if 'accuracy' in cv_results['test_scores'] and 'accuracy' in cv_results['train_scores']:
321
+ train_mean = cv_results['train_scores']['accuracy']['mean']
322
+ test_mean = cv_results['test_scores']['accuracy']['mean']
323
+ cv_results['overfitting_score'] = float(train_mean - test_mean)
324
+
325
+ # Calculate stability metrics
326
+ if 'accuracy' in cv_results['test_scores']:
327
+ test_std = cv_results['test_scores']['accuracy']['std']
328
+ test_mean = cv_results['test_scores']['accuracy']['mean']
329
+ cv_results['stability_score'] = float(1 - (test_std / test_mean)) if test_mean > 0 else 0
330
+
331
+ logger.info(f"Cross-validation completed successfully")
332
+ logger.info(f"Mean test accuracy: {cv_results['test_scores'].get('accuracy', {}).get('mean', 'N/A'):.4f}")
333
+ logger.info(f"Mean test F1: {cv_results['test_scores'].get('f1', {}).get('mean', 'N/A'):.4f}")
334
+
335
+ return cv_results
336
+
337
+ except Exception as e:
338
+ logger.error(f"Cross-validation failed: {e}")
339
+ return {
340
+ 'error': str(e),
341
+ 'n_splits': cv_strategy.n_splits if cv_strategy else self.cv_folds,
342
+ 'fallback': True
343
+ }
344
+
345
+ def compare_cv_results(self, results1: Dict, results2: Dict, metric: str = 'f1') -> Dict:
346
+ """Compare cross-validation results between two models"""
347
+
348
+ try:
349
+ if 'error' in results1 or 'error' in results2:
350
+ return {'error': 'Cannot compare results with errors'}
351
+
352
+ scores1 = results1['test_scores'][metric]['scores']
353
+ scores2 = results2['test_scores'][metric]['scores']
354
+
355
+ # Paired t-test
356
+ from scipy import stats
357
+ t_stat, p_value = stats.ttest_rel(scores1, scores2)
358
+
359
+ comparison = {
360
+ 'metric': metric,
361
+ 'model1_mean': results1['test_scores'][metric]['mean'],
362
+ 'model2_mean': results2['test_scores'][metric]['mean'],
363
+ 'model1_std': results1['test_scores'][metric]['std'],
364
+ 'model2_std': results2['test_scores'][metric]['std'],
365
+ 'difference': results2['test_scores'][metric]['mean'] - results1['test_scores'][metric]['mean'],
366
+ 'paired_ttest': {
367
+ 't_statistic': float(t_stat),
368
+ 'p_value': float(p_value),
369
+ 'significant': bool(p_value < 0.05)  # cast from numpy bool so the result stays JSON-serializable
370
+ },
371
+ 'effect_size': float(abs(t_stat) / np.sqrt(len(scores1))) if len(scores1) > 0 else 0
372
+ }
373
+
374
+ return comparison
375
+
376
+ except Exception as e:
377
+ logger.error(f"CV comparison failed: {e}")
378
+ return {'error': str(e)}
379
+
380
+
381
  class RobustModelTrainer:
382
+ """Production-ready model trainer with comprehensive cross-validation"""
383
 
384
  def __init__(self):
385
  self.setup_paths()
386
  self.setup_training_config()
387
  self.setup_models()
388
  self.progress_tracker = None
389
+ self.cv_manager = CrossValidationManager()
390
 
391
  def setup_paths(self):
392
  """Setup all necessary paths with proper permissions"""
 
413
  self.evaluation_path = self.results_dir / "evaluation_results.json"
414
 
415
  def setup_training_config(self):
416
+ """Setup training configuration with CV parameters"""
417
  self.test_size = 0.2
418
  self.validation_size = 0.1
419
  self.random_state = 42
420
+ self.cv_folds = 5 # Primary CV folds
421
  self.max_features = 5000 # Reduced for speed
422
  self.min_df = 1 # More lenient for small datasets
423
  self.max_df = 0.95
 
493
  if len(unique_labels) < 2:
494
  return False, None, f"Need at least 2 classes, found: {unique_labels}"
495
 
496
+ # Check minimum sample size for CV
497
+ min_samples_for_cv = self.cv_folds * 2 # At least 2 samples per fold
498
+ if len(df) < min_samples_for_cv:
499
+ logger.warning(f"Dataset size ({len(df)}) is small for {self.cv_folds}-fold CV")
500
+ # Adjust CV folds for small datasets
501
+ self.cv_manager.cv_folds = max(2, len(df) // 3)
502
+ logger.info(f"Adjusted CV folds to {self.cv_manager.cv_folds}")
503
 
504
  # Check class balance
505
  label_counts = df['label'].value_counts()
 
559
  return pipeline
560
 
561
  def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
562
+ """Comprehensive model evaluation with cross-validation integration"""
563
 
564
  if self.progress_tracker:
565
  self.progress_tracker.update("Evaluating model")
 
581
  cm = confusion_matrix(y_test, y_pred)
582
  metrics['confusion_matrix'] = cm.tolist()
583
 
584
+ # Cross-validation on full dataset
585
+ if X_train is not None and y_train is not None:
586
+ # Combine train and test for full dataset CV
587
+ X_full = np.concatenate([X_train, X_test])
588
+ y_full = np.concatenate([y_train, y_test])
589
+
590
+ logger.info("Performing cross-validation on full dataset...")
591
+ cv_results = self.cv_manager.perform_cross_validation(model, X_full, y_full)
592
+ metrics['cross_validation'] = cv_results
593
+
594
+ # Log CV results
595
+ if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
596
+ cv_f1_mean = cv_results['test_scores']['f1']['mean']
597
+ cv_f1_std = cv_results['test_scores']['f1']['std']
598
+ logger.info(f"CV F1 Score: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})")
599
+
600
  # Training accuracy for overfitting detection
601
  try:
602
  if X_train is not None and y_train is not None:
 
610
 
611
  return metrics
612
 
613
+ def hyperparameter_tuning_with_cv(self, pipeline, X_train, y_train, model_name: str) -> Tuple[Any, Dict]:
614
+ """Perform hyperparameter tuning with nested cross-validation"""
615
 
616
  if self.progress_tracker:
617
+ self.progress_tracker.update(f"Tuning {model_name} with CV")
618
 
619
  try:
620
  # Set the model in the pipeline
 
624
  if len(X_train) < 20:
625
  logger.info(f"Skipping hyperparameter tuning for {model_name} due to small dataset")
626
  pipeline.fit(X_train, y_train)
627
+
628
+ # Still perform CV evaluation
629
+ cv_results = self.cv_manager.perform_cross_validation(pipeline, X_train, y_train)
630
+
631
  return pipeline, {
632
  'best_params': 'default_parameters',
633
+ 'best_score': cv_results.get('test_scores', {}).get('f1', {}).get('mean', 'not_calculated'),
634
  'best_estimator': pipeline,
635
+ 'cross_validation': cv_results,
636
  'note': 'Hyperparameter tuning skipped for small dataset'
637
  }
638
 
639
  # Get parameter grid
640
  param_grid = self.models[model_name]['param_grid']
641
 
642
+ # Create CV strategy
643
+ cv_strategy = self.cv_manager.create_cv_strategy(X_train, y_train)
644
 
645
+ # Create GridSearchCV with nested cross-validation
646
  grid_search = GridSearchCV(
647
  pipeline,
648
  param_grid,
649
+ cv=cv_strategy,
 
650
  scoring='f1_weighted',
651
+ n_jobs=1, # Single job for stability
652
+ verbose=0, # Reduce verbosity for speed
653
+ return_train_score=True # For overfitting analysis
654
  )
655
 
656
  # Fit grid search
657
+ logger.info(f"Starting hyperparameter tuning for {model_name}...")
658
  grid_search.fit(X_train, y_train)
659
 
660
+ # Perform additional CV on best model
661
+ logger.info(f"Performing final CV evaluation for {model_name}...")
662
+ best_cv_results = self.cv_manager.perform_cross_validation(
663
+ grid_search.best_estimator_, X_train, y_train, cv_strategy
664
+ )
665
+
666
  # Extract results
667
  tuning_results = {
668
  'best_params': grid_search.best_params_,
669
  'best_score': float(grid_search.best_score_),
670
  'best_estimator': grid_search.best_estimator_,
671
+ 'cv_folds_used': cv_strategy.n_splits,
672
+ 'cross_validation': best_cv_results,
673
+ 'grid_search_results': {
674
  'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(),
675
  'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(),
676
+ 'mean_train_scores': grid_search.cv_results_['mean_train_score'].tolist() if 'mean_train_score' in grid_search.cv_results_ else [],
677
  'params': grid_search.cv_results_['params']
678
  }
679
  }
680
 
681
  logger.info(f"Hyperparameter tuning completed for {model_name}")
682
+ logger.info(f"Best CV score: {grid_search.best_score_:.4f}")
683
  logger.info(f"Best params: {grid_search.best_params_}")
684
+
685
+ if 'test_scores' in best_cv_results and 'f1' in best_cv_results['test_scores']:
686
+ final_f1 = best_cv_results['test_scores']['f1']['mean']
687
+ final_f1_std = best_cv_results['test_scores']['f1']['std']
688
+ logger.info(f"Final CV F1: {final_f1:.4f} (±{final_f1_std:.4f})")
689
 
690
  return grid_search.best_estimator_, tuning_results
691
 
 
695
  try:
696
  pipeline.set_params(model=self.models[model_name]['model'])
697
  pipeline.fit(X_train, y_train)
698
+
699
+ # Perform basic CV
700
+ cv_results = self.cv_manager.perform_cross_validation(pipeline, X_train, y_train)
701
+
702
+ return pipeline, {
703
+ 'error': str(e),
704
+ 'fallback': 'simple_training',
705
+ 'cross_validation': cv_results
706
+ }
707
  except Exception as e2:
708
  logger.error(f"Fallback training also failed for {model_name}: {str(e2)}")
709
  raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
710
 
711
  def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
712
+ """Train and evaluate multiple models with comprehensive CV"""
713
 
714
  results = {}
715
 
716
  for model_name in self.models.keys():
717
+ logger.info(f"Training {model_name} with cross-validation...")
718
 
719
  try:
720
  # Create pipeline
721
  pipeline = self.create_preprocessing_pipeline()
722
 
723
+ # Hyperparameter tuning with CV
724
+ best_model, tuning_results = self.hyperparameter_tuning_with_cv(
725
  pipeline, X_train, y_train, model_name
726
  )
727
 
728
+ # Comprehensive evaluation (includes additional CV)
729
  evaluation_metrics = self.comprehensive_evaluation(
730
  best_model, X_test, y_test, X_train, y_train
731
  )
 
738
  'training_time': datetime.now().isoformat()
739
  }
740
 
741
+ # Log results
742
+ test_f1 = evaluation_metrics['f1']
743
+ cv_results = evaluation_metrics.get('cross_validation', {})
744
+ cv_f1_mean = cv_results.get('test_scores', {}).get('f1', {}).get('mean', 'N/A')
745
+ cv_f1_std = cv_results.get('test_scores', {}).get('f1', {}).get('std', 'N/A')
746
+
747
+ logger.info(f"Model {model_name} - Test F1: {test_f1:.4f}, "
748
+ f"CV F1: {cv_f1_mean:.4f if cv_f1_mean != 'N/A' else cv_f1_mean} "
749
+ f"(±{cv_f1_std:.4f if cv_f1_std != 'N/A' else cv_f1_std})")
750
 
751
  except Exception as e:
752
  logger.error(f"Training failed for {model_name}: {str(e)}")
 
755
  return results
756
 
757
  def select_best_model(self, results: Dict) -> Tuple[str, Any, Dict]:
758
+ """Select the best performing model based on CV results"""
759
 
760
  if self.progress_tracker:
761
  self.progress_tracker.update("Selecting best model")
 
769
  if 'error' in result:
770
  continue
771
 
772
+ # Prioritize CV F1 score if available, fallback to test F1
773
+ cv_results = result['evaluation_metrics'].get('cross_validation', {})
774
+ if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
775
+ f1_score = cv_results['test_scores']['f1']['mean']
776
+ score_type = "CV F1"
777
+ else:
778
+ f1_score = result['evaluation_metrics']['f1']
779
+ score_type = "Test F1"
780
 
781
  if f1_score > best_score:
782
  best_score = f1_score
 
787
  if best_model_name is None:
788
  raise ValueError("No models trained successfully")
789
 
790
+ logger.info(f"Best model: {best_model_name} with {score_type} score: {best_score:.4f}")
 
791
  return best_model_name, best_model, best_metrics
792
 
793
+ def save_model_artifacts(self, model, model_name: str, metrics: Dict, results: Dict) -> bool:
794
+ """Save model artifacts and enhanced metadata with CV results"""
795
  try:
796
  if self.progress_tracker:
797
  self.progress_tracker.update("Saving model")
 
825
  # Generate data hash
826
  data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
827
 
828
+ # Extract CV results
829
+ cv_results = metrics.get('cross_validation', {})
830
+
831
+ # Create enhanced metadata with CV information
832
  metadata = {
833
  'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
834
  'model_type': model_name,
 
839
  'test_recall': metrics['recall'],
840
  'test_roc_auc': metrics['roc_auc'],
841
  'overfitting_score': metrics.get('overfitting_score', 'Unknown'),
 
 
842
  'timestamp': datetime.now().isoformat(),
843
  'training_config': {
844
  'test_size': self.test_size,
 
848
  'feature_selection_k': self.feature_selection_k
849
  }
850
  }
851
+
852
+ # Add comprehensive CV results to metadata
853
+ if cv_results and 'test_scores' in cv_results:
854
+ metadata['cross_validation'] = {
855
+ 'n_splits': cv_results.get('n_splits', self.cv_folds),
856
+ 'test_scores': cv_results['test_scores'],
857
+ 'train_scores': cv_results.get('train_scores', {}),
858
+ 'overfitting_score': cv_results.get('overfitting_score', 'Unknown'),
859
+ 'stability_score': cv_results.get('stability_score', 'Unknown'),
860
+ 'individual_fold_results': cv_results.get('fold_results', [])
861
+ }
862
+
863
+ # Add summary statistics
864
+ if 'f1' in cv_results['test_scores']:
865
+ metadata['cv_f1_mean'] = cv_results['test_scores']['f1']['mean']
866
+ metadata['cv_f1_std'] = cv_results['test_scores']['f1']['std']
867
+ metadata['cv_f1_min'] = cv_results['test_scores']['f1']['min']
868
+ metadata['cv_f1_max'] = cv_results['test_scores']['f1']['max']
869
+
870
+ if 'accuracy' in cv_results['test_scores']:
871
+ metadata['cv_accuracy_mean'] = cv_results['test_scores']['accuracy']['mean']
872
+ metadata['cv_accuracy_std'] = cv_results['test_scores']['accuracy']['std']
873
+
874
+ # Add model comparison results if available
875
+ if len(results) > 1:
876
+ model_comparison = {}
877
+ for other_model_name, other_result in results.items():
878
+ if other_model_name != model_name and 'error' not in other_result:
879
+ other_cv = other_result['evaluation_metrics'].get('cross_validation', {})
880
+ if cv_results and other_cv:
881
+ comparison = self.cv_manager.compare_cv_results(cv_results, other_cv)
882
+ model_comparison[other_model_name] = comparison
883
+
884
+ if model_comparison:
885
+ metadata['model_comparison'] = model_comparison
886
 
887
  # Save metadata with error handling
888
  try:
889
  with open(self.metadata_path, 'w') as f:
890
  json.dump(metadata, f, indent=2)
891
+ logger.info(f"✅ Saved enhanced metadata to {self.metadata_path}")
892
  except Exception as e:
893
  logger.warning(f"Could not save metadata: {e}")
894
 
895
+ logger.info(f"✅ Model artifacts saved successfully with CV results")
896
  return True
897
 
898
  except Exception as e:
 
907
  return False
908
 
909
  def train_model(self, data_path: str = None) -> Tuple[bool, str]:
910
+ """Main training function with comprehensive CV pipeline"""
911
  try:
912
+ logger.info("Starting enhanced model training with cross-validation...")
913
 
914
  # Override data path if provided
915
  if data_path:
 
927
  cv_folds=self.cv_folds
928
  )
929
 
930
+ print(f"\n📊 Enhanced Training Configuration:")
931
  print(f"Dataset size: {len(df)} samples")
932
+ print(f"Cross-validation folds: {self.cv_folds}")
933
  print(f"Estimated time: {time_estimate['total_formatted']}")
934
  print(f"Models to train: {len(self.models)}")
935
+ print(f"Hyperparameter tuning: Enabled")
936
  print()
937
 
938
+ # Setup progress tracker (increased steps for CV)
939
+ total_steps = 4 + (len(self.models) * 3) + 1 # Load, split, 3*models (tune+cv+eval), select, save
940
+ self.progress_tracker = ProgressTracker(total_steps, "CV Training Progress")
941
 
942
  # Prepare data
943
  X = df['text'].values
 
968
 
969
  # Additional validation for very small datasets
970
  if len(X_train) < 3:
971
+ logger.warning(f"Very small training set: {len(X_train)} samples. CV results may be unreliable.")
972
  if len(X_test) < 1:
973
  return False, "Cannot create test set. Dataset too small."
974
 
975
+ # Train and evaluate models with CV
976
+ results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
 
977
 
978
  # Select best model
979
  best_model_name, best_model, best_metrics = self.select_best_model(results)
980
 
981
+ # Save model artifacts with CV results
982
+ if not self.save_model_artifacts(best_model, best_model_name, best_metrics, results):
983
  return False, "Failed to save model artifacts"
984
 
985
  # Finish progress tracking
986
  self.progress_tracker.finish()
987
 
988
+ # Create success message with CV information
989
+ cv_results = best_metrics.get('cross_validation', {})
990
+ cv_info = ""
991
+ if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
992
+ cv_f1_mean = cv_results['test_scores']['f1']['mean']
993
+ cv_f1_std = cv_results['test_scores']['f1']['std']
994
+ cv_info = f", CV F1: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})"
995
+
996
  success_message = (
997
+ f"Enhanced model training completed successfully. "
998
  f"Best model: {best_model_name} "
999
+ f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info})"
1000
  )
1001
 
1002
  logger.info(success_message)
 
1005
  except Exception as e:
1006
  if self.progress_tracker:
1007
  print() # New line after progress bar
1008
+ error_message = f"Enhanced model training failed: {str(e)}"
1009
  logger.error(error_message)
1010
  return False, error_message
1011
 
1012
 
1013
  def main():
1014
+ """Main execution function with enhanced CV support"""
1015
  import argparse
1016
 
1017
  # Parse command line arguments
1018
+ parser = argparse.ArgumentParser(description='Train fake news detection model with cross-validation')
1019
  parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
1020
  parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
1021
+ parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
1022
  args = parser.parse_args()
1023
 
1024
  trainer = RobustModelTrainer()
1025
 
1026
+ # Apply CV folds from command line
1027
+ if args.cv_folds:
1028
+ trainer.cv_folds = args.cv_folds
1029
+ trainer.cv_manager.cv_folds = args.cv_folds
1030
+
1031
  # Load custom configuration if provided
1032
  if args.config_path and Path(args.config_path).exists():
1033
  try:
 
1037
  # Apply configuration
1038
  trainer.test_size = config.get('test_size', trainer.test_size)
1039
  trainer.cv_folds = config.get('cv_folds', trainer.cv_folds)
1040
+ trainer.cv_manager.cv_folds = trainer.cv_folds
1041
  trainer.max_features = config.get('max_features', trainer.max_features)
1042
  trainer.ngram_range = tuple(config.get('ngram_range', trainer.ngram_range))
1043
 
 
1050
  # Update feature selection based on max_features
1051
  trainer.feature_selection_k = min(trainer.feature_selection_k, trainer.max_features)
1052
 
1053
+ logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds: {config}")
1054
 
1055
  except Exception as e:
1056
  logger.warning(f"Failed to load configuration: {e}, using defaults")