Ahmedik95316 committed
Commit 8a926b4 · 1 Parent(s): 0cfbe2d

Update model/train.py

Adding Enhanced Feature Engineering Pipeline

Files changed (1)
  1. model/train.py +312 -94
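
The central mechanism in this commit is graceful degradation: the new feature-engineering modules are imported inside a try/except, and training falls back to plain TF-IDF when they are absent. A minimal sketch of that pattern (the features.feature_engineer module name is taken from the diff below; everything else is illustrative):

import logging

logger = logging.getLogger(__name__)

try:
    # Project-specific module added by this commit; not part of scikit-learn
    from features.feature_engineer import AdvancedFeatureEngineer
    ENHANCED_FEATURES_AVAILABLE = True
except ImportError as exc:
    ENHANCED_FEATURES_AVAILABLE = False
    logger.warning(f"Enhanced features unavailable, falling back to TF-IDF: {exc}")

def resolve_feature_mode(requested=None):
    # None means auto-detect; an explicit request is still gated on availability
    if requested is None:
        return ENHANCED_FEATURES_AVAILABLE
    return bool(requested) and ENHANCED_FEATURES_AVAILABLE
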
model/train.py CHANGED
@@ -1,5 +1,4 @@
1
- # File: model/train.py (MODIFIED)
2
- # Enhanced version with comprehensive cross-validation implementation
3
 
4
  import seaborn as sns
5
  import matplotlib.pyplot as plt
@@ -34,6 +33,21 @@ import warnings
34
  import re
35
  warnings.filterwarnings('ignore')
36
 
37
  # Configure logging
38
  logging.basicConfig(
39
  level=logging.INFO,
@@ -112,7 +126,7 @@ class ProgressTracker:
112
  # Create progress bar
113
  bar_length = 30
114
  filled_length = int(bar_length * self.current_step // self.total_steps)
115
- bar = '█' * filled_length + '░' * (bar_length - filled_length)
116
 
117
  # Print progress (this will be visible in Streamlit logs)
118
  status_msg = f"\r{self.description}: [{bar}] {progress_pct:.1f}% | Step {self.current_step}/{self.total_steps}"
@@ -146,8 +160,9 @@ class ProgressTracker:
146
  print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
147
 
148
 
149
- def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5) -> Dict:
150
- """Estimate training time based on dataset characteristics"""
 
151
 
152
  # Base time estimates (in seconds) based on empirical testing
153
  base_times = {
@@ -158,6 +173,13 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
158
  'evaluation': max(0.5, dataset_size * 0.01), # ~10ms per sample
159
  }
160
 
161
  # Hyperparameter tuning multipliers
162
  tuning_multipliers = {
163
  'logistic_regression': 8 if enable_tuning else 1, # 8 param combinations
@@ -174,6 +196,10 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
174
  estimates['data_loading'] = 0.5
175
  estimates['preprocessing'] = base_times['preprocessing']
176
  estimates['vectorization'] = base_times['vectorization']
177
  estimates['feature_selection'] = base_times['feature_selection']
178
 
179
  # Model training (now includes CV)
@@ -191,8 +217,9 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
191
  # Total estimate
192
  total_estimate = sum(estimates.values())
193
 
194
- # Add 20% buffer for overhead
195
- total_estimate *= 1.2
 
196
 
197
  return {
198
  'detailed_estimates': estimates,
@@ -200,7 +227,8 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_fol
200
  'total_formatted': str(timedelta(seconds=int(total_estimate))),
201
  'dataset_size': dataset_size,
202
  'enable_tuning': enable_tuning,
203
- 'cv_folds': cv_folds
 
204
  }
205
 
206
 
@@ -378,15 +406,25 @@ class CrossValidationManager:
378
  return {'error': str(e)}
379
 
380
 
381
- class RobustModelTrainer:
382
- """Production-ready model trainer with comprehensive cross-validation"""
383
 
384
- def __init__(self):
385
  self.setup_paths()
386
  self.setup_training_config()
387
  self.setup_models()
388
  self.progress_tracker = None
389
  self.cv_manager = CrossValidationManager()
390
 
391
  def setup_paths(self):
392
  """Setup all necessary paths with proper permissions"""
@@ -394,9 +432,10 @@ class RobustModelTrainer:
394
  self.data_dir = self.base_dir / "data"
395
  self.model_dir = self.base_dir / "model"
396
  self.results_dir = self.base_dir / "results"
 
397
 
398
  # Create directories with proper permissions
399
- for dir_path in [self.data_dir, self.model_dir, self.results_dir]:
400
  dir_path.mkdir(parents=True, exist_ok=True)
401
  # Ensure write permissions
402
  try:
@@ -406,25 +445,39 @@ class RobustModelTrainer:
406
 
407
  # File paths
408
  self.data_path = self.data_dir / "combined_dataset.csv"
409
- self.model_path = Path("/tmp/model.pkl") # Direct path to avoid permission issues
410
  self.vectorizer_path = Path("/tmp/vectorizer.pkl")
411
  self.pipeline_path = Path("/tmp/pipeline.pkl")
412
  self.metadata_path = Path("/tmp/metadata.json")
413
  self.evaluation_path = self.results_dir / "evaluation_results.json"
414
 
415
  def setup_training_config(self):
416
- """Setup training configuration with CV parameters"""
417
  self.test_size = 0.2
418
  self.validation_size = 0.1
419
  self.random_state = 42
420
- self.cv_folds = 5 # Primary CV folds
421
- self.max_features = 5000 # Reduced for speed
422
- self.min_df = 1 # More lenient for small datasets
423
  self.max_df = 0.95
424
- self.ngram_range = (1, 2) # Reduced for speed
425
- self.max_iter = 500 # Reduced for speed
426
  self.class_weight = 'balanced'
427
- self.feature_selection_k = 2000 # Reduced for speed
428
 
429
  def setup_models(self):
430
  """Setup model configurations for comparison"""
@@ -434,22 +487,22 @@ class RobustModelTrainer:
434
  max_iter=self.max_iter,
435
  class_weight=self.class_weight,
436
  random_state=self.random_state,
437
- n_jobs=-1 # Use all cores
438
  ),
439
  'param_grid': {
440
- 'model__C': [0.1, 1, 10], # Reduced grid
441
  'model__penalty': ['l2']
442
  }
443
  },
444
  'random_forest': {
445
  'model': RandomForestClassifier(
446
- n_estimators=50, # Reduced for speed
447
  class_weight=self.class_weight,
448
  random_state=self.random_state,
449
- n_jobs=-1 # Use all cores
450
  ),
451
  'param_grid': {
452
- 'model__n_estimators': [50, 100], # Reduced grid
453
  'model__max_depth': [10, None]
454
  }
455
  }
@@ -494,10 +547,9 @@ class RobustModelTrainer:
494
  return False, None, f"Need at least 2 classes, found: {unique_labels}"
495
 
496
  # Check minimum sample size for CV
497
- min_samples_for_cv = self.cv_folds * 2 # At least 2 samples per fold
498
  if len(df) < min_samples_for_cv:
499
  logger.warning(f"Dataset size ({len(df)}) is small for {self.cv_folds}-fold CV")
500
- # Adjust CV folds for small datasets
501
  self.cv_manager.cv_folds = max(2, len(df) // 3)
502
  logger.info(f"Adjusted CV folds to {self.cv_manager.cv_folds}")
503
 
@@ -519,47 +571,79 @@ class RobustModelTrainer:
519
  logger.error(error_msg)
520
  return False, None, error_msg
521
 
522
- def create_preprocessing_pipeline(self) -> Pipeline:
523
- """Create preprocessing pipeline"""
 
 
 
524
 
525
  if self.progress_tracker:
526
- self.progress_tracker.update("Creating pipeline")
 
527
 
528
- # Use the standalone function instead of lambda
529
- text_preprocessor = FunctionTransformer(
530
- func=preprocess_text_function,
531
- validate=False
532
- )
533
 
534
- # TF-IDF vectorization with optimized parameters
535
- vectorizer = TfidfVectorizer(
536
- max_features=self.max_features,
537
- min_df=self.min_df,
538
- max_df=self.max_df,
539
- ngram_range=self.ngram_range,
540
- stop_words='english',
541
- sublinear_tf=True,
542
- norm='l2'
543
- )
544
 
545
- # Feature selection
546
- feature_selector = SelectKBest(
547
- score_func=chi2,
548
- k=min(self.feature_selection_k, self.max_features)
549
- )
550
 
551
- # Create pipeline
552
- pipeline = Pipeline([
553
- ('preprocess', text_preprocessor),
554
- ('vectorize', vectorizer),
555
- ('feature_select', feature_selector),
556
- ('model', None) # Will be set during training
557
- ])
558
 
559
  return pipeline
560
 
561
  def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
562
- """Comprehensive model evaluation with cross-validation integration"""
563
 
564
  if self.progress_tracker:
565
  self.progress_tracker.update("Evaluating model")
@@ -597,6 +681,25 @@ class RobustModelTrainer:
597
  cv_f1_std = cv_results['test_scores']['f1']['std']
598
  logger.info(f"CV F1 Score: {cv_f1_mean:.4f} (Β±{cv_f1_std:.4f})")
599
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
600
  # Training accuracy for overfitting detection
601
  try:
602
  if X_train is not None and y_train is not None:
@@ -614,7 +717,8 @@ class RobustModelTrainer:
614
  """Perform hyperparameter tuning with nested cross-validation"""
615
 
616
  if self.progress_tracker:
617
- self.progress_tracker.update(f"Tuning {model_name} with CV")
 
618
 
619
  try:
620
  # Set the model in the pipeline
@@ -709,15 +813,15 @@ class RobustModelTrainer:
709
  raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
710
 
711
  def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
712
- """Train and evaluate multiple models with comprehensive CV"""
713
 
714
  results = {}
715
 
716
  for model_name in self.models.keys():
717
- logger.info(f"Training {model_name} with cross-validation...")
718
 
719
  try:
720
- # Create pipeline
721
  pipeline = self.create_preprocessing_pipeline()
722
 
723
  # Hyperparameter tuning with CV
@@ -735,7 +839,8 @@ class RobustModelTrainer:
735
  'model': best_model,
736
  'tuning_results': tuning_results,
737
  'evaluation_metrics': evaluation_metrics,
738
- 'training_time': datetime.now().isoformat()
 
739
  }
740
 
741
  # Log results
@@ -791,7 +896,7 @@ class RobustModelTrainer:
791
  return best_model_name, best_model, best_metrics
792
 
793
  def save_model_artifacts(self, model, model_name: str, metrics: Dict, results: Dict) -> bool:
794
- """Save model artifacts and enhanced metadata with CV results"""
795
  try:
796
  if self.progress_tracker:
797
  self.progress_tracker.update("Saving model")
@@ -807,20 +912,37 @@ class RobustModelTrainer:
807
  joblib.dump(model, alt_pipeline_path)
808
  logger.info(f"βœ… Saved pipeline to {alt_pipeline_path}")
809
 
810
- # Save individual components for backward compatibility
811
- try:
812
- if hasattr(model, 'named_steps') and 'model' in model.named_steps:
813
- joblib.dump(model.named_steps['model'], self.model_path)
814
- logger.info(f"βœ… Saved model to {self.model_path}")
815
- except Exception as e:
816
- logger.warning(f"Could not save model component: {e}")
817
 
 
818
  try:
819
- if hasattr(model, 'named_steps') and 'vectorize' in model.named_steps:
820
- joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
821
- logger.info(f"βœ… Saved vectorizer to {self.vectorizer_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
822
  except Exception as e:
823
- logger.warning(f"Could not save vectorizer component: {e}")
824
 
825
  # Generate data hash
826
  data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
@@ -828,10 +950,15 @@ class RobustModelTrainer:
828
  # Extract CV results
829
  cv_results = metrics.get('cross_validation', {})
830
 
831
- # Create enhanced metadata with CV information
832
  metadata = {
833
  'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
834
  'model_type': model_name,
835
  'data_version': data_hash,
836
  'test_accuracy': metrics['accuracy'],
837
  'test_f1': metrics['f1'],
@@ -845,10 +972,42 @@ class RobustModelTrainer:
845
  'cv_folds': self.cv_folds,
846
  'max_features': self.max_features,
847
  'ngram_range': self.ngram_range,
848
- 'feature_selection_k': self.feature_selection_k
 
849
  }
850
  }
851
 
852
  # Add comprehensive CV results to metadata
853
  if cv_results and 'test_scores' in cv_results:
854
  metadata['cross_validation'] = {
@@ -892,7 +1051,14 @@ class RobustModelTrainer:
892
  except Exception as e:
893
  logger.warning(f"Could not save metadata: {e}")
894
 
895
- logger.info(f"βœ… Model artifacts saved successfully with CV results")
 
 
 
 
 
 
 
896
  return True
897
 
898
  except Exception as e:
@@ -906,10 +1072,18 @@ class RobustModelTrainer:
906
  logger.error(f"Failed to save backup pipeline: {str(e2)}")
907
  return False
908
 
909
- def train_model(self, data_path: str = None) -> Tuple[bool, str]:
910
- """Main training function with comprehensive CV pipeline"""
911
  try:
912
- logger.info("Starting enhanced model training with cross-validation...")
 
 
 
 
 
 
 
 
913
 
914
  # Override data path if provided
915
  if data_path:
@@ -924,20 +1098,26 @@ class RobustModelTrainer:
924
  time_estimate = estimate_training_time(
925
  len(df),
926
  enable_tuning=True,
927
- cv_folds=self.cv_folds
 
928
  )
929
 
930
  print(f"\nπŸ“Š Enhanced Training Configuration:")
931
  print(f"Dataset size: {len(df)} samples")
 
932
  print(f"Cross-validation folds: {self.cv_folds}")
933
  print(f"Estimated time: {time_estimate['total_formatted']}")
934
  print(f"Models to train: {len(self.models)}")
935
  print(f"Hyperparameter tuning: Enabled")
 
 
936
  print()
937
 
938
- # Setup progress tracker (increased steps for CV)
939
- total_steps = 4 + (len(self.models) * 3) + 1 # Load, split, 3*models (tune+cv+eval), select, save
940
- self.progress_tracker = ProgressTracker(total_steps, "CV Training Progress")
 
 
941
 
942
  # Prepare data
943
  X = df['text'].values
@@ -972,20 +1152,20 @@ class RobustModelTrainer:
972
  if len(X_test) < 1:
973
  return False, "Cannot create test set. Dataset too small."
974
 
975
- # Train and evaluate models with CV
976
  results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
977
 
978
  # Select best model
979
  best_model_name, best_model, best_metrics = self.select_best_model(results)
980
 
981
- # Save model artifacts with CV results
982
  if not self.save_model_artifacts(best_model, best_model_name, best_metrics, results):
983
  return False, "Failed to save model artifacts"
984
 
985
  # Finish progress tracking
986
  self.progress_tracker.finish()
987
 
988
- # Create success message with CV information
989
  cv_results = best_metrics.get('cross_validation', {})
990
  cv_info = ""
991
  if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
@@ -993,10 +1173,18 @@ class RobustModelTrainer:
993
  cv_f1_std = cv_results['test_scores']['f1']['std']
994
  cv_info = f", CV F1: {cv_f1_mean:.4f} (Β±{cv_f1_std:.4f})"
995
 
 
 
 
 
 
 
 
 
996
  success_message = (
997
- f"Enhanced model training completed successfully. "
998
  f"Best model: {best_model_name} "
999
- f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info})"
1000
  )
1001
 
1002
  logger.info(success_message)
@@ -1011,17 +1199,30 @@ class RobustModelTrainer:
1011
 
1012
 
1013
  def main():
1014
- """Main execution function with enhanced CV support"""
1015
  import argparse
1016
 
1017
  # Parse command line arguments
1018
- parser = argparse.ArgumentParser(description='Train fake news detection model with cross-validation')
1019
  parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
1020
  parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
1021
  parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
1022
  args = parser.parse_args()
1023
 
1024
- trainer = RobustModelTrainer()
1025
 
1026
  # Apply CV folds from command line
1027
  if args.cv_folds:
@@ -1041,6 +1242,10 @@ def main():
1041
  trainer.max_features = config.get('max_features', trainer.max_features)
1042
  trainer.ngram_range = tuple(config.get('ngram_range', trainer.ngram_range))
1043
 
1044
  # Filter models if specified
1045
  selected_models = config.get('selected_models')
1046
  if selected_models and len(selected_models) < len(trainer.models):
@@ -1050,7 +1255,9 @@ def main():
1050
  # Update feature selection based on max_features
1051
  trainer.feature_selection_k = min(trainer.feature_selection_k, trainer.max_features)
1052
 
1053
- logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds: {config}")
 
 
1054
 
1055
  except Exception as e:
1056
  logger.warning(f"Failed to load configuration: {e}, using defaults")
@@ -1059,6 +1266,17 @@ def main():
1059
 
1060
  if success:
1061
  print(f"βœ… {message}")
 
 
 
 
 
 
 
 
 
 
 
1062
  else:
1063
  print(f"❌ {message}")
1064
  exit(1)
 
1
+ # Enhanced version with comprehensive cross-validation and advanced feature engineering
 
2
 
3
  import seaborn as sns
4
  import matplotlib.pyplot as plt
 
33
  import re
34
  warnings.filterwarnings('ignore')
35
 
36
+ # Import enhanced feature engineering components
37
+ try:
38
+ from features.feature_engineer import AdvancedFeatureEngineer, create_enhanced_pipeline, analyze_feature_importance
39
+ from features.sentiment_analyzer import SentimentAnalyzer
40
+ from features.readability_analyzer import ReadabilityAnalyzer
41
+ from features.entity_analyzer import EntityAnalyzer
42
+ from features.linguistic_analyzer import LinguisticAnalyzer
43
+ ENHANCED_FEATURES_AVAILABLE = True
44
+ logger = logging.getLogger(__name__)
45
+ logger.info("Enhanced feature engineering components loaded successfully")
46
+ except ImportError as e:
47
+ ENHANCED_FEATURES_AVAILABLE = False
48
+ logger = logging.getLogger(__name__)
49
+ logger.warning(f"Enhanced features not available, falling back to basic TF-IDF: {e}")
50
+
51
  # Configure logging
52
  logging.basicConfig(
53
  level=logging.INFO,
 
126
  # Create progress bar
127
  bar_length = 30
128
  filled_length = int(bar_length * self.current_step // self.total_steps)
129
+ bar = '█' * filled_length + '▒' * (bar_length - filled_length)
130
 
131
  # Print progress (this will be visible in Streamlit logs)
132
  status_msg = f"\r{self.description}: [{bar}] {progress_pct:.1f}% | Step {self.current_step}/{self.total_steps}"
 
160
  print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
161
 
162
 
163
+ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5,
164
+ use_enhanced_features: bool = False) -> Dict:
165
+ """Estimate training time based on dataset characteristics and feature complexity"""
166
 
167
  # Base time estimates (in seconds) based on empirical testing
168
  base_times = {
 
173
  'evaluation': max(0.5, dataset_size * 0.01), # ~10ms per sample
174
  }
175
 
176
+ # Enhanced feature engineering time multipliers
177
+ if use_enhanced_features:
178
+ base_times['preprocessing'] *= 2.5 # More complex preprocessing
179
+ base_times['vectorization'] *= 1.5 # Additional feature extraction
180
+ base_times['feature_selection'] *= 2.0 # More features to select from
181
+ base_times['enhanced_feature_extraction'] = max(2.0, dataset_size * 0.05) # New step
182
+
183
  # Hyperparameter tuning multipliers
184
  tuning_multipliers = {
185
  'logistic_regression': 8 if enable_tuning else 1, # 8 param combinations
 
196
  estimates['data_loading'] = 0.5
197
  estimates['preprocessing'] = base_times['preprocessing']
198
  estimates['vectorization'] = base_times['vectorization']
199
+
200
+ if use_enhanced_features:
201
+ estimates['enhanced_feature_extraction'] = base_times['enhanced_feature_extraction']
202
+
203
  estimates['feature_selection'] = base_times['feature_selection']
204
 
205
  # Model training (now includes CV)
 
217
  # Total estimate
218
  total_estimate = sum(estimates.values())
219
 
220
+ # Add buffer for overhead (more for enhanced features)
221
+ buffer_multiplier = 1.4 if use_enhanced_features else 1.2
222
+ total_estimate *= buffer_multiplier
223
 
224
  return {
225
  'detailed_estimates': estimates,
 
227
  'total_formatted': str(timedelta(seconds=int(total_estimate))),
228
  'dataset_size': dataset_size,
229
  'enable_tuning': enable_tuning,
230
+ 'cv_folds': cv_folds,
231
+ 'use_enhanced_features': use_enhanced_features
232
  }
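
For a feel of the numbers, a simplified, self-contained version of the estimator above (the 2.5x preprocessing multiplier, the ~0.05 s per-sample enhanced-extraction cost, and the 1.4/1.2 overhead buffers come from the diff; the remaining constants are illustrative):

from datetime import timedelta

def rough_training_estimate(n_samples, use_enhanced_features=False):
    preprocessing = max(0.5, n_samples * 0.001)    # assumed ~1 ms per sample
    if use_enhanced_features:
        preprocessing *= 2.5                        # multiplier from the diff
        extraction = max(2.0, n_samples * 0.05)     # enhanced extraction step from the diff
    else:
        extraction = 0.0
    buffer = 1.4 if use_enhanced_features else 1.2  # overhead buffers from the diff
    total = (0.5 + preprocessing + extraction) * buffer
    return str(timedelta(seconds=int(total)))

print(rough_training_estimate(10_000, use_enhanced_features=True))
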
233
 
234
 
 
406
  return {'error': str(e)}
407
 
408
 
409
+ class EnhancedModelTrainer:
410
+ """Production-ready model trainer with enhanced feature engineering and comprehensive CV"""
411
 
412
+ def __init__(self, use_enhanced_features: bool = None):
413
+ # Auto-detect enhanced features if not specified
414
+ if use_enhanced_features is None:
415
+ self.use_enhanced_features = ENHANCED_FEATURES_AVAILABLE
416
+ else:
417
+ self.use_enhanced_features = use_enhanced_features and ENHANCED_FEATURES_AVAILABLE
418
+
419
  self.setup_paths()
420
  self.setup_training_config()
421
  self.setup_models()
422
  self.progress_tracker = None
423
  self.cv_manager = CrossValidationManager()
424
+
425
+ # Enhanced feature tracking
426
+ self.feature_engineer = None
427
+ self.feature_importance_results = {}
428
 
429
  def setup_paths(self):
430
  """Setup all necessary paths with proper permissions"""
 
432
  self.data_dir = self.base_dir / "data"
433
  self.model_dir = self.base_dir / "model"
434
  self.results_dir = self.base_dir / "results"
435
+ self.features_dir = self.base_dir / "features" # New for enhanced features
436
 
437
  # Create directories with proper permissions
438
+ for dir_path in [self.data_dir, self.model_dir, self.results_dir, self.features_dir]:
439
  dir_path.mkdir(parents=True, exist_ok=True)
440
  # Ensure write permissions
441
  try:
 
445
 
446
  # File paths
447
  self.data_path = self.data_dir / "combined_dataset.csv"
448
+ self.model_path = Path("/tmp/model.pkl")
449
  self.vectorizer_path = Path("/tmp/vectorizer.pkl")
450
  self.pipeline_path = Path("/tmp/pipeline.pkl")
451
  self.metadata_path = Path("/tmp/metadata.json")
452
  self.evaluation_path = self.results_dir / "evaluation_results.json"
453
+
454
+ # Enhanced feature paths
455
+ self.feature_engineer_path = Path("/tmp/feature_engineer.pkl")
456
+ self.feature_importance_path = self.results_dir / "feature_importance.json"
457
 
458
  def setup_training_config(self):
459
+ """Setup training configuration with enhanced feature parameters"""
460
  self.test_size = 0.2
461
  self.validation_size = 0.1
462
  self.random_state = 42
463
+ self.cv_folds = 5
464
+
465
+ # Enhanced feature configuration
466
+ if self.use_enhanced_features:
467
+ self.max_features = 7500 # Increased for enhanced features
468
+ self.feature_selection_k = 3000 # More features to select from
469
+ logger.info("Using enhanced feature engineering pipeline")
470
+ else:
471
+ self.max_features = 5000 # Standard TF-IDF
472
+ self.feature_selection_k = 2000
473
+ logger.info("Using standard TF-IDF feature pipeline")
474
+
475
+ # Common parameters
476
+ self.min_df = 1
477
  self.max_df = 0.95
478
+ self.ngram_range = (1, 2)
479
+ self.max_iter = 500
480
  self.class_weight = 'balanced'
 
481
 
482
  def setup_models(self):
483
  """Setup model configurations for comparison"""
 
487
  max_iter=self.max_iter,
488
  class_weight=self.class_weight,
489
  random_state=self.random_state,
490
+ n_jobs=-1
491
  ),
492
  'param_grid': {
493
+ 'model__C': [0.1, 1, 10],
494
  'model__penalty': ['l2']
495
  }
496
  },
497
  'random_forest': {
498
  'model': RandomForestClassifier(
499
+ n_estimators=50,
500
  class_weight=self.class_weight,
501
  random_state=self.random_state,
502
+ n_jobs=-1
503
  ),
504
  'param_grid': {
505
+ 'model__n_estimators': [50, 100],
506
  'model__max_depth': [10, None]
507
  }
508
  }
 
547
  return False, None, f"Need at least 2 classes, found: {unique_labels}"
548
 
549
  # Check minimum sample size for CV
550
+ min_samples_for_cv = self.cv_folds * 2
551
  if len(df) < min_samples_for_cv:
552
  logger.warning(f"Dataset size ({len(df)}) is small for {self.cv_folds}-fold CV")
 
553
  self.cv_manager.cv_folds = max(2, len(df) // 3)
554
  logger.info(f"Adjusted CV folds to {self.cv_manager.cv_folds}")
555
 
 
571
  logger.error(error_msg)
572
  return False, None, error_msg
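
The small-dataset guard above shrinks the fold count rather than failing; the same rule as a standalone sketch:

def adjust_cv_folds(n_samples, requested_folds=5):
    # Each fold needs at least 2 samples; shrink for small datasets, floor of 2 (rule from the diff)
    if n_samples < requested_folds * 2:
        return max(2, n_samples // 3)
    return requested_folds

assert adjust_cv_folds(100) == 5  # enough data: keep the requested folds
assert adjust_cv_folds(9) == 3    # 9 samples < 10: fall back to 9 // 3 = 3 folds
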
573
 
574
+ def create_preprocessing_pipeline(self, use_enhanced: bool = None) -> Pipeline:
575
+ """Create preprocessing pipeline with optional enhanced features"""
576
+
577
+ if use_enhanced is None:
578
+ use_enhanced = self.use_enhanced_features
579
 
580
  if self.progress_tracker:
581
+ feature_type = "enhanced" if use_enhanced else "standard"
582
+ self.progress_tracker.update(f"Creating {feature_type} pipeline")
583
 
584
+ if use_enhanced and ENHANCED_FEATURES_AVAILABLE:
585
+ logger.info("Creating enhanced feature engineering pipeline...")
586
+
587
+ # Create enhanced feature engineer
588
+ feature_engineer = AdvancedFeatureEngineer(
589
+ enable_sentiment=True,
590
+ enable_readability=True,
591
+ enable_entities=True,
592
+ enable_linguistic=True,
593
+ feature_selection_k=self.feature_selection_k,
594
+ tfidf_max_features=self.max_features,
595
+ ngram_range=self.ngram_range,
596
+ min_df=self.min_df,
597
+ max_df=self.max_df
598
+ )
599
+
600
+ # Create pipeline with enhanced features
601
+ pipeline = Pipeline([
602
+ ('enhanced_features', feature_engineer),
603
+ ('model', None) # Will be set during training
604
+ ])
605
+
606
+ # Store reference for later use
607
+ self.feature_engineer = feature_engineer
608
+
609
+ else:
610
+ logger.info("Creating standard TF-IDF pipeline...")
611
+
612
+ # Use the standalone function instead of lambda
613
+ text_preprocessor = FunctionTransformer(
614
+ func=preprocess_text_function,
615
+ validate=False
616
+ )
617
 
618
+ # TF-IDF vectorization with optimized parameters
619
+ vectorizer = TfidfVectorizer(
620
+ max_features=self.max_features,
621
+ min_df=self.min_df,
622
+ max_df=self.max_df,
623
+ ngram_range=self.ngram_range,
624
+ stop_words='english',
625
+ sublinear_tf=True,
626
+ norm='l2'
627
+ )
628
 
629
+ # Feature selection
630
+ feature_selector = SelectKBest(
631
+ score_func=chi2,
632
+ k=min(self.feature_selection_k, self.max_features)
633
+ )
634
 
635
+ # Create standard pipeline
636
+ pipeline = Pipeline([
637
+ ('preprocess', text_preprocessor),
638
+ ('vectorize', vectorizer),
639
+ ('feature_select', feature_selector),
640
+ ('model', None) # Will be set during training
641
+ ])
642
 
643
  return pipeline
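
AdvancedFeatureEngineer itself lives in the project's features/ package and is not shown in this diff. For readers without that package, a rough stand-in using stock scikit-learn, assuming the enhanced step concatenates TF-IDF with handcrafted numeric features (the toy text statistics below are illustrative, not the commit's actual features):

import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

def simple_text_stats(texts):
    # Two toy handcrafted features per document: character length and exclamation count
    return np.array([[len(t), t.count('!')] for t in texts], dtype=float)

combined = FeatureUnion([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')),
    ('stats', FunctionTransformer(simple_text_stats, validate=False)),
])

pipeline = Pipeline([
    ('features', combined),
    ('model', None),  # filled in during training, as in the diff
])
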
644
 
645
  def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
646
+ """Comprehensive model evaluation with enhanced feature analysis"""
647
 
648
  if self.progress_tracker:
649
  self.progress_tracker.update("Evaluating model")
 
681
  cv_f1_std = cv_results['test_scores']['f1']['std']
682
  logger.info(f"CV F1 Score: {cv_f1_mean:.4f} (Β±{cv_f1_std:.4f})")
683
 
684
+ # Enhanced feature analysis
685
+ if self.use_enhanced_features and self.feature_engineer is not None:
686
+ try:
687
+ # Get feature importance if available
688
+ if hasattr(self.feature_engineer, 'get_feature_importance'):
689
+ feature_importance = self.feature_engineer.get_feature_importance(top_k=20)
690
+ metrics['top_features'] = feature_importance
691
+
692
+ # Get feature metadata
693
+ if hasattr(self.feature_engineer, 'get_feature_metadata'):
694
+ feature_metadata = self.feature_engineer.get_feature_metadata()
695
+ metrics['feature_metadata'] = feature_metadata
696
+
697
+ logger.info(f"Enhanced features used: {feature_metadata['total_features']}")
698
+ logger.info(f"Feature breakdown: {feature_metadata['feature_types']}")
699
+
700
+ except Exception as e:
701
+ logger.warning(f"Enhanced feature analysis failed: {e}")
702
+
703
  # Training accuracy for overfitting detection
704
  try:
705
  if X_train is not None and y_train is not None:
 
717
  """Perform hyperparameter tuning with nested cross-validation"""
718
 
719
  if self.progress_tracker:
720
+ feature_type = "enhanced" if self.use_enhanced_features else "standard"
721
+ self.progress_tracker.update(f"Tuning {model_name} with {feature_type} features")
722
 
723
  try:
724
  # Set the model in the pipeline
 
813
  raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
814
 
815
  def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
816
+ """Train and evaluate multiple models with enhanced features and comprehensive CV"""
817
 
818
  results = {}
819
 
820
  for model_name in self.models.keys():
821
+ logger.info(f"Training {model_name} with {'enhanced' if self.use_enhanced_features else 'standard'} features...")
822
 
823
  try:
824
+ # Create pipeline (enhanced or standard)
825
  pipeline = self.create_preprocessing_pipeline()
826
 
827
  # Hyperparameter tuning with CV
 
839
  'model': best_model,
840
  'tuning_results': tuning_results,
841
  'evaluation_metrics': evaluation_metrics,
842
+ 'training_time': datetime.now().isoformat(),
843
+ 'feature_type': 'enhanced' if self.use_enhanced_features else 'standard'
844
  }
845
 
846
  # Log results
 
896
  return best_model_name, best_model, best_metrics
897
 
898
  def save_model_artifacts(self, model, model_name: str, metrics: Dict, results: Dict) -> bool:
899
+ """Save model artifacts and enhanced metadata with feature engineering results"""
900
  try:
901
  if self.progress_tracker:
902
  self.progress_tracker.update("Saving model")
 
912
  joblib.dump(model, alt_pipeline_path)
913
  logger.info(f"βœ… Saved pipeline to {alt_pipeline_path}")
914
 
915
+ # Save enhanced feature engineer if available
916
+ if self.use_enhanced_features and self.feature_engineer is not None:
917
+ try:
918
+ self.feature_engineer.save_pipeline(self.feature_engineer_path)
919
+ logger.info(f"βœ… Saved feature engineer to {self.feature_engineer_path}")
920
+ except Exception as e:
921
+ logger.warning(f"Could not save feature engineer: {e}")
922
 
923
+ # Save individual components for backward compatibility
924
  try:
925
+ if hasattr(model, 'named_steps'):
926
+ if 'model' in model.named_steps:
927
+ joblib.dump(model.named_steps['model'], self.model_path)
928
+ logger.info(f"βœ… Saved model component to {self.model_path}")
929
+
930
+ # Save vectorizer (standard pipeline) or enhanced features reference
931
+ if 'vectorize' in model.named_steps:
932
+ joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
933
+ logger.info(f"βœ… Saved vectorizer to {self.vectorizer_path}")
934
+ elif 'enhanced_features' in model.named_steps:
935
+ # Save reference to enhanced features
936
+ enhanced_ref = {
937
+ 'type': 'enhanced_features',
938
+ 'feature_engineer_path': str(self.feature_engineer_path),
939
+ 'metadata': self.feature_engineer.get_feature_metadata() if self.feature_engineer else {}
940
+ }
941
+ joblib.dump(enhanced_ref, self.vectorizer_path)
942
+ logger.info(f"βœ… Saved enhanced features reference to {self.vectorizer_path}")
943
+
944
  except Exception as e:
945
+ logger.warning(f"Could not save individual components: {e}")
946
 
947
  # Generate data hash
948
  data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
 
950
  # Extract CV results
951
  cv_results = metrics.get('cross_validation', {})
952
 
953
+ # Create enhanced metadata with feature engineering information
954
  metadata = {
955
  'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
956
  'model_type': model_name,
957
+ 'feature_engineering': {
958
+ 'type': 'enhanced' if self.use_enhanced_features else 'standard',
959
+ 'enhanced_features_available': ENHANCED_FEATURES_AVAILABLE,
960
+ 'enhanced_features_used': self.use_enhanced_features
961
+ },
962
  'data_version': data_hash,
963
  'test_accuracy': metrics['accuracy'],
964
  'test_f1': metrics['f1'],
 
972
  'cv_folds': self.cv_folds,
973
  'max_features': self.max_features,
974
  'ngram_range': self.ngram_range,
975
+ 'feature_selection_k': self.feature_selection_k,
976
+ 'use_enhanced_features': self.use_enhanced_features
977
  }
978
  }
979
 
980
+ # Add enhanced feature metadata
981
+ if self.use_enhanced_features:
982
+ feature_metadata = metrics.get('feature_metadata', {})
983
+ if feature_metadata:
984
+ metadata['enhanced_features'] = {
985
+ 'total_features': feature_metadata.get('total_features', 0),
986
+ 'feature_types': feature_metadata.get('feature_types', {}),
987
+ 'configuration': feature_metadata.get('configuration', {})
988
+ }
989
+
990
+ # Add top features if available
991
+ top_features = metrics.get('top_features', {})
992
+ if top_features:
993
+ metadata['top_features'] = dict(list(top_features.items())[:10]) # Top 10 features
994
+
995
+ # Save detailed feature importance
996
+ try:
997
+ feature_analysis = {
998
+ 'top_features': top_features,
999
+ 'feature_metadata': feature_metadata,
1000
+ 'timestamp': datetime.now().isoformat(),
1001
+ 'model_version': metadata['model_version']
1002
+ }
1003
+
1004
+ with open(self.feature_importance_path, 'w') as f:
1005
+ json.dump(feature_analysis, f, indent=2)
1006
+ logger.info(f"βœ… Saved feature importance analysis to {self.feature_importance_path}")
1007
+
1008
+ except Exception as e:
1009
+ logger.warning(f"Could not save feature importance: {e}")
1010
+
1011
  # Add comprehensive CV results to metadata
1012
  if cv_results and 'test_scores' in cv_results:
1013
  metadata['cross_validation'] = {
 
1051
  except Exception as e:
1052
  logger.warning(f"Could not save metadata: {e}")
1053
 
1054
+ # Log feature engineering summary
1055
+ if self.use_enhanced_features and feature_metadata:
1056
+ logger.info(f"βœ… Enhanced features summary:")
1057
+ logger.info(f" Total features: {feature_metadata.get('total_features', 0)}")
1058
+ for feature_type, count in feature_metadata.get('feature_types', {}).items():
1059
+ logger.info(f" {feature_type}: {count}")
1060
+
1061
+ logger.info(f"βœ… Model artifacts saved successfully with {'enhanced' if self.use_enhanced_features else 'standard'} features")
1062
  return True
1063
 
1064
  except Exception as e:
 
1072
  logger.error(f"Failed to save backup pipeline: {str(e2)}")
1073
  return False
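
On the consuming side, the artifacts written above can be loaded back with joblib and json. A hedged sketch (the /tmp paths and the feature_engineering metadata key are taken from the diff; it assumes the pipeline was saved fitted and the metadata JSON was written successfully):

import json
import joblib

pipeline = joblib.load('/tmp/pipeline.pkl')     # full sklearn Pipeline saved above
with open('/tmp/metadata.json') as f:
    metadata = json.load(f)

# The metadata records which feature mode produced the model
print(metadata['feature_engineering']['type'])  # 'enhanced' or 'standard'
print(pipeline.predict(['Some article text to classify']))
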
1074
 
1075
+ def train_model(self, data_path: str = None, force_enhanced: bool = None) -> Tuple[bool, str]:
1076
+ """Main training function with enhanced feature engineering pipeline"""
1077
  try:
1078
+ # Override enhanced features setting if specified
1079
+ if force_enhanced is not None:
1080
+ original_setting = self.use_enhanced_features
1081
+ self.use_enhanced_features = force_enhanced and ENHANCED_FEATURES_AVAILABLE
1082
+ if force_enhanced and not ENHANCED_FEATURES_AVAILABLE:
1083
+ logger.warning("Enhanced features requested but not available, using standard features")
1084
+
1085
+ feature_type = "enhanced" if self.use_enhanced_features else "standard"
1086
+ logger.info(f"Starting {feature_type} model training with cross-validation...")
1087
 
1088
  # Override data path if provided
1089
  if data_path:
 
1098
  time_estimate = estimate_training_time(
1099
  len(df),
1100
  enable_tuning=True,
1101
+ cv_folds=self.cv_folds,
1102
+ use_enhanced_features=self.use_enhanced_features
1103
  )
1104
 
1105
  print(f"\nπŸ“Š Enhanced Training Configuration:")
1106
  print(f"Dataset size: {len(df)} samples")
1107
+ print(f"Feature engineering: {feature_type.title()}")
1108
  print(f"Cross-validation folds: {self.cv_folds}")
1109
  print(f"Estimated time: {time_estimate['total_formatted']}")
1110
  print(f"Models to train: {len(self.models)}")
1111
  print(f"Hyperparameter tuning: Enabled")
1112
+ if self.use_enhanced_features:
1113
+ print(f"Enhanced features: Sentiment, Readability, Entities, Linguistic")
1114
  print()
1115
 
1116
+ # Setup progress tracker (adjusted for enhanced features)
1117
+ base_steps = 4 + (len(self.models) * 3) + 1 # Basic steps
1118
+ enhanced_steps = 2 if self.use_enhanced_features else 0 # Feature engineering steps
1119
+ total_steps = base_steps + enhanced_steps
1120
+ self.progress_tracker = ProgressTracker(total_steps, f"{feature_type.title()} Training Progress")
1121
 
1122
  # Prepare data
1123
  X = df['text'].values
 
1152
  if len(X_test) < 1:
1153
  return False, "Cannot create test set. Dataset too small."
1154
 
1155
+ # Train and evaluate models with enhanced features
1156
  results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
1157
 
1158
  # Select best model
1159
  best_model_name, best_model, best_metrics = self.select_best_model(results)
1160
 
1161
+ # Save model artifacts with enhanced feature information
1162
  if not self.save_model_artifacts(best_model, best_model_name, best_metrics, results):
1163
  return False, "Failed to save model artifacts"
1164
 
1165
  # Finish progress tracking
1166
  self.progress_tracker.finish()
1167
 
1168
+ # Create success message with enhanced feature information
1169
  cv_results = best_metrics.get('cross_validation', {})
1170
  cv_info = ""
1171
  if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
 
1173
  cv_f1_std = cv_results['test_scores']['f1']['std']
1174
  cv_info = f", CV F1: {cv_f1_mean:.4f} (Β±{cv_f1_std:.4f})"
1175
 
1176
+ # Enhanced features summary
1177
+ feature_info = ""
1178
+ if self.use_enhanced_features:
1179
+ feature_metadata = best_metrics.get('feature_metadata', {})
1180
+ if feature_metadata:
1181
+ total_features = feature_metadata.get('total_features', 0)
1182
+ feature_info = f", Enhanced Features: {total_features}"
1183
+
1184
  success_message = (
1185
+ f"{feature_type.title()} model training completed successfully. "
1186
  f"Best model: {best_model_name} "
1187
+ f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info}{feature_info})"
1188
  )
1189
 
1190
  logger.info(success_message)
 
1199
 
1200
 
1201
  def main():
1202
+ """Main execution function with enhanced feature engineering support"""
1203
  import argparse
1204
 
1205
  # Parse command line arguments
1206
+ parser = argparse.ArgumentParser(description='Train fake news detection model with enhanced features')
1207
  parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
1208
  parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
1209
  parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
1210
+ parser.add_argument('--enhanced_features', action='store_true', help='Force use of enhanced features')
1211
+ parser.add_argument('--standard_features', action='store_true', help='Force use of standard TF-IDF features only')
1212
  args = parser.parse_args()
1213
 
1214
+ # Determine feature engineering mode
1215
+ use_enhanced = None
1216
+ if args.enhanced_features and args.standard_features:
1217
+ logger.warning("Both --enhanced_features and --standard_features specified. Using auto-detection.")
1218
+ elif args.enhanced_features:
1219
+ use_enhanced = True
1220
+ logger.info("Enhanced features explicitly requested")
1221
+ elif args.standard_features:
1222
+ use_enhanced = False
1223
+ logger.info("Standard features explicitly requested")
1224
+
1225
+ trainer = EnhancedModelTrainer(use_enhanced_features=use_enhanced)
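
The two flags above resolve to a three-state mode (force on, force off, auto-detect). The equivalent logic as a standalone sketch, with example invocations (script path assumed from the repo layout):

def resolve_mode(enhanced_flag, standard_flag):
    # Both flags set: fall back to auto-detection (None), matching the warning path above
    if enhanced_flag and standard_flag:
        return None
    if enhanced_flag:
        return True
    if standard_flag:
        return False
    return None

# Example invocations:
#   python model/train.py --enhanced_features --cv_folds 5
#   python model/train.py --standard_features --data_path data/combined_dataset.csv
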
1226
 
1227
  # Apply CV folds from command line
1228
  if args.cv_folds:
 
1242
  trainer.max_features = config.get('max_features', trainer.max_features)
1243
  trainer.ngram_range = tuple(config.get('ngram_range', trainer.ngram_range))
1244
 
1245
+ # Enhanced feature configuration
1246
+ if 'enhanced_features' in config and use_enhanced is None:
1247
+ trainer.use_enhanced_features = config['enhanced_features'] and ENHANCED_FEATURES_AVAILABLE
1248
+
1249
  # Filter models if specified
1250
  selected_models = config.get('selected_models')
1251
  if selected_models and len(selected_models) < len(trainer.models):
 
1255
  # Update feature selection based on max_features
1256
  trainer.feature_selection_k = min(trainer.feature_selection_k, trainer.max_features)
1257
 
1258
+ logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds")
1259
+ if trainer.use_enhanced_features:
1260
+ logger.info("Enhanced features enabled via configuration")
1261
 
1262
  except Exception as e:
1263
  logger.warning(f"Failed to load configuration: {e}, using defaults")
 
1266
 
1267
  if success:
1268
  print(f"βœ… {message}")
1269
+
1270
+ # Print feature engineering summary
1271
+ if trainer.use_enhanced_features and trainer.feature_engineer:
1272
+ try:
1273
+ metadata = trainer.feature_engineer.get_feature_metadata()
1274
+ print(f"\nπŸ“ˆ Enhanced Feature Engineering Summary:")
1275
+ print(f"Total features generated: {metadata['total_features']}")
1276
+ for feature_type, count in metadata['feature_types'].items():
1277
+ print(f" {feature_type}: {count}")
1278
+ except Exception as e:
1279
+ logger.warning(f"Could not display feature summary: {e}")
1280
  else:
1281
  print(f"❌ {message}")
1282
  exit(1)