Commit · 8a926b4
1 Parent(s): 0cfbe2d
Update model/train.py
Adding Enhanced Feature Engineering Pipeline
- model/train.py +312 -94
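Reading aid, not part of the commit: the diff below imports from a `features` package that this commit does not contain. The following is a hypothetical stub sketching the interface the updated `model/train.py` appears to assume; the class name, constructor keywords, and method names are taken from the call sites in the diff, while every body is a placeholder stand-in (a bare TF-IDF) so the contract is concrete and runnable.

# Hypothetical stub, not this repository's code: minimal sketch of the
# interface model/train.py calls on the features package.
from pathlib import Path

import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer


def create_enhanced_pipeline(*args, **kwargs):
    raise NotImplementedError  # imported by train.py; behavior not shown in the diff


def analyze_feature_importance(*args, **kwargs):
    raise NotImplementedError  # imported by train.py; behavior not shown in the diff


class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, enable_sentiment=True, enable_readability=True,
                 enable_entities=True, enable_linguistic=True,
                 feature_selection_k=3000, tfidf_max_features=7500,
                 ngram_range=(1, 2), min_df=1, max_df=0.95):
        self.enable_sentiment = enable_sentiment
        self.enable_readability = enable_readability
        self.enable_entities = enable_entities
        self.enable_linguistic = enable_linguistic
        self.feature_selection_k = feature_selection_k
        self.tfidf_max_features = tfidf_max_features
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_df = max_df

    def fit(self, X, y=None):
        # Stand-in: the real engineer would also fit sentiment, readability,
        # entity, and linguistic extractors plus a feature selector.
        self.tfidf_ = TfidfVectorizer(max_features=self.tfidf_max_features,
                                      ngram_range=self.ngram_range,
                                      min_df=self.min_df, max_df=self.max_df)
        self.tfidf_.fit(X)
        return self

    def transform(self, X):
        return self.tfidf_.transform(X)

    def get_feature_importance(self, top_k=20):
        # train.py stores this dict in the evaluation metrics as 'top_features'.
        names = self.tfidf_.get_feature_names_out()[:top_k]
        return {name: 0.0 for name in names}  # placeholder scores

    def get_feature_metadata(self):
        # train.py reads 'total_features' and 'feature_types' from this dict.
        total = len(self.tfidf_.get_feature_names_out())
        return {'total_features': total,
                'feature_types': {'tfidf': total},
                'configuration': self.get_params()}

    def save_pipeline(self, path):
        joblib.dump(self, Path(path))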
model/train.py
CHANGED
@@ -1,5 +1,4 @@
-#
-# Enhanced version with comprehensive cross-validation implementation
+# Enhanced version with comprehensive cross-validation and advanced feature engineering
 
 import seaborn as sns
 import matplotlib.pyplot as plt
@@ -34,6 +33,21 @@ import warnings
 import re
 warnings.filterwarnings('ignore')
 
+# Import enhanced feature engineering components
+try:
+    from features.feature_engineer import AdvancedFeatureEngineer, create_enhanced_pipeline, analyze_feature_importance
+    from features.sentiment_analyzer import SentimentAnalyzer
+    from features.readability_analyzer import ReadabilityAnalyzer
+    from features.entity_analyzer import EntityAnalyzer
+    from features.linguistic_analyzer import LinguisticAnalyzer
+    ENHANCED_FEATURES_AVAILABLE = True
+    logger = logging.getLogger(__name__)
+    logger.info("Enhanced feature engineering components loaded successfully")
+except ImportError as e:
+    ENHANCED_FEATURES_AVAILABLE = False
+    logger = logging.getLogger(__name__)
+    logger.warning(f"Enhanced features not available, falling back to basic TF-IDF: {e}")
+
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
@@ -112,7 +126,7 @@ class ProgressTracker:
         # Create progress bar
         bar_length = 30
         filled_length = int(bar_length * self.current_step // self.total_steps)
-        bar = '█' * filled_length + '
+        bar = '█' * filled_length + '░' * (bar_length - filled_length)
 
         # Print progress (this will be visible in Streamlit logs)
         status_msg = f"\r{self.description}: [{bar}] {progress_pct:.1f}% | Step {self.current_step}/{self.total_steps}"
@@ -146,8 +160,9 @@ class ProgressTracker:
         print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}")
 
 
-def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5) -> Dict:
-    ...
+def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5,
+                           use_enhanced_features: bool = False) -> Dict:
+    """Estimate training time based on dataset characteristics and feature complexity"""
 
     # Base time estimates (in seconds) based on empirical testing
    base_times = {
@@ -158,6 +173,13 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5) -> Dict:
         'evaluation': max(0.5, dataset_size * 0.01),  # ~10ms per sample
     }
 
+    # Enhanced feature engineering time multipliers
+    if use_enhanced_features:
+        base_times['preprocessing'] *= 2.5   # More complex preprocessing
+        base_times['vectorization'] *= 1.5   # Additional feature extraction
+        base_times['feature_selection'] *= 2.0  # More features to select from
+        base_times['enhanced_feature_extraction'] = max(2.0, dataset_size * 0.05)  # New step
+
     # Hyperparameter tuning multipliers
     tuning_multipliers = {
         'logistic_regression': 8 if enable_tuning else 1,  # 8 param combinations
@@ -174,6 +196,10 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5) -> Dict:
     estimates['data_loading'] = 0.5
     estimates['preprocessing'] = base_times['preprocessing']
     estimates['vectorization'] = base_times['vectorization']
+
+    if use_enhanced_features:
+        estimates['enhanced_feature_extraction'] = base_times['enhanced_feature_extraction']
+
     estimates['feature_selection'] = base_times['feature_selection']
 
     # Model training (now includes CV)
@@ -191,8 +217,9 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5) -> Dict:
     # Total estimate
     total_estimate = sum(estimates.values())
 
-    # Add
-    ...
+    # Add buffer for overhead (more for enhanced features)
+    buffer_multiplier = 1.4 if use_enhanced_features else 1.2
+    total_estimate *= buffer_multiplier
 
     return {
         'detailed_estimates': estimates,
@@ -200,7 +227,8 @@ def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5) -> Dict:
         'total_formatted': str(timedelta(seconds=int(total_estimate))),
         'dataset_size': dataset_size,
         'enable_tuning': enable_tuning,
-        'cv_folds': cv_folds
+        'cv_folds': cv_folds,
+        'use_enhanced_features': use_enhanced_features
     }
 
 
@@ -378,15 +406,25 @@ class CrossValidationManager:
             return {'error': str(e)}
 
 
-class RobustModelTrainer:
-    """Production-ready model trainer with comprehensive
+class EnhancedModelTrainer:
+    """Production-ready model trainer with enhanced feature engineering and comprehensive CV"""
 
-    def __init__(self):
+    def __init__(self, use_enhanced_features: bool = None):
+        # Auto-detect enhanced features if not specified
+        if use_enhanced_features is None:
+            self.use_enhanced_features = ENHANCED_FEATURES_AVAILABLE
+        else:
+            self.use_enhanced_features = use_enhanced_features and ENHANCED_FEATURES_AVAILABLE
+
         self.setup_paths()
         self.setup_training_config()
         self.setup_models()
         self.progress_tracker = None
         self.cv_manager = CrossValidationManager()
+
+        # Enhanced feature tracking
+        self.feature_engineer = None
+        self.feature_importance_results = {}
 
     def setup_paths(self):
         """Setup all necessary paths with proper permissions"""
@@ -394,9 +432,10 @@ class RobustModelTrainer:
         self.data_dir = self.base_dir / "data"
         self.model_dir = self.base_dir / "model"
         self.results_dir = self.base_dir / "results"
+        self.features_dir = self.base_dir / "features"  # New for enhanced features
 
         # Create directories with proper permissions
-        for dir_path in [self.data_dir, self.model_dir, self.results_dir]:
+        for dir_path in [self.data_dir, self.model_dir, self.results_dir, self.features_dir]:
             dir_path.mkdir(parents=True, exist_ok=True)
             # Ensure write permissions
             try:
@@ -406,25 +445,39 @@ class RobustModelTrainer:
 
         # File paths
         self.data_path = self.data_dir / "combined_dataset.csv"
-        self.model_path = Path("/tmp/model.pkl")
+        self.model_path = Path("/tmp/model.pkl")
         self.vectorizer_path = Path("/tmp/vectorizer.pkl")
         self.pipeline_path = Path("/tmp/pipeline.pkl")
         self.metadata_path = Path("/tmp/metadata.json")
         self.evaluation_path = self.results_dir / "evaluation_results.json"
+
+        # Enhanced feature paths
+        self.feature_engineer_path = Path("/tmp/feature_engineer.pkl")
+        self.feature_importance_path = self.results_dir / "feature_importance.json"
 
     def setup_training_config(self):
-        """Setup training configuration with
+        """Setup training configuration with enhanced feature parameters"""
         self.test_size = 0.2
         self.validation_size = 0.1
         self.random_state = 42
-        self.cv_folds = 5
-        ...
+        self.cv_folds = 5
+
+        # Enhanced feature configuration
+        if self.use_enhanced_features:
+            self.max_features = 7500  # Increased for enhanced features
+            self.feature_selection_k = 3000  # More features to select from
+            logger.info("Using enhanced feature engineering pipeline")
+        else:
+            self.max_features = 5000  # Standard TF-IDF
+            self.feature_selection_k = 2000
+            logger.info("Using standard TF-IDF feature pipeline")
+
+        # Common parameters
+        self.min_df = 1
         self.max_df = 0.95
-        self.ngram_range = (1, 2)
-        self.max_iter = 500
+        self.ngram_range = (1, 2)
+        self.max_iter = 500
         self.class_weight = 'balanced'
-        self.feature_selection_k = 2000  # Reduced for speed
 
     def setup_models(self):
         """Setup model configurations for comparison"""
@@ -434,22 +487,22 @@ class RobustModelTrainer:
                 max_iter=self.max_iter,
                 class_weight=self.class_weight,
                 random_state=self.random_state,
-                n_jobs=-1
+                n_jobs=-1
             ),
             'param_grid': {
-                'model__C': [0.1, 1, 10],
+                'model__C': [0.1, 1, 10],
                 'model__penalty': ['l2']
             }
         },
         'random_forest': {
             'model': RandomForestClassifier(
-                n_estimators=50,
+                n_estimators=50,
                 class_weight=self.class_weight,
                 random_state=self.random_state,
-                n_jobs=-1
+                n_jobs=-1
             ),
             'param_grid': {
-                'model__n_estimators': [50, 100],
+                'model__n_estimators': [50, 100],
                 'model__max_depth': [10, None]
             }
         }
@@ -494,10 +547,9 @@ class RobustModelTrainer:
             return False, None, f"Need at least 2 classes, found: {unique_labels}"
 
         # Check minimum sample size for CV
-        min_samples_for_cv = self.cv_folds * 2
+        min_samples_for_cv = self.cv_folds * 2
        if len(df) < min_samples_for_cv:
             logger.warning(f"Dataset size ({len(df)}) is small for {self.cv_folds}-fold CV")
-            # Adjust CV folds for small datasets
             self.cv_manager.cv_folds = max(2, len(df) // 3)
             logger.info(f"Adjusted CV folds to {self.cv_manager.cv_folds}")
 
@@ -519,47 +571,79 @@
         logger.error(error_msg)
         return False, None, error_msg
 
-    def create_preprocessing_pipeline(self) -> Pipeline:
-        """Create preprocessing pipeline"""
+    def create_preprocessing_pipeline(self, use_enhanced: bool = None) -> Pipeline:
+        """Create preprocessing pipeline with optional enhanced features"""
+
+        if use_enhanced is None:
+            use_enhanced = self.use_enhanced_features
 
         if self.progress_tracker:
-            ...
+            feature_type = "enhanced" if use_enhanced else "standard"
+            self.progress_tracker.update(f"Creating {feature_type} pipeline")
 
-        ...
+        if use_enhanced and ENHANCED_FEATURES_AVAILABLE:
+            logger.info("Creating enhanced feature engineering pipeline...")
+
+            # Create enhanced feature engineer
+            feature_engineer = AdvancedFeatureEngineer(
+                enable_sentiment=True,
+                enable_readability=True,
+                enable_entities=True,
+                enable_linguistic=True,
+                feature_selection_k=self.feature_selection_k,
+                tfidf_max_features=self.max_features,
+                ngram_range=self.ngram_range,
+                min_df=self.min_df,
+                max_df=self.max_df
+            )
+
+            # Create pipeline with enhanced features
+            pipeline = Pipeline([
+                ('enhanced_features', feature_engineer),
+                ('model', None)  # Will be set during training
+            ])
+
+            # Store reference for later use
+            self.feature_engineer = feature_engineer
+
+        else:
+            logger.info("Creating standard TF-IDF pipeline...")
+
+            # Use the standalone function instead of lambda
+            text_preprocessor = FunctionTransformer(
+                func=preprocess_text_function,
+                validate=False
+            )
+
+            # TF-IDF vectorization with optimized parameters
+            vectorizer = TfidfVectorizer(
+                max_features=self.max_features,
+                min_df=self.min_df,
+                max_df=self.max_df,
+                ngram_range=self.ngram_range,
+                stop_words='english',
+                sublinear_tf=True,
+                norm='l2'
+            )
+
+            # Feature selection
+            feature_selector = SelectKBest(
+                score_func=chi2,
+                k=min(self.feature_selection_k, self.max_features)
+            )
+
+            # Create standard pipeline
+            pipeline = Pipeline([
+                ('preprocess', text_preprocessor),
+                ('vectorize', vectorizer),
+                ('feature_select', feature_selector),
+                ('model', None)  # Will be set during training
+            ])
 
         return pipeline
 
     def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
-        """Comprehensive model evaluation with
+        """Comprehensive model evaluation with enhanced feature analysis"""
 
         if self.progress_tracker:
             self.progress_tracker.update("Evaluating model")
@@ -597,6 +681,25 @@
             cv_f1_std = cv_results['test_scores']['f1']['std']
             logger.info(f"CV F1 Score: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})")
 
+        # Enhanced feature analysis
+        if self.use_enhanced_features and self.feature_engineer is not None:
+            try:
+                # Get feature importance if available
+                if hasattr(self.feature_engineer, 'get_feature_importance'):
+                    feature_importance = self.feature_engineer.get_feature_importance(top_k=20)
+                    metrics['top_features'] = feature_importance
+
+                # Get feature metadata
+                if hasattr(self.feature_engineer, 'get_feature_metadata'):
+                    feature_metadata = self.feature_engineer.get_feature_metadata()
+                    metrics['feature_metadata'] = feature_metadata
+
+                logger.info(f"Enhanced features used: {feature_metadata['total_features']}")
+                logger.info(f"Feature breakdown: {feature_metadata['feature_types']}")
+
+            except Exception as e:
+                logger.warning(f"Enhanced feature analysis failed: {e}")
+
         # Training accuracy for overfitting detection
         try:
             if X_train is not None and y_train is not None:
@@ -614,7 +717,8 @@
         """Perform hyperparameter tuning with nested cross-validation"""
 
         if self.progress_tracker:
-            self.
+            feature_type = "enhanced" if self.use_enhanced_features else "standard"
+            self.progress_tracker.update(f"Tuning {model_name} with {feature_type} features")
 
         try:
             # Set the model in the pipeline
@@ -709,15 +813,15 @@
             raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}")
 
     def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict:
-        """Train and evaluate multiple models with comprehensive CV"""
+        """Train and evaluate multiple models with enhanced features and comprehensive CV"""
 
         results = {}
 
         for model_name in self.models.keys():
-            logger.info(f"Training {model_name} with
+            logger.info(f"Training {model_name} with {'enhanced' if self.use_enhanced_features else 'standard'} features...")
 
             try:
-                # Create pipeline
+                # Create pipeline (enhanced or standard)
                 pipeline = self.create_preprocessing_pipeline()
 
                 # Hyperparameter tuning with CV
@@ -735,7 +839,8 @@
                     'model': best_model,
                     'tuning_results': tuning_results,
                     'evaluation_metrics': evaluation_metrics,
-                    'training_time': datetime.now().isoformat()
+                    'training_time': datetime.now().isoformat(),
+                    'feature_type': 'enhanced' if self.use_enhanced_features else 'standard'
                 }
 
                 # Log results
@@ -791,7 +896,7 @@
         return best_model_name, best_model, best_metrics
 
     def save_model_artifacts(self, model, model_name: str, metrics: Dict, results: Dict) -> bool:
-        """Save model artifacts and enhanced metadata with
+        """Save model artifacts and enhanced metadata with feature engineering results"""
         try:
             if self.progress_tracker:
                 self.progress_tracker.update("Saving model")
@@ -807,20 +912,37 @@
                 joblib.dump(model, alt_pipeline_path)
                 logger.info(f"✅ Saved pipeline to {alt_pipeline_path}")
 
-            # Save
-            ...
-            logger.info(f"✅ Saved
-            ...
+            # Save enhanced feature engineer if available
+            if self.use_enhanced_features and self.feature_engineer is not None:
+                try:
+                    self.feature_engineer.save_pipeline(self.feature_engineer_path)
+                    logger.info(f"✅ Saved feature engineer to {self.feature_engineer_path}")
+                except Exception as e:
+                    logger.warning(f"Could not save feature engineer: {e}")
 
+            # Save individual components for backward compatibility
             try:
-                if hasattr(model, 'named_steps')
-                ...
+                if hasattr(model, 'named_steps'):
+                    if 'model' in model.named_steps:
+                        joblib.dump(model.named_steps['model'], self.model_path)
+                        logger.info(f"✅ Saved model component to {self.model_path}")
+
+                    # Save vectorizer (standard pipeline) or enhanced features reference
+                    if 'vectorize' in model.named_steps:
+                        joblib.dump(model.named_steps['vectorize'], self.vectorizer_path)
+                        logger.info(f"✅ Saved vectorizer to {self.vectorizer_path}")
+                    elif 'enhanced_features' in model.named_steps:
+                        # Save reference to enhanced features
+                        enhanced_ref = {
+                            'type': 'enhanced_features',
+                            'feature_engineer_path': str(self.feature_engineer_path),
+                            'metadata': self.feature_engineer.get_feature_metadata() if self.feature_engineer else {}
+                        }
+                        joblib.dump(enhanced_ref, self.vectorizer_path)
+                        logger.info(f"✅ Saved enhanced features reference to {self.vectorizer_path}")
+
             except Exception as e:
-                logger.warning(f"Could not save
+                logger.warning(f"Could not save individual components: {e}")
 
             # Generate data hash
             data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
@@ -828,10 +950,15 @@
             # Extract CV results
             cv_results = metrics.get('cross_validation', {})
 
-            # Create enhanced metadata with
+            # Create enhanced metadata with feature engineering information
            metadata = {
                 'model_version': f"v1.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
                 'model_type': model_name,
+                'feature_engineering': {
+                    'type': 'enhanced' if self.use_enhanced_features else 'standard',
+                    'enhanced_features_available': ENHANCED_FEATURES_AVAILABLE,
+                    'enhanced_features_used': self.use_enhanced_features
+                },
                 'data_version': data_hash,
                 'test_accuracy': metrics['accuracy'],
                 'test_f1': metrics['f1'],
@@ -845,10 +972,42 @@
                     'cv_folds': self.cv_folds,
                     'max_features': self.max_features,
                     'ngram_range': self.ngram_range,
-                    'feature_selection_k': self.feature_selection_k
+                    'feature_selection_k': self.feature_selection_k,
+                    'use_enhanced_features': self.use_enhanced_features
                 }
             }
 
+            # Add enhanced feature metadata
+            if self.use_enhanced_features:
+                feature_metadata = metrics.get('feature_metadata', {})
+                if feature_metadata:
+                    metadata['enhanced_features'] = {
+                        'total_features': feature_metadata.get('total_features', 0),
+                        'feature_types': feature_metadata.get('feature_types', {}),
+                        'configuration': feature_metadata.get('configuration', {})
+                    }
+
+                # Add top features if available
+                top_features = metrics.get('top_features', {})
+                if top_features:
+                    metadata['top_features'] = dict(list(top_features.items())[:10])  # Top 10 features
+
+                # Save detailed feature importance
+                try:
+                    feature_analysis = {
+                        'top_features': top_features,
+                        'feature_metadata': feature_metadata,
+                        'timestamp': datetime.now().isoformat(),
+                        'model_version': metadata['model_version']
+                    }
+
+                    with open(self.feature_importance_path, 'w') as f:
+                        json.dump(feature_analysis, f, indent=2)
+                    logger.info(f"✅ Saved feature importance analysis to {self.feature_importance_path}")
+
+                except Exception as e:
+                    logger.warning(f"Could not save feature importance: {e}")
+
             # Add comprehensive CV results to metadata
             if cv_results and 'test_scores' in cv_results:
                 metadata['cross_validation'] = {
@@ -892,7 +1051,14 @@
             except Exception as e:
                 logger.warning(f"Could not save metadata: {e}")
 
-            ...
+            # Log feature engineering summary
+            if self.use_enhanced_features and feature_metadata:
+                logger.info(f"✅ Enhanced features summary:")
+                logger.info(f"   Total features: {feature_metadata.get('total_features', 0)}")
+                for feature_type, count in feature_metadata.get('feature_types', {}).items():
+                    logger.info(f"   {feature_type}: {count}")
+
+            logger.info(f"✅ Model artifacts saved successfully with {'enhanced' if self.use_enhanced_features else 'standard'} features")
             return True
 
         except Exception as e:
@@ -906,10 +1072,18 @@
                 logger.error(f"Failed to save backup pipeline: {str(e2)}")
                 return False
 
-    def train_model(self, data_path: str = None) -> Tuple[bool, str]:
-        """Main training function with
+    def train_model(self, data_path: str = None, force_enhanced: bool = None) -> Tuple[bool, str]:
+        """Main training function with enhanced feature engineering pipeline"""
         try:
-            ...
+            # Override enhanced features setting if specified
+            if force_enhanced is not None:
+                original_setting = self.use_enhanced_features
+                self.use_enhanced_features = force_enhanced and ENHANCED_FEATURES_AVAILABLE
+                if force_enhanced and not ENHANCED_FEATURES_AVAILABLE:
+                    logger.warning("Enhanced features requested but not available, using standard features")
+
+            feature_type = "enhanced" if self.use_enhanced_features else "standard"
+            logger.info(f"Starting {feature_type} model training with cross-validation...")
 
             # Override data path if provided
             if data_path:
@@ -924,20 +1098,26 @@
             time_estimate = estimate_training_time(
                 len(df),
                 enable_tuning=True,
-                cv_folds=self.cv_folds
+                cv_folds=self.cv_folds,
+                use_enhanced_features=self.use_enhanced_features
             )
 
             print(f"\n📊 Enhanced Training Configuration:")
             print(f"Dataset size: {len(df)} samples")
+            print(f"Feature engineering: {feature_type.title()}")
             print(f"Cross-validation folds: {self.cv_folds}")
             print(f"Estimated time: {time_estimate['total_formatted']}")
             print(f"Models to train: {len(self.models)}")
             print(f"Hyperparameter tuning: Enabled")
+            if self.use_enhanced_features:
+                print(f"Enhanced features: Sentiment, Readability, Entities, Linguistic")
             print()
 
-            # Setup progress tracker (
-            ...
-            self.
+            # Setup progress tracker (adjusted for enhanced features)
+            base_steps = 4 + (len(self.models) * 3) + 1  # Basic steps
+            enhanced_steps = 2 if self.use_enhanced_features else 0  # Feature engineering steps
+            total_steps = base_steps + enhanced_steps
+            self.progress_tracker = ProgressTracker(total_steps, f"{feature_type.title()} Training Progress")
 
             # Prepare data
             X = df['text'].values
@@ -972,20 +1152,20 @@
             if len(X_test) < 1:
                 return False, "Cannot create test set. Dataset too small."
 
-            # Train and evaluate models with
+            # Train and evaluate models with enhanced features
            results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test)
 
             # Select best model
             best_model_name, best_model, best_metrics = self.select_best_model(results)
 
-            # Save model artifacts with
+            # Save model artifacts with enhanced feature information
             if not self.save_model_artifacts(best_model, best_model_name, best_metrics, results):
                 return False, "Failed to save model artifacts"
 
             # Finish progress tracking
             self.progress_tracker.finish()
 
-            # Create success message with
+            # Create success message with enhanced feature information
             cv_results = best_metrics.get('cross_validation', {})
             cv_info = ""
             if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
@@ -993,10 +1173,18 @@
                 cv_f1_std = cv_results['test_scores']['f1']['std']
                 cv_info = f", CV F1: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})"
 
+            # Enhanced features summary
+            feature_info = ""
+            if self.use_enhanced_features:
+                feature_metadata = best_metrics.get('feature_metadata', {})
+                if feature_metadata:
+                    total_features = feature_metadata.get('total_features', 0)
+                    feature_info = f", Enhanced Features: {total_features}"
+
             success_message = (
-                f"
+                f"{feature_type.title()} model training completed successfully. "
                 f"Best model: {best_model_name} "
-                f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info})"
+                f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info}{feature_info})"
             )
 
             logger.info(success_message)
@@ -1011,17 +1199,30 @@
 
 
 def main():
-    """Main execution function with enhanced
+    """Main execution function with enhanced feature engineering support"""
     import argparse
 
     # Parse command line arguments
-    parser = argparse.ArgumentParser(description='Train fake news detection model with
+    parser = argparse.ArgumentParser(description='Train fake news detection model with enhanced features')
     parser.add_argument('--data_path', type=str, help='Path to training data CSV file')
     parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file')
     parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds')
+    parser.add_argument('--enhanced_features', action='store_true', help='Force use of enhanced features')
+    parser.add_argument('--standard_features', action='store_true', help='Force use of standard TF-IDF features only')
     args = parser.parse_args()
 
-    ...
+    # Determine feature engineering mode
+    use_enhanced = None
+    if args.enhanced_features and args.standard_features:
+        logger.warning("Both --enhanced_features and --standard_features specified. Using auto-detection.")
+    elif args.enhanced_features:
+        use_enhanced = True
+        logger.info("Enhanced features explicitly requested")
+    elif args.standard_features:
+        use_enhanced = False
+        logger.info("Standard features explicitly requested")
+
+    trainer = EnhancedModelTrainer(use_enhanced_features=use_enhanced)
 
     # Apply CV folds from command line
     if args.cv_folds:
@@ -1041,6 +1242,10 @@ def main():
             trainer.max_features = config.get('max_features', trainer.max_features)
             trainer.ngram_range = tuple(config.get('ngram_range', trainer.ngram_range))
 
+            # Enhanced feature configuration
+            if 'enhanced_features' in config and use_enhanced is None:
+                trainer.use_enhanced_features = config['enhanced_features'] and ENHANCED_FEATURES_AVAILABLE
+
             # Filter models if specified
             selected_models = config.get('selected_models')
             if selected_models and len(selected_models) < len(trainer.models):
@@ -1050,7 +1255,9 @@ def main():
             # Update feature selection based on max_features
             trainer.feature_selection_k = min(trainer.feature_selection_k, trainer.max_features)
 
-            logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds
+            logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds")
+            if trainer.use_enhanced_features:
+                logger.info("Enhanced features enabled via configuration")
 
         except Exception as e:
             logger.warning(f"Failed to load configuration: {e}, using defaults")
@@ -1059,6 +1266,17 @@ def main():
 
     if success:
         print(f"✅ {message}")
+
+        # Print feature engineering summary
+        if trainer.use_enhanced_features and trainer.feature_engineer:
+            try:
+                metadata = trainer.feature_engineer.get_feature_metadata()
+                print(f"\n📊 Enhanced Feature Engineering Summary:")
+                print(f"Total features generated: {metadata['total_features']}")
+                for feature_type, count in metadata['feature_types'].items():
+                    print(f"  {feature_type}: {count}")
+            except Exception as e:
+                logger.warning(f"Could not display feature summary: {e}")
    else:
         print(f"❌ {message}")
         exit(1)
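For reference, a minimal way to drive the new entry points, using only the signatures visible in this diff. This is a hedged sketch: it assumes the repository root is on sys.path so that model/ imports as a package, and the dataset path is illustrative rather than taken from the commit.

# Hypothetical usage sketch; class and method signatures come from the diff above.
from model.train import EnhancedModelTrainer

# use_enhanced_features=None auto-detects; True still falls back to standard
# TF-IDF when the features package cannot be imported.
trainer = EnhancedModelTrainer(use_enhanced_features=True)
success, message = trainer.train_model(data_path="data/combined_dataset.csv")
print(message)

# Equivalent CLI run, via the argparse flags added in this commit:
#   python model/train.py --data_path data/combined_dataset.csv --enhanced_features --cv_folds 5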