|
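"""Training script for the fake news detection model.

Trains individual classifiers (logistic regression, random forest and, when the
library is installed, LightGBM), optionally combines them into a soft-voting
ensemble, evaluates everything with stratified cross-validation, and saves the
best pipeline plus metadata under /tmp.
"""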
|
|
|
|
|
|
|
import seaborn as sns |
|
|
import matplotlib.pyplot as plt |
|
|
from sklearn.feature_selection import SelectKBest, chi2 |
|
|
from sklearn.preprocessing import FunctionTransformer |
|
|
from sklearn.pipeline import Pipeline |
|
|
from sklearn.metrics import ( |
|
|
accuracy_score, precision_score, recall_score, f1_score, |
|
|
roc_auc_score, confusion_matrix, classification_report, |
|
|
precision_recall_curve, roc_curve |
|
|
) |
|
|
from sklearn.model_selection import ( |
|
|
train_test_split, cross_val_score, GridSearchCV, |
|
|
StratifiedKFold, validation_curve, cross_validate |
|
|
) |
|
|
from sklearn.ensemble import RandomForestClassifier, VotingClassifier |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
import pandas as pd |
|
|
import numpy as np
from scipy import sparse  # used to stack sparse feature matrices during ensemble cross-validation
|
|
from pathlib import Path |
|
|
import logging |
|
|
import json |
|
|
import joblib |
|
|
import hashlib |
|
|
import sys |
|
|
import os |
|
|
import time |
|
|
from datetime import datetime, timedelta |
|
|
from typing import Dict, Tuple, Optional, Any, List |
|
|
import warnings |
|
|
import re |
|
|
|
|
|
|
|
|
try: |
|
|
import lightgbm as lgb |
|
|
LIGHTGBM_AVAILABLE = True |
|
|
logging.info("LightGBM available for ensemble training") |
|
|
except ImportError: |
|
|
LIGHTGBM_AVAILABLE = False |
|
|
logging.warning("LightGBM not available - ensemble training will use alternative algorithms") |
|
|
|
|
|
warnings.filterwarnings('ignore') |
|
|
|
|
|
|
|
|
try: |
|
|
from features.feature_engineer import AdvancedFeatureEngineer, create_enhanced_pipeline, analyze_feature_importance |
|
|
from features.sentiment_analyzer import SentimentAnalyzer |
|
|
from features.readability_analyzer import ReadabilityAnalyzer |
|
|
from features.entity_analyzer import EntityAnalyzer |
|
|
from features.linguistic_analyzer import LinguisticAnalyzer |
|
|
ENHANCED_FEATURES_AVAILABLE = True |
|
|
logger = logging.getLogger(__name__) |
|
|
logger.info("Enhanced feature engineering components loaded successfully") |
|
|
except ImportError as e: |
|
|
ENHANCED_FEATURES_AVAILABLE = False |
|
|
logger = logging.getLogger(__name__) |
|
|
logger.warning(f"Enhanced features not available, falling back to basic TF-IDF: {e}") |
|
|
|
|
|
|
|
|
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,  # the LightGBM availability check above logs first and installs a default handler
|
|
handlers=[ |
|
|
logging.FileHandler('/tmp/model_training.log'), |
|
|
logging.StreamHandler() |
|
|
] |
|
|
) |
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
class EnsembleModelTrainer: |
|
|
"""Production-ready ensemble model trainer with LightGBM integration""" |
|
|
|
|
|
    def __init__(self, use_enhanced_features: Optional[bool] = None, use_ensemble: bool = True):
|
|
|
|
|
if use_enhanced_features is None: |
|
|
self.use_enhanced_features = ENHANCED_FEATURES_AVAILABLE |
|
|
else: |
|
|
self.use_enhanced_features = use_enhanced_features and ENHANCED_FEATURES_AVAILABLE |
|
|
|
|
|
self.use_ensemble = use_ensemble and LIGHTGBM_AVAILABLE |
|
|
|
|
|
self.setup_paths() |
|
|
self.setup_training_config() |
|
|
self.setup_models() |
|
|
self.progress_tracker = None |
|
|
self.cv_manager = CrossValidationManager() |
|
|
|
|
|
|
|
|
self.feature_engineer = None |
|
|
self.feature_importance_results = {} |
|
|
|
|
|
logger.info(f"Ensemble trainer initialized - Enhanced features: {self.use_enhanced_features}, " |
|
|
f"LightGBM ensemble: {self.use_ensemble}") |
|
|
|
|
|
def setup_paths(self): |
|
|
"""Setup all necessary paths with proper permissions""" |
|
|
self.base_dir = Path("/tmp") |
|
|
self.data_dir = self.base_dir / "data" |
|
|
self.model_dir = self.base_dir / "model" |
|
|
self.results_dir = self.base_dir / "results" |
|
|
self.features_dir = self.base_dir / "features" |
|
|
|
|
|
|
|
|
for dir_path in [self.data_dir, self.model_dir, self.results_dir, self.features_dir]: |
|
|
dir_path.mkdir(parents=True, exist_ok=True) |
|
|
try: |
|
|
dir_path.chmod(0o755) |
|
|
            except OSError:
|
|
pass |
|
|
|
|
|
|
|
|
self.data_path = self.data_dir / "combined_dataset.csv" |
|
|
self.model_path = Path("/tmp/model.pkl") |
|
|
self.vectorizer_path = Path("/tmp/vectorizer.pkl") |
|
|
self.pipeline_path = Path("/tmp/pipeline.pkl") |
|
|
self.metadata_path = Path("/tmp/metadata.json") |
|
|
self.evaluation_path = self.results_dir / "evaluation_results.json" |
|
|
|
|
|
|
|
|
self.feature_engineer_path = Path("/tmp/feature_engineer.pkl") |
|
|
self.feature_importance_path = self.results_dir / "feature_importance.json" |
|
|
|
|
|
|
|
|
self.ensemble_path = Path("/tmp/ensemble.pkl") |
|
|
self.ensemble_metadata_path = Path("/tmp/ensemble_metadata.json") |
|
|
|
|
|
def setup_training_config(self): |
|
|
"""Setup training configuration with ensemble parameters""" |
|
|
self.test_size = 0.2 |
|
|
self.validation_size = 0.1 |
|
|
self.random_state = 42 |
|
|
self.cv_folds = 5 |
|
|
|
|
|
|
|
|
if self.use_enhanced_features: |
|
|
self.max_features = 7500 |
|
|
self.feature_selection_k = 3000 |
|
|
logger.info("Using enhanced feature engineering pipeline") |
|
|
else: |
|
|
self.max_features = 5000 |
|
|
self.feature_selection_k = 2000 |
|
|
logger.info("Using standard TF-IDF feature pipeline") |
|
|
|
|
|
|
|
|
self.min_df = 1 |
|
|
self.max_df = 0.95 |
|
|
self.ngram_range = (1, 2) |
|
|
self.max_iter = 500 |
|
|
self.class_weight = 'balanced' |
|
|
|
|
|
|
|
|
self.lgb_params = { |
|
|
'objective': 'binary', |
|
|
'metric': 'binary_logloss', |
|
|
'boosting_type': 'gbdt', |
|
|
'num_leaves': 31, |
|
|
'learning_rate': 0.1, |
|
|
'feature_fraction': 0.8, |
|
|
'bagging_fraction': 0.8, |
|
|
'bagging_freq': 5, |
|
|
'verbose': -1, |
|
|
'random_state': self.random_state, |
|
|
'class_weight': 'balanced' |
|
|
} |
|
|
|
|
|
def setup_models(self): |
|
|
"""Setup model configurations including LightGBM ensemble""" |
|
|
|
|
|
self.models = { |
|
|
'logistic_regression': { |
|
|
'model': LogisticRegression( |
|
|
max_iter=self.max_iter, |
|
|
class_weight=self.class_weight, |
|
|
random_state=self.random_state, |
|
|
n_jobs=-1 |
|
|
), |
|
|
'param_grid': { |
|
|
'model__C': [0.1, 1, 10], |
|
|
'model__penalty': ['l2'] |
|
|
} |
|
|
}, |
|
|
'random_forest': { |
|
|
'model': RandomForestClassifier( |
|
|
n_estimators=50, |
|
|
class_weight=self.class_weight, |
|
|
random_state=self.random_state, |
|
|
n_jobs=-1 |
|
|
), |
|
|
'param_grid': { |
|
|
'model__n_estimators': [50, 100], |
|
|
'model__max_depth': [10, None] |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
if LIGHTGBM_AVAILABLE and self.use_ensemble: |
|
|
self.models['lightgbm'] = { |
|
|
'model': lgb.LGBMClassifier( |
|
|
**self.lgb_params, |
|
|
n_estimators=100 |
|
|
), |
|
|
'param_grid': { |
|
|
'model__n_estimators': [50, 100], |
|
|
'model__learning_rate': [0.05, 0.1], |
|
|
'model__num_leaves': [31, 63] |
|
|
} |
|
|
} |
|
|
|
|
|
    def create_lightgbm_ensemble(self, models_dict: Dict, X_train, y_train) -> Optional[VotingClassifier]:
|
|
"""Create ensemble with LightGBM and traditional models""" |
|
|
if not LIGHTGBM_AVAILABLE: |
|
|
logger.warning("LightGBM not available for ensemble creation") |
|
|
return None |
|
|
|
|
|
logger.info("Creating LightGBM ensemble model...") |
|
|
|
|
|
|
|
|
estimators = [] |
|
|
|
|
|
        for model_name, model_info in models_dict.items():
            # train_and_evaluate_models stores each fitted pipeline under 'model'
            # (the grid-search winner is also kept in tuning_results['best_estimator']).
            model = model_info.get('model')
            if model is None:
                model = model_info.get('tuning_results', {}).get('best_estimator')
            if model is None:
                continue

            # Pull the bare classifier out of the pipeline so the ensemble can be
            # fitted on features that have already been preprocessed.
            if hasattr(model, 'named_steps') and 'model' in model.named_steps:
                actual_model = model.named_steps['model']
            else:
                actual_model = model

            estimators.append((model_name, actual_model))
|
|
|
|
|
if len(estimators) < 2: |
|
|
logger.warning("Not enough models for ensemble creation") |
|
|
return None |
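        # Soft voting averages each estimator's predicted class probabilities, so
        # every component must implement predict_proba (true for LogisticRegression,
        # RandomForestClassifier and LGBMClassifier as configured above).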
|
|
|
|
|
|
|
|
ensemble = VotingClassifier( |
|
|
estimators=estimators, |
|
|
voting='soft' |
|
|
) |
|
|
|
|
|
logger.info(f"Ensemble created with {len(estimators)} models: {[name for name, _ in estimators]}") |
|
|
return ensemble |
|
|
|
|
|
def train_ensemble_model(self, X_train, X_test, y_train, y_test, individual_results: Dict) -> Dict: |
|
|
"""Train and evaluate ensemble model""" |
|
|
if not self.use_ensemble or not LIGHTGBM_AVAILABLE: |
|
|
logger.info("Ensemble training skipped - using best individual model") |
|
|
return {} |
|
|
|
|
|
logger.info("Training ensemble model with LightGBM integration...") |
|
|
|
|
|
try: |
|
|
|
|
|
ensemble = self.create_lightgbm_ensemble(individual_results, X_train, y_train) |
|
|
|
|
|
if ensemble is None: |
|
|
return {'error': 'Failed to create ensemble'} |
|
|
|
|
|
|
|
|
logger.info("Training ensemble voting mechanism...") |
|
|
|
|
|
|
|
|
|
|
|
pipeline = self.create_preprocessing_pipeline() |
|
|
X_train_processed = pipeline.fit_transform(X_train, y_train) |
|
|
X_test_processed = pipeline.transform(X_test) |
|
|
|
|
|
|
|
|
ensemble.fit(X_train_processed, y_train) |
|
|
|
|
|
|
|
|
ensemble_metrics = self.comprehensive_evaluation_ensemble( |
|
|
ensemble, X_test_processed, y_test, X_train_processed, y_train |
|
|
) |
|
|
|
|
|
|
|
|
            # Reuse every fitted preprocessing step (everything before the 'model'
            # placeholder) so the saved pipeline can run from raw text to prediction.
            ensemble_pipeline = Pipeline(
                pipeline.steps[:-1] + [('ensemble', ensemble)]
            )
|
|
|
|
|
ensemble_results = { |
|
|
'ensemble': ensemble_pipeline, |
|
|
'evaluation_metrics': ensemble_metrics, |
|
|
'component_models': list(individual_results.keys()), |
|
|
'ensemble_type': 'voting_classifier_with_lightgbm' if 'lightgbm' in individual_results else 'voting_classifier', |
|
|
'training_time': datetime.now().isoformat(), |
|
|
'feature_type': 'enhanced' if self.use_enhanced_features else 'standard' |
|
|
} |
|
|
|
|
|
            ensemble_f1 = ensemble_metrics.get('f1')
            f1_text = f"{ensemble_f1:.4f}" if isinstance(ensemble_f1, float) else "N/A"
            logger.info(f"Ensemble training completed - F1: {f1_text}")
|
|
return ensemble_results |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"Ensemble training failed: {str(e)}") |
|
|
return {'error': str(e)} |
|
|
|
|
|
def comprehensive_evaluation_ensemble(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict: |
|
|
"""Comprehensive evaluation specifically for ensemble models""" |
|
|
|
|
|
logger.info("Evaluating ensemble model...") |
|
|
|
|
|
|
|
|
y_pred = model.predict(X_test) |
|
|
y_pred_proba = model.predict_proba(X_test)[:, 1] |
|
|
|
|
|
|
|
|
metrics = { |
|
|
'accuracy': float(accuracy_score(y_test, y_pred)), |
|
|
'precision': float(precision_score(y_test, y_pred, average='weighted')), |
|
|
'recall': float(recall_score(y_test, y_pred, average='weighted')), |
|
|
'f1': float(f1_score(y_test, y_pred, average='weighted')), |
|
|
'roc_auc': float(roc_auc_score(y_test, y_pred_proba)) |
|
|
} |
|
|
|
|
|
|
|
|
cm = confusion_matrix(y_test, y_pred) |
|
|
metrics['confusion_matrix'] = cm.tolist() |
|
|
|
|
|
|
|
|
        if X_train is not None and y_train is not None:
            # Processed features are often scipy sparse matrices here, which
            # np.concatenate cannot stack, so use sparse.vstack in that case.
            if sparse.issparse(X_train):
                X_full = sparse.vstack([X_train, X_test])
            else:
                X_full = np.concatenate([X_train, X_test])
            y_full = np.concatenate([y_train, y_test])
|
|
|
|
|
logger.info("Performing cross-validation on ensemble...") |
|
|
cv_results = self.cv_manager.perform_cross_validation(model, X_full, y_full) |
|
|
metrics['cross_validation'] = cv_results |
|
|
|
|
|
if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']: |
|
|
cv_f1_mean = cv_results['test_scores']['f1']['mean'] |
|
|
cv_f1_std = cv_results['test_scores']['f1']['std'] |
|
|
logger.info(f"Ensemble CV F1 Score: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})") |
|
|
|
|
|
|
|
|
metrics['ensemble_info'] = { |
|
|
'model_type': 'ensemble', |
|
|
'voting_type': getattr(model, 'voting', 'unknown'), |
|
|
'n_estimators': len(getattr(model, 'estimators_', [])), |
|
|
'estimator_names': [name for name, _ in getattr(model, 'estimators', [])] |
|
|
} |
|
|
|
|
|
return metrics |
|
|
|
|
|
def select_best_model(self, results: Dict, ensemble_results: Dict = None) -> Tuple[str, Any, Dict]: |
|
|
"""Select the best performing model including ensemble option""" |
|
|
|
|
|
logger.info("Selecting best model from individual models and ensemble...") |
|
|
|
|
|
best_model_name = None |
|
|
best_model = None |
|
|
best_score = -1 |
|
|
best_metrics = None |
|
|
|
|
|
|
|
|
for model_name, result in results.items(): |
|
|
if 'error' in result: |
|
|
continue |
|
|
|
|
|
            cv_results = result['evaluation_metrics'].get('cross_validation', {})
            if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
                model_f1 = cv_results['test_scores']['f1']['mean']
                score_type = "CV F1"
            else:
                model_f1 = result['evaluation_metrics']['f1']
                score_type = "Test F1"

            logger.info(f"Model {model_name}: {score_type} = {model_f1:.4f}")

            if model_f1 > best_score:
                best_score = model_f1
                best_model_name = model_name
                best_model = result['model']
                best_metrics = result['evaluation_metrics']
|
|
|
|
|
|
|
|
if ensemble_results and 'evaluation_metrics' in ensemble_results: |
|
|
ensemble_metrics = ensemble_results['evaluation_metrics'] |
|
|
|
|
|
cv_results = ensemble_metrics.get('cross_validation', {}) |
|
|
if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']: |
|
|
ensemble_f1 = cv_results['test_scores']['f1']['mean'] |
|
|
score_type = "CV F1" |
|
|
else: |
|
|
ensemble_f1 = ensemble_metrics['f1'] |
|
|
score_type = "Test F1" |
|
|
|
|
|
logger.info(f"Ensemble model: {score_type} = {ensemble_f1:.4f}") |
|
|
|
|
|
if ensemble_f1 > best_score: |
|
|
best_score = ensemble_f1 |
|
|
best_model_name = "ensemble" |
|
|
best_model = ensemble_results['ensemble'] |
|
|
best_metrics = ensemble_metrics |
|
|
|
|
|
if best_model_name is None: |
|
|
raise ValueError("No models trained successfully") |
|
|
|
|
|
logger.info(f"Best model selected: {best_model_name} with F1 score: {best_score:.4f}") |
|
|
return best_model_name, best_model, best_metrics |
|
|
|
|
|
def save_model_artifacts(self, model, model_name: str, metrics: Dict, results: Dict, |
|
|
ensemble_results: Dict = None) -> bool: |
|
|
"""Enhanced model artifacts saving with ensemble support""" |
|
|
try: |
|
|
logger.info(f"Saving model artifacts for {model_name}...") |
|
|
|
|
|
|
|
|
if model_name == "ensemble": |
|
|
|
|
|
joblib.dump(model, self.ensemble_path) |
|
|
logger.info(f"Saved ensemble model to {self.ensemble_path}") |
|
|
|
|
|
|
|
|
joblib.dump(model, self.pipeline_path) |
|
|
logger.info(f"Saved ensemble as main pipeline to {self.pipeline_path}") |
|
|
|
|
|
|
|
|
ensemble_metadata = { |
|
|
'model_type': 'ensemble', |
|
|
'ensemble_type': ensemble_results.get('ensemble_type', 'voting_classifier'), |
|
|
'component_models': ensemble_results.get('component_models', []), |
|
|
'ensemble_info': metrics.get('ensemble_info', {}), |
|
|
'timestamp': datetime.now().isoformat() |
|
|
} |
|
|
|
|
|
with open(self.ensemble_metadata_path, 'w') as f: |
|
|
json.dump(ensemble_metadata, f, indent=2) |
|
|
logger.info(f"Saved ensemble metadata to {self.ensemble_metadata_path}") |
|
|
|
|
|
else: |
|
|
|
|
|
joblib.dump(model, self.pipeline_path) |
|
|
logger.info(f"Saved {model_name} pipeline to {self.pipeline_path}") |
|
|
|
|
|
|
|
|
try: |
|
|
if hasattr(model, 'named_steps'): |
|
|
if 'model' in model.named_steps: |
|
|
joblib.dump(model.named_steps['model'], self.model_path) |
|
|
elif 'ensemble' in model.named_steps: |
|
|
joblib.dump(model.named_steps['ensemble'], self.model_path) |
|
|
|
|
|
|
|
|
if 'vectorize' in model.named_steps: |
|
|
joblib.dump(model.named_steps['vectorize'], self.vectorizer_path) |
|
|
elif 'enhanced_features' in model.named_steps: |
|
|
enhanced_ref = { |
|
|
'type': 'enhanced_features', |
|
|
'feature_engineer_path': str(self.feature_engineer_path), |
|
|
'metadata': self.feature_engineer.get_feature_metadata() if self.feature_engineer else {} |
|
|
} |
|
|
joblib.dump(enhanced_ref, self.vectorizer_path) |
|
|
|
|
|
except Exception as e: |
|
|
logger.warning(f"Could not save individual components: {e}") |
|
|
|
|
|
|
|
|
metadata = self._create_enhanced_metadata(model_name, metrics, results, ensemble_results) |
|
|
|
|
|
|
|
|
with open(self.metadata_path, 'w') as f: |
|
|
json.dump(metadata, f, indent=2) |
|
|
logger.info(f"Saved metadata to {self.metadata_path}") |
|
|
|
|
|
return True |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"Failed to save model artifacts: {str(e)}") |
|
|
return False |
|
|
|
|
|
def _create_enhanced_metadata(self, model_name: str, metrics: Dict, results: Dict, |
|
|
ensemble_results: Dict = None) -> Dict: |
|
|
"""Create comprehensive metadata including ensemble information""" |
|
|
|
|
|
|
|
|
        # Placeholder data version: a hash of the current timestamp, not of the dataset contents
        data_hash = hashlib.md5(str(datetime.now()).encode()).hexdigest()
|
|
version_suffix = "ensemble" if model_name == "ensemble" else model_name |
|
|
|
|
|
metadata = { |
|
|
'model_version': f"v2.0_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{version_suffix}", |
|
|
'model_type': model_name, |
|
|
'is_ensemble': model_name == "ensemble", |
|
|
'data_version': data_hash, |
|
|
'test_accuracy': metrics['accuracy'], |
|
|
'test_f1': metrics['f1'], |
|
|
'test_precision': metrics['precision'], |
|
|
'test_recall': metrics['recall'], |
|
|
'test_roc_auc': metrics['roc_auc'], |
|
|
'timestamp': datetime.now().isoformat(), |
|
|
'training_method': 'enhanced_ensemble_training' if self.use_ensemble else 'enhanced_individual_training', |
|
|
'lightgbm_available': LIGHTGBM_AVAILABLE, |
|
|
'lightgbm_used': self.use_ensemble and LIGHTGBM_AVAILABLE |
|
|
} |
|
|
|
|
|
|
|
|
metadata['feature_engineering'] = { |
|
|
'type': 'enhanced' if self.use_enhanced_features else 'standard', |
|
|
'enhanced_features_available': ENHANCED_FEATURES_AVAILABLE, |
|
|
'enhanced_features_used': self.use_enhanced_features |
|
|
} |
|
|
|
|
|
|
|
|
if model_name == "ensemble" and ensemble_results: |
|
|
metadata['ensemble_details'] = { |
|
|
'ensemble_type': ensemble_results.get('ensemble_type', 'voting_classifier'), |
|
|
'component_models': ensemble_results.get('component_models', []), |
|
|
'ensemble_info': metrics.get('ensemble_info', {}), |
|
|
'voting_type': metrics.get('ensemble_info', {}).get('voting_type', 'soft') |
|
|
} |
|
|
|
|
|
|
|
|
metadata['component_performance'] = {} |
|
|
for comp_model_name in ensemble_results.get('component_models', []): |
|
|
if comp_model_name in results and 'evaluation_metrics' in results[comp_model_name]: |
|
|
comp_metrics = results[comp_model_name]['evaluation_metrics'] |
|
|
metadata['component_performance'][comp_model_name] = { |
|
|
'f1': comp_metrics.get('f1', 0), |
|
|
'accuracy': comp_metrics.get('accuracy', 0) |
|
|
} |
|
|
|
|
|
|
|
|
cv_results = metrics.get('cross_validation', {}) |
|
|
if cv_results and 'test_scores' in cv_results: |
|
|
metadata['cross_validation'] = { |
|
|
'n_splits': cv_results.get('n_splits', self.cv_folds), |
|
|
'test_scores': cv_results['test_scores'], |
|
|
'train_scores': cv_results.get('train_scores', {}), |
|
|
'overfitting_score': cv_results.get('overfitting_score', 'Unknown'), |
|
|
'stability_score': cv_results.get('stability_score', 'Unknown') |
|
|
} |
|
|
|
|
|
if 'f1' in cv_results['test_scores']: |
|
|
metadata.update({ |
|
|
'cv_f1_mean': cv_results['test_scores']['f1']['mean'], |
|
|
'cv_f1_std': cv_results['test_scores']['f1']['std'] |
|
|
}) |
|
|
|
|
|
|
|
|
metadata['training_config'] = { |
|
|
'test_size': self.test_size, |
|
|
'cv_folds': self.cv_folds, |
|
|
'max_features': self.max_features, |
|
|
'use_ensemble': self.use_ensemble, |
|
|
'use_enhanced_features': self.use_enhanced_features |
|
|
} |
|
|
|
|
|
return metadata |
|
|
|
|
|
    def train_model(self, data_path: Optional[str] = None, force_enhanced: Optional[bool] = None,
                    use_ensemble: Optional[bool] = None) -> Tuple[bool, str]:
|
|
"""Main training function with ensemble support""" |
|
|
try: |
|
|
|
|
|
if force_enhanced is not None: |
|
|
original_enhanced = self.use_enhanced_features |
|
|
self.use_enhanced_features = force_enhanced and ENHANCED_FEATURES_AVAILABLE |
|
|
|
|
|
if use_ensemble is not None: |
|
|
self.use_ensemble = use_ensemble and LIGHTGBM_AVAILABLE |
|
|
|
|
|
feature_type = "enhanced" if self.use_enhanced_features else "standard" |
|
|
training_type = "ensemble" if self.use_ensemble else "individual" |
|
|
|
|
|
logger.info(f"Starting {feature_type} {training_type} model training...") |
|
|
|
|
|
|
|
|
if data_path: |
|
|
self.data_path = Path(data_path) |
|
|
|
|
|
|
|
|
success, df, message = self.load_and_validate_data() |
|
|
if not success: |
|
|
return False, message |
|
|
|
|
|
|
|
|
time_estimate = estimate_training_time( |
|
|
len(df), |
|
|
enable_tuning=True, |
|
|
cv_folds=self.cv_folds, |
|
|
use_enhanced_features=self.use_enhanced_features, |
|
|
use_ensemble=self.use_ensemble |
|
|
) |
|
|
|
|
|
model_count = len(self.models) |
|
|
logger.info(f"Training Configuration:") |
|
|
logger.info(f" Dataset size: {len(df)} samples") |
|
|
logger.info(f" Feature engineering: {feature_type.title()}") |
|
|
logger.info(f" Training approach: {training_type.title()}") |
|
|
logger.info(f" Models to train: {model_count}") |
|
|
logger.info(f" LightGBM available: {LIGHTGBM_AVAILABLE}") |
|
|
logger.info(f" Estimated time: {time_estimate['total_formatted']}") |
|
|
|
|
|
|
|
|
base_steps = 4 + (model_count * 3) + 2 |
|
|
enhanced_steps = 2 if self.use_enhanced_features else 0 |
|
|
ensemble_steps = 3 if self.use_ensemble else 0 |
|
|
total_steps = base_steps + enhanced_steps + ensemble_steps |
|
|
|
|
|
self.progress_tracker = ProgressTracker( |
|
|
total_steps, |
|
|
f"{feature_type.title()} {training_type.title()} Training" |
|
|
) |
|
|
|
|
|
|
|
|
X = df['text'].values |
|
|
y = df['label'].values |
|
|
|
|
|
|
|
|
self.progress_tracker.update("Splitting data") |
|
|
|
|
|
if len(X) < 10: |
|
|
test_size = max(0.1, 1/len(X)) |
|
|
else: |
|
|
test_size = self.test_size |
|
|
|
|
|
label_counts = pd.Series(y).value_counts() |
|
|
min_class_count = label_counts.min() |
|
|
can_stratify = min_class_count >= 2 and len(y) >= 4 |
|
|
|
|
|
X_train, X_test, y_train, y_test = train_test_split( |
|
|
X, y, |
|
|
test_size=test_size, |
|
|
stratify=y if can_stratify else None, |
|
|
random_state=self.random_state |
|
|
) |
|
|
|
|
|
logger.info(f"Data split: {len(X_train)} train, {len(X_test)} test") |
|
|
|
|
|
|
|
|
results = self.train_and_evaluate_models(X_train, X_test, y_train, y_test) |
|
|
|
|
|
|
|
|
ensemble_results = {} |
|
|
if self.use_ensemble and len([r for r in results.values() if 'error' not in r]) >= 2: |
|
|
self.progress_tracker.update("Creating ensemble model") |
|
|
ensemble_results = self.train_ensemble_model(X_train, X_test, y_train, y_test, results) |
|
|
|
|
|
if ensemble_results and 'error' not in ensemble_results: |
|
|
logger.info("Ensemble model trained successfully") |
|
|
else: |
|
|
logger.warning("Ensemble training failed, using best individual model") |
|
|
|
|
|
|
|
|
best_model_name, best_model, best_metrics = self.select_best_model(results, ensemble_results) |
|
|
|
|
|
|
|
|
if not self.save_model_artifacts(best_model, best_model_name, best_metrics, results, ensemble_results): |
|
|
return False, "Failed to save model artifacts" |
|
|
|
|
|
|
|
|
self.progress_tracker.finish() |
|
|
|
|
|
|
|
|
cv_results = best_metrics.get('cross_validation', {}) |
|
|
cv_info = "" |
|
|
if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']: |
|
|
cv_f1_mean = cv_results['test_scores']['f1']['mean'] |
|
|
cv_f1_std = cv_results['test_scores']['f1']['std'] |
|
|
cv_info = f", CV F1: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})" |
|
|
|
|
|
|
|
|
feature_info = f", {feature_type.title()} Features" |
|
|
if self.use_enhanced_features: |
|
|
feature_metadata = best_metrics.get('feature_metadata', {}) |
|
|
if feature_metadata: |
|
|
total_features = feature_metadata.get('total_features', 0) |
|
|
feature_info = f", Enhanced Features: {total_features}" |
|
|
|
|
|
|
|
|
ensemble_info = "" |
|
|
if best_model_name == "ensemble": |
|
|
ensemble_details = best_metrics.get('ensemble_info', {}) |
|
|
n_models = ensemble_details.get('n_estimators', 0) |
|
|
ensemble_info = f", Ensemble: {n_models} models" |
|
|
|
|
|
success_message = ( |
|
|
f"{training_type.title()} model training completed successfully. " |
|
|
f"Best model: {best_model_name} " |
|
|
f"(Test F1: {best_metrics['f1']:.4f}, Test Accuracy: {best_metrics['accuracy']:.4f}{cv_info}{feature_info}{ensemble_info})" |
|
|
) |
|
|
|
|
|
logger.info(success_message) |
|
|
return True, success_message |
|
|
|
|
|
except Exception as e: |
|
|
if self.progress_tracker: |
|
|
print() |
|
|
error_message = f"Enhanced ensemble model training failed: {str(e)}" |
|
|
logger.error(error_message) |
|
|
return False, error_message |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def estimate_training_time(dataset_size: int, enable_tuning: bool = True, cv_folds: int = 5, |
|
|
use_enhanced_features: bool = False, use_ensemble: bool = False) -> Dict: |
|
|
"""Enhanced time estimation including ensemble training""" |
|
|
|
|
|
|
|
|
base_times = { |
|
|
'preprocessing': max(0.1, dataset_size * 0.001), |
|
|
'vectorization': max(0.5, dataset_size * 0.01), |
|
|
'feature_selection': max(0.2, dataset_size * 0.005), |
|
|
'simple_training': max(1.0, dataset_size * 0.02), |
|
|
'evaluation': max(0.5, dataset_size * 0.01), |
|
|
} |
|
|
|
|
|
|
|
|
if use_enhanced_features: |
|
|
base_times['preprocessing'] *= 2.5 |
|
|
base_times['vectorization'] *= 1.5 |
|
|
base_times['feature_selection'] *= 2.0 |
|
|
base_times['enhanced_feature_extraction'] = max(2.0, dataset_size * 0.05) |
|
|
|
|
|
|
|
|
if use_ensemble and LIGHTGBM_AVAILABLE: |
|
|
base_times['lightgbm_training'] = max(2.0, dataset_size * 0.03) |
|
|
base_times['ensemble_creation'] = max(1.0, dataset_size * 0.005) |
|
|
base_times['ensemble_evaluation'] = max(1.0, dataset_size * 0.015) |
|
|
|
|
|
|
|
|
tuning_multipliers = { |
|
|
'logistic_regression': 8 if enable_tuning else 1, |
|
|
'random_forest': 12 if enable_tuning else 1, |
|
|
} |
|
|
|
|
|
if use_ensemble and LIGHTGBM_AVAILABLE: |
|
|
tuning_multipliers['lightgbm'] = 10 if enable_tuning else 1 |
|
|
|
|
|
|
|
|
cv_multiplier = cv_folds if dataset_size > 100 else 1 |
|
|
|
|
|
|
|
|
estimates = {} |
|
|
|
|
|
|
|
|
estimates['data_loading'] = 0.5 |
|
|
estimates['preprocessing'] = base_times['preprocessing'] |
|
|
estimates['vectorization'] = base_times['vectorization'] |
|
|
|
|
|
if use_enhanced_features: |
|
|
estimates['enhanced_feature_extraction'] = base_times['enhanced_feature_extraction'] |
|
|
|
|
|
estimates['feature_selection'] = base_times['feature_selection'] |
|
|
|
|
|
|
|
|
for model_name, multiplier in tuning_multipliers.items(): |
|
|
model_time = base_times['simple_training'] * multiplier * cv_multiplier |
|
|
estimates[f'{model_name}_training'] = model_time |
|
|
estimates[f'{model_name}_evaluation'] = base_times['evaluation'] |
|
|
|
|
|
|
|
|
if use_ensemble and LIGHTGBM_AVAILABLE: |
|
|
estimates['ensemble_creation'] = base_times['ensemble_creation'] |
|
|
estimates['ensemble_evaluation'] = base_times['ensemble_evaluation'] |
|
|
estimates['ensemble_cross_validation'] = base_times['simple_training'] * cv_folds * 0.3 |
|
|
|
|
|
|
|
|
estimates['cross_validation'] = base_times['simple_training'] * cv_folds * 0.5 |
|
|
|
|
|
|
|
|
estimates['model_saving'] = 1.0 |
|
|
|
|
|
|
|
|
total_estimate = sum(estimates.values()) |
|
|
|
|
|
|
|
|
buffer_multiplier = 1.6 if use_ensemble else (1.4 if use_enhanced_features else 1.2) |
|
|
total_estimate *= buffer_multiplier |
|
|
|
|
|
return { |
|
|
'detailed_estimates': estimates, |
|
|
'total_seconds': total_estimate, |
|
|
'total_formatted': str(timedelta(seconds=int(total_estimate))), |
|
|
'dataset_size': dataset_size, |
|
|
'enable_tuning': enable_tuning, |
|
|
'cv_folds': cv_folds, |
|
|
'use_enhanced_features': use_enhanced_features, |
|
|
'use_ensemble': use_ensemble, |
|
|
'lightgbm_available': LIGHTGBM_AVAILABLE |
|
|
} |
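
# Example call (illustrative arguments; returns per-stage and total estimates):
#   estimate_training_time(5000, enable_tuning=True, cv_folds=5,
#                          use_enhanced_features=True, use_ensemble=True)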
|
|
|
|
|
|
|
|
|
|
|
class EnhancedModelTrainer(EnsembleModelTrainer): |
|
|
"""Complete enhanced model trainer inheriting from ensemble trainer""" |
|
|
|
|
|
def load_and_validate_data(self) -> Tuple[bool, Optional[pd.DataFrame], str]: |
|
|
"""Load and validate training data""" |
|
|
try: |
|
|
logger.info("Loading training data...") |
|
|
if self.progress_tracker: |
|
|
self.progress_tracker.update("Loading data") |
|
|
|
|
|
if not self.data_path.exists(): |
|
|
return False, None, f"Data file not found: {self.data_path}" |
|
|
|
|
|
|
|
|
df = pd.read_csv(self.data_path) |
|
|
|
|
|
|
|
|
if df.empty: |
|
|
return False, None, "Dataset is empty" |
|
|
|
|
|
required_columns = ['text', 'label'] |
|
|
missing_columns = [ |
|
|
col for col in required_columns if col not in df.columns] |
|
|
if missing_columns: |
|
|
return False, None, f"Missing required columns: {missing_columns}" |
|
|
|
|
|
|
|
|
initial_count = len(df) |
|
|
df = df.dropna(subset=required_columns) |
|
|
if len(df) < initial_count: |
|
|
logger.warning( |
|
|
f"Removed {initial_count - len(df)} rows with missing values") |
|
|
|
|
|
|
|
|
df = df[df['text'].astype(str).str.len() > 10] |
|
|
|
|
|
|
|
|
unique_labels = df['label'].unique() |
|
|
if len(unique_labels) < 2: |
|
|
return False, None, f"Need at least 2 classes, found: {unique_labels}" |
|
|
|
|
|
|
|
|
min_samples_for_cv = self.cv_folds * 2 |
|
|
if len(df) < min_samples_for_cv: |
|
|
logger.warning(f"Dataset size ({len(df)}) is small for {self.cv_folds}-fold CV") |
|
|
self.cv_manager.cv_folds = max(2, len(df) // 3) |
|
|
logger.info(f"Adjusted CV folds to {self.cv_manager.cv_folds}") |
|
|
|
|
|
|
|
|
label_counts = df['label'].value_counts() |
|
|
min_class_ratio = label_counts.min() / label_counts.max() |
|
|
if min_class_ratio < 0.1: |
|
|
logger.warning( |
|
|
f"Severe class imbalance detected: {min_class_ratio:.3f}") |
|
|
|
|
|
logger.info( |
|
|
f"Data validation successful: {len(df)} samples, {len(unique_labels)} classes") |
|
|
logger.info(f"Class distribution: {label_counts.to_dict()}") |
|
|
|
|
|
return True, df, "Data loaded successfully" |
|
|
|
|
|
except Exception as e: |
|
|
error_msg = f"Error loading data: {str(e)}" |
|
|
logger.error(error_msg) |
|
|
return False, None, error_msg |
|
|
|
|
|
    def create_preprocessing_pipeline(self, use_enhanced: Optional[bool] = None) -> Pipeline:
|
|
"""Create preprocessing pipeline with optional enhanced features""" |
|
|
|
|
|
if use_enhanced is None: |
|
|
use_enhanced = self.use_enhanced_features |
|
|
|
|
|
if self.progress_tracker: |
|
|
feature_type = "enhanced" if use_enhanced else "standard" |
|
|
self.progress_tracker.update(f"Creating {feature_type} pipeline") |
|
|
|
|
|
if use_enhanced and ENHANCED_FEATURES_AVAILABLE: |
|
|
logger.info("Creating enhanced feature engineering pipeline...") |
|
|
|
|
|
|
|
|
feature_engineer = AdvancedFeatureEngineer( |
|
|
enable_sentiment=True, |
|
|
enable_readability=True, |
|
|
enable_entities=True, |
|
|
enable_linguistic=True, |
|
|
feature_selection_k=self.feature_selection_k, |
|
|
tfidf_max_features=self.max_features, |
|
|
ngram_range=self.ngram_range, |
|
|
min_df=self.min_df, |
|
|
max_df=self.max_df |
|
|
) |
|
|
|
|
|
|
|
|
pipeline = Pipeline([ |
|
|
('enhanced_features', feature_engineer), |
|
|
('model', None) |
|
|
]) |
|
|
|
|
|
|
|
|
self.feature_engineer = feature_engineer |
|
|
|
|
|
else: |
|
|
logger.info("Creating standard TF-IDF pipeline...") |
|
|
|
|
|
|
|
|
text_preprocessor = FunctionTransformer( |
|
|
func=preprocess_text_function, |
|
|
validate=False |
|
|
) |
|
|
|
|
|
|
|
|
vectorizer = TfidfVectorizer( |
|
|
max_features=self.max_features, |
|
|
min_df=self.min_df, |
|
|
max_df=self.max_df, |
|
|
ngram_range=self.ngram_range, |
|
|
stop_words='english', |
|
|
sublinear_tf=True, |
|
|
norm='l2' |
|
|
) |
|
|
|
|
|
|
|
|
feature_selector = SelectKBest( |
|
|
score_func=chi2, |
|
|
k=min(self.feature_selection_k, self.max_features) |
|
|
) |
|
|
|
|
|
|
|
|
pipeline = Pipeline([ |
|
|
('preprocess', text_preprocessor), |
|
|
('vectorize', vectorizer), |
|
|
('feature_select', feature_selector), |
|
|
('model', None) |
|
|
]) |
|
|
|
|
|
return pipeline |
|
|
|
|
|
def comprehensive_evaluation(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict: |
|
|
"""Comprehensive model evaluation with enhanced feature analysis""" |
|
|
|
|
|
if self.progress_tracker: |
|
|
self.progress_tracker.update("Evaluating model") |
|
|
|
|
|
|
|
|
y_pred = model.predict(X_test) |
|
|
y_pred_proba = model.predict_proba(X_test)[:, 1] |
|
|
|
|
|
|
|
|
metrics = { |
|
|
'accuracy': float(accuracy_score(y_test, y_pred)), |
|
|
'precision': float(precision_score(y_test, y_pred, average='weighted')), |
|
|
'recall': float(recall_score(y_test, y_pred, average='weighted')), |
|
|
'f1': float(f1_score(y_test, y_pred, average='weighted')), |
|
|
'roc_auc': float(roc_auc_score(y_test, y_pred_proba)) |
|
|
} |
|
|
|
|
|
|
|
|
cm = confusion_matrix(y_test, y_pred) |
|
|
metrics['confusion_matrix'] = cm.tolist() |
|
|
|
|
|
|
|
|
if X_train is not None and y_train is not None: |
|
|
|
|
|
X_full = np.concatenate([X_train, X_test]) |
|
|
y_full = np.concatenate([y_train, y_test]) |
|
|
|
|
|
logger.info("Performing cross-validation on full dataset...") |
|
|
cv_results = self.cv_manager.perform_cross_validation(model, X_full, y_full) |
|
|
metrics['cross_validation'] = cv_results |
|
|
|
|
|
|
|
|
if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']: |
|
|
cv_f1_mean = cv_results['test_scores']['f1']['mean'] |
|
|
cv_f1_std = cv_results['test_scores']['f1']['std'] |
|
|
logger.info(f"CV F1 Score: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})") |
|
|
|
|
|
|
|
|
if self.use_enhanced_features and self.feature_engineer is not None: |
|
|
try: |
|
|
|
|
|
if hasattr(self.feature_engineer, 'get_feature_importance'): |
|
|
feature_importance = self.feature_engineer.get_feature_importance(top_k=20) |
|
|
metrics['top_features'] = feature_importance |
|
|
|
|
|
|
|
|
if hasattr(self.feature_engineer, 'get_feature_metadata'): |
|
|
feature_metadata = self.feature_engineer.get_feature_metadata() |
|
|
metrics['feature_metadata'] = feature_metadata |
|
|
|
|
|
logger.info(f"Enhanced features used: {feature_metadata['total_features']}") |
|
|
logger.info(f"Feature breakdown: {feature_metadata['feature_types']}") |
|
|
|
|
|
except Exception as e: |
|
|
logger.warning(f"Enhanced feature analysis failed: {e}") |
|
|
|
|
|
|
|
|
try: |
|
|
if X_train is not None and y_train is not None: |
|
|
y_train_pred = model.predict(X_train) |
|
|
train_accuracy = accuracy_score(y_train, y_train_pred) |
|
|
metrics['train_accuracy'] = float(train_accuracy) |
|
|
metrics['overfitting_score'] = float( |
|
|
train_accuracy - metrics['accuracy']) |
|
|
except Exception as e: |
|
|
logger.warning(f"Overfitting detection failed: {e}") |
|
|
|
|
|
return metrics |
|
|
|
|
|
def hyperparameter_tuning_with_cv(self, pipeline, X_train, y_train, model_name: str) -> Tuple[Any, Dict]: |
|
|
"""Perform hyperparameter tuning with nested cross-validation""" |
|
|
|
|
|
if self.progress_tracker: |
|
|
feature_type = "enhanced" if self.use_enhanced_features else "standard" |
|
|
self.progress_tracker.update(f"Tuning {model_name} with {feature_type} features") |
|
|
|
|
|
try: |
|
|
|
|
|
pipeline.set_params(model=self.models[model_name]['model']) |
|
|
|
|
|
|
|
|
if len(X_train) < 20: |
|
|
logger.info(f"Skipping hyperparameter tuning for {model_name} due to small dataset") |
|
|
pipeline.fit(X_train, y_train) |
|
|
|
|
|
|
|
|
cv_results = self.cv_manager.perform_cross_validation(pipeline, X_train, y_train) |
|
|
|
|
|
return pipeline, { |
|
|
'best_params': 'default_parameters', |
|
|
'best_score': cv_results.get('test_scores', {}).get('f1', {}).get('mean', 'not_calculated'), |
|
|
'best_estimator': pipeline, |
|
|
'cross_validation': cv_results, |
|
|
'note': 'Hyperparameter tuning skipped for small dataset' |
|
|
} |
|
|
|
|
|
|
|
|
param_grid = self.models[model_name]['param_grid'] |
|
|
|
|
|
|
|
|
cv_strategy = self.cv_manager.create_cv_strategy(X_train, y_train) |
|
|
|
|
|
|
|
|
grid_search = GridSearchCV( |
|
|
pipeline, |
|
|
param_grid, |
|
|
cv=cv_strategy, |
|
|
scoring='f1_weighted', |
|
|
n_jobs=1, |
|
|
verbose=0, |
|
|
return_train_score=True |
|
|
) |
|
|
|
|
|
|
|
|
logger.info(f"Starting hyperparameter tuning for {model_name}...") |
|
|
grid_search.fit(X_train, y_train) |
|
|
|
|
|
|
|
|
logger.info(f"Performing final CV evaluation for {model_name}...") |
|
|
best_cv_results = self.cv_manager.perform_cross_validation( |
|
|
grid_search.best_estimator_, X_train, y_train, cv_strategy |
|
|
) |
|
|
|
|
|
|
|
|
tuning_results = { |
|
|
'best_params': grid_search.best_params_, |
|
|
'best_score': float(grid_search.best_score_), |
|
|
'best_estimator': grid_search.best_estimator_, |
|
|
'cv_folds_used': cv_strategy.n_splits, |
|
|
'cross_validation': best_cv_results, |
|
|
'grid_search_results': { |
|
|
'mean_test_scores': grid_search.cv_results_['mean_test_score'].tolist(), |
|
|
'std_test_scores': grid_search.cv_results_['std_test_score'].tolist(), |
|
|
'mean_train_scores': grid_search.cv_results_.get('mean_train_score', []).tolist() if 'mean_train_score' in grid_search.cv_results_ else [], |
|
|
'params': grid_search.cv_results_['params'] |
|
|
} |
|
|
} |
|
|
|
|
|
logger.info(f"Hyperparameter tuning completed for {model_name}") |
|
|
logger.info(f"Best CV score: {grid_search.best_score_:.4f}") |
|
|
logger.info(f"Best params: {grid_search.best_params_}") |
|
|
|
|
|
if 'test_scores' in best_cv_results and 'f1' in best_cv_results['test_scores']: |
|
|
final_f1 = best_cv_results['test_scores']['f1']['mean'] |
|
|
final_f1_std = best_cv_results['test_scores']['f1']['std'] |
|
|
logger.info(f"Final CV F1: {final_f1:.4f} (±{final_f1_std:.4f})") |
|
|
|
|
|
return grid_search.best_estimator_, tuning_results |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"Hyperparameter tuning failed for {model_name}: {str(e)}") |
|
|
|
|
|
try: |
|
|
pipeline.set_params(model=self.models[model_name]['model']) |
|
|
pipeline.fit(X_train, y_train) |
|
|
|
|
|
|
|
|
cv_results = self.cv_manager.perform_cross_validation(pipeline, X_train, y_train) |
|
|
|
|
|
return pipeline, { |
|
|
'error': str(e), |
|
|
'fallback': 'simple_training', |
|
|
'cross_validation': cv_results |
|
|
} |
|
|
except Exception as e2: |
|
|
logger.error(f"Fallback training also failed for {model_name}: {str(e2)}") |
|
|
raise Exception(f"Both hyperparameter tuning and fallback training failed: {str(e)} | {str(e2)}") |
|
|
|
|
|
def train_and_evaluate_models(self, X_train, X_test, y_train, y_test) -> Dict: |
|
|
"""Train and evaluate multiple models with enhanced features and comprehensive CV""" |
|
|
|
|
|
results = {} |
|
|
|
|
|
for model_name in self.models.keys(): |
|
|
logger.info(f"Training {model_name} with {'enhanced' if self.use_enhanced_features else 'standard'} features...") |
|
|
|
|
|
try: |
|
|
|
|
|
pipeline = self.create_preprocessing_pipeline() |
|
|
|
|
|
|
|
|
best_model, tuning_results = self.hyperparameter_tuning_with_cv( |
|
|
pipeline, X_train, y_train, model_name |
|
|
) |
|
|
|
|
|
|
|
|
evaluation_metrics = self.comprehensive_evaluation( |
|
|
best_model, X_test, y_test, X_train, y_train |
|
|
) |
|
|
|
|
|
|
|
|
results[model_name] = { |
|
|
'model': best_model, |
|
|
'tuning_results': tuning_results, |
|
|
'evaluation_metrics': evaluation_metrics, |
|
|
'training_time': datetime.now().isoformat(), |
|
|
'feature_type': 'enhanced' if self.use_enhanced_features else 'standard' |
|
|
} |
|
|
|
|
|
|
|
|
test_f1 = evaluation_metrics['f1'] |
|
|
cv_results = evaluation_metrics.get('cross_validation', {}) |
|
|
cv_f1_mean = cv_results.get('test_scores', {}).get('f1', {}).get('mean', 'N/A') |
|
|
cv_f1_std = cv_results.get('test_scores', {}).get('f1', {}).get('std', 'N/A') |
|
|
|
|
|
                cv_f1_mean_text = f"{cv_f1_mean:.4f}" if isinstance(cv_f1_mean, float) else str(cv_f1_mean)
                cv_f1_std_text = f"{cv_f1_std:.4f}" if isinstance(cv_f1_std, float) else str(cv_f1_std)
                logger.info(f"Model {model_name} - Test F1: {test_f1:.4f}, "
                            f"CV F1: {cv_f1_mean_text} (±{cv_f1_std_text})")
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"Training failed for {model_name}: {str(e)}") |
|
|
results[model_name] = {'error': str(e)} |
|
|
|
|
|
return results |
|
|
|
|
|
|
|
|
|
|
|
class ProgressTracker: |
|
|
"""Progress tracking with time estimation""" |
|
|
|
|
|
def __init__(self, total_steps: int, description: str = "Training"): |
|
|
self.total_steps = total_steps |
|
|
self.current_step = 0 |
|
|
self.start_time = time.time() |
|
|
self.description = description |
|
|
self.step_times = [] |
|
|
|
|
|
def update(self, step_name: str = ""): |
|
|
"""Update progress and print status""" |
|
|
self.current_step += 1 |
|
|
current_time = time.time() |
|
|
elapsed = current_time - self.start_time |
|
|
|
|
|
|
|
|
progress_pct = (self.current_step / self.total_steps) * 100 |
|
|
|
|
|
|
|
|
if self.current_step > 0: |
|
|
avg_time_per_step = elapsed / self.current_step |
|
|
remaining_steps = self.total_steps - self.current_step |
|
|
eta_seconds = avg_time_per_step * remaining_steps |
|
|
eta = timedelta(seconds=int(eta_seconds)) |
|
|
else: |
|
|
eta = "calculating..." |
|
|
|
|
|
|
|
|
bar_length = 30 |
|
|
filled_length = int(bar_length * self.current_step // self.total_steps) |
|
|
bar = '█' * filled_length + '░' * (bar_length - filled_length) |
|
|
|
|
|
|
|
|
status_msg = f"\r{self.description}: [{bar}] {progress_pct:.1f}% | Step {self.current_step}/{self.total_steps}" |
|
|
if step_name: |
|
|
status_msg += f" | {step_name}" |
|
|
if eta != "calculating...": |
|
|
status_msg += f" | ETA: {eta}" |
|
|
|
|
|
print(status_msg, end='', flush=True) |
|
|
|
|
|
|
|
|
progress_json = { |
|
|
"type": "progress", |
|
|
"step": self.current_step, |
|
|
"total": self.total_steps, |
|
|
"percentage": progress_pct, |
|
|
"eta": str(eta) if eta != "calculating..." else None, |
|
|
"step_name": step_name, |
|
|
"elapsed": elapsed |
|
|
} |
|
|
print(f"\nPROGRESS_JSON: {json.dumps(progress_json)}") |
|
|
|
|
|
|
|
|
        # Keep a short rolling window of the most recent step durations
        step_duration = current_time - getattr(self, '_last_update_time', self.start_time)
        self._last_update_time = current_time
        self.step_times.append(step_duration)
        if len(self.step_times) > 3:
            self.step_times.pop(0)
|
|
|
|
|
def finish(self): |
|
|
"""Complete progress tracking""" |
|
|
total_time = time.time() - self.start_time |
|
|
print(f"\n{self.description} completed in {timedelta(seconds=int(total_time))}") |
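
# Typical ProgressTracker usage (illustrative):
#   tracker = ProgressTracker(total_steps=5, description="Training")
#   tracker.update("Loading data")
#   ...
#   tracker.finish()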
|
|
|
|
|
|
|
|
class CrossValidationManager: |
|
|
"""Advanced cross-validation management with comprehensive metrics""" |
|
|
|
|
|
def __init__(self, cv_folds: int = 5, random_state: int = 42): |
|
|
self.cv_folds = cv_folds |
|
|
self.random_state = random_state |
|
|
self.cv_results = {} |
|
|
|
|
|
def create_cv_strategy(self, X, y) -> StratifiedKFold: |
|
|
"""Create appropriate CV strategy based on data characteristics""" |
|
|
|
|
|
n_samples = len(X) |
|
|
min_samples_per_fold = 3 |
|
|
max_folds = n_samples // min_samples_per_fold |
|
|
|
|
|
|
|
|
unique_classes = np.unique(y) |
|
|
min_class_count = min([np.sum(y == cls) for cls in unique_classes]) |
|
|
|
|
|
|
|
|
max_folds_by_class = min_class_count |
|
|
|
|
|
actual_folds = max(2, min(self.cv_folds, max_folds, max_folds_by_class)) |
|
|
|
|
|
logger.info(f"Using {actual_folds} CV folds (requested: {self.cv_folds})") |
|
|
|
|
|
return StratifiedKFold( |
|
|
n_splits=actual_folds, |
|
|
shuffle=True, |
|
|
random_state=self.random_state |
|
|
) |
|
|
|
|
|
def perform_cross_validation(self, pipeline, X, y, cv_strategy=None) -> Dict: |
|
|
"""Perform comprehensive cross-validation with multiple metrics""" |
|
|
|
|
|
if cv_strategy is None: |
|
|
cv_strategy = self.create_cv_strategy(X, y) |
|
|
|
|
|
logger.info(f"Starting cross-validation with {cv_strategy.n_splits} folds...") |
|
|
|
|
|
|
|
|
scoring_metrics = { |
|
|
'accuracy': 'accuracy', |
|
|
'precision': 'precision_weighted', |
|
|
'recall': 'recall_weighted', |
|
|
'f1': 'f1_weighted', |
|
|
'roc_auc': 'roc_auc' |
|
|
} |
|
|
|
|
|
try: |
|
|
|
|
|
cv_scores = cross_validate( |
|
|
pipeline, X, y, |
|
|
cv=cv_strategy, |
|
|
scoring=scoring_metrics, |
|
|
return_train_score=True, |
|
|
n_jobs=1, |
|
|
verbose=0 |
|
|
) |
|
|
|
|
|
|
|
|
cv_results = { |
|
|
'n_splits': cv_strategy.n_splits, |
|
|
'test_scores': {}, |
|
|
'train_scores': {}, |
|
|
'fold_results': [] |
|
|
} |
|
|
|
|
|
|
|
|
for metric_name in scoring_metrics.keys(): |
|
|
test_key = f'test_{metric_name}' |
|
|
train_key = f'train_{metric_name}' |
|
|
|
|
|
if test_key in cv_scores: |
|
|
test_scores = cv_scores[test_key] |
|
|
cv_results['test_scores'][metric_name] = { |
|
|
'mean': float(np.mean(test_scores)), |
|
|
'std': float(np.std(test_scores)), |
|
|
'min': float(np.min(test_scores)), |
|
|
'max': float(np.max(test_scores)), |
|
|
'scores': test_scores.tolist() |
|
|
} |
|
|
|
|
|
if train_key in cv_scores: |
|
|
train_scores = cv_scores[train_key] |
|
|
cv_results['train_scores'][metric_name] = { |
|
|
'mean': float(np.mean(train_scores)), |
|
|
'std': float(np.std(train_scores)), |
|
|
'min': float(np.min(train_scores)), |
|
|
'max': float(np.max(train_scores)), |
|
|
'scores': train_scores.tolist() |
|
|
} |
|
|
|
|
|
|
|
|
for fold_idx in range(cv_strategy.n_splits): |
|
|
fold_result = { |
|
|
'fold': fold_idx + 1, |
|
|
'test_scores': {}, |
|
|
'train_scores': {} |
|
|
} |
|
|
|
|
|
for metric_name in scoring_metrics.keys(): |
|
|
test_key = f'test_{metric_name}' |
|
|
train_key = f'train_{metric_name}' |
|
|
|
|
|
if test_key in cv_scores: |
|
|
fold_result['test_scores'][metric_name] = float(cv_scores[test_key][fold_idx]) |
|
|
if train_key in cv_scores: |
|
|
fold_result['train_scores'][metric_name] = float(cv_scores[train_key][fold_idx]) |
|
|
|
|
|
cv_results['fold_results'].append(fold_result) |
|
|
|
|
|
|
|
|
if 'accuracy' in cv_results['test_scores'] and 'accuracy' in cv_results['train_scores']: |
|
|
train_mean = cv_results['train_scores']['accuracy']['mean'] |
|
|
test_mean = cv_results['test_scores']['accuracy']['mean'] |
|
|
cv_results['overfitting_score'] = float(train_mean - test_mean) |
|
|
|
|
|
|
|
|
if 'accuracy' in cv_results['test_scores']: |
|
|
test_std = cv_results['test_scores']['accuracy']['std'] |
|
|
test_mean = cv_results['test_scores']['accuracy']['mean'] |
|
|
cv_results['stability_score'] = float(1 - (test_std / test_mean)) if test_mean > 0 else 0 |
|
|
|
|
|
            mean_acc = cv_results['test_scores'].get('accuracy', {}).get('mean')
            mean_f1 = cv_results['test_scores'].get('f1', {}).get('mean')
            logger.info("Cross-validation completed successfully")
            logger.info(f"Mean test accuracy: {mean_acc:.4f}" if mean_acc is not None else "Mean test accuracy: N/A")
            logger.info(f"Mean test F1: {mean_f1:.4f}" if mean_f1 is not None else "Mean test F1: N/A")
|
|
|
|
|
return cv_results |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"Cross-validation failed: {e}") |
|
|
return { |
|
|
'error': str(e), |
|
|
'n_splits': cv_strategy.n_splits if cv_strategy else self.cv_folds, |
|
|
'fallback': True |
|
|
} |
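
# Typical CrossValidationManager usage (illustrative; the result keys shown exist on success):
#   cv_manager = CrossValidationManager(cv_folds=5)
#   cv_results = cv_manager.perform_cross_validation(pipeline, X, y)
#   mean_f1 = cv_results['test_scores']['f1']['mean']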
|
|
|
|
|
|
|
|
def preprocess_text_function(texts): |
|
|
""" |
|
|
Standalone function for text preprocessing - pickle-safe |
|
|
""" |
|
|
def clean_single_text(text): |
|
|
|
|
|
text = str(text) |
|
|
|
|
|
|
|
|
text = re.sub(r'http\S+|www\S+|https\S+', '', text) |
|
|
|
|
|
|
|
|
text = re.sub(r'\S+@\S+', '', text) |
|
|
|
|
|
|
|
|
text = re.sub(r'[!]{2,}', '!', text) |
|
|
text = re.sub(r'[?]{2,}', '?', text) |
|
|
text = re.sub(r'[.]{3,}', '...', text) |
|
|
|
|
|
|
|
|
text = re.sub(r'[^a-zA-Z\s.!?]', '', text) |
|
|
|
|
|
|
|
|
text = re.sub(r'\s+', ' ', text) |
|
|
|
|
|
return text.strip().lower() |
|
|
|
|
|
|
|
|
processed = [] |
|
|
for text in texts: |
|
|
processed.append(clean_single_text(text)) |
|
|
|
|
|
return processed |
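
# Example (illustrative input):
#   preprocess_text_function(["Check THIS out!!! http://example.com"])
#   -> ["check this out!"]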
|
|
|
|
|
|
|
|
def main(): |
|
|
"""Main execution function with enhanced ensemble support""" |
|
|
import argparse |
|
|
|
|
|
|
|
|
parser = argparse.ArgumentParser(description='Train fake news detection model with LightGBM ensemble') |
|
|
parser.add_argument('--data_path', type=str, help='Path to training data CSV file') |
|
|
parser.add_argument('--config_path', type=str, help='Path to training configuration JSON file') |
|
|
parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds') |
|
|
parser.add_argument('--enhanced_features', action='store_true', help='Force use of enhanced features') |
|
|
parser.add_argument('--standard_features', action='store_true', help='Force use of standard TF-IDF features only') |
|
|
parser.add_argument('--ensemble', action='store_true', help='Force use of LightGBM ensemble') |
|
|
parser.add_argument('--no_ensemble', action='store_true', help='Disable ensemble training') |
|
|
args = parser.parse_args() |
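
    # Example invocations (script name and paths are illustrative):
    #   python model_training.py --data_path /tmp/data/combined_dataset.csv --ensemble
    #   python model_training.py --standard_features --no_ensemble --cv_folds 3
    #
    # Example --config_path JSON (all keys optional; unrecognised keys are ignored):
    #   {"test_size": 0.2, "cv_folds": 5, "max_features": 7500, "ngram_range": [1, 2],
    #    "enhanced_features": true, "use_ensemble": true,
    #    "lightgbm_params": {"learning_rate": 0.05},
    #    "selected_models": ["logistic_regression", "lightgbm"]}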
|
|
|
|
|
|
|
|
use_enhanced = None |
|
|
if args.enhanced_features and args.standard_features: |
|
|
logger.warning("Both --enhanced_features and --standard_features specified. Using auto-detection.") |
|
|
elif args.enhanced_features: |
|
|
use_enhanced = True |
|
|
logger.info("Enhanced features explicitly requested") |
|
|
elif args.standard_features: |
|
|
use_enhanced = False |
|
|
logger.info("Standard features explicitly requested") |
|
|
|
|
|
|
|
|
use_ensemble = None |
|
|
if args.ensemble and args.no_ensemble: |
|
|
logger.warning("Both --ensemble and --no_ensemble specified. Using auto-detection.") |
|
|
elif args.ensemble: |
|
|
use_ensemble = True |
|
|
logger.info("LightGBM ensemble explicitly requested") |
|
|
elif args.no_ensemble: |
|
|
use_ensemble = False |
|
|
logger.info("Ensemble training explicitly disabled") |
|
|
|
|
|
trainer = EnhancedModelTrainer(use_enhanced_features=use_enhanced, use_ensemble=use_ensemble) |
|
|
|
|
|
|
|
|
if args.cv_folds: |
|
|
trainer.cv_folds = args.cv_folds |
|
|
trainer.cv_manager.cv_folds = args.cv_folds |
|
|
|
|
|
|
|
|
if args.config_path and Path(args.config_path).exists(): |
|
|
try: |
|
|
with open(args.config_path, 'r') as f: |
|
|
config = json.load(f) |
|
|
|
|
|
|
|
|
trainer.test_size = config.get('test_size', trainer.test_size) |
|
|
trainer.cv_folds = config.get('cv_folds', trainer.cv_folds) |
|
|
trainer.cv_manager.cv_folds = trainer.cv_folds |
|
|
trainer.max_features = config.get('max_features', trainer.max_features) |
|
|
trainer.ngram_range = tuple(config.get('ngram_range', trainer.ngram_range)) |
|
|
|
|
|
|
|
|
if 'enhanced_features' in config and use_enhanced is None: |
|
|
trainer.use_enhanced_features = config['enhanced_features'] and ENHANCED_FEATURES_AVAILABLE |
|
|
|
|
|
|
|
|
if 'use_ensemble' in config and use_ensemble is None: |
|
|
trainer.use_ensemble = config['use_ensemble'] and LIGHTGBM_AVAILABLE |
|
|
|
|
|
|
|
|
if 'lightgbm_params' in config: |
|
|
trainer.lgb_params.update(config['lightgbm_params']) |
|
|
|
|
|
|
|
|
selected_models = config.get('selected_models') |
|
|
if selected_models and len(selected_models) < len(trainer.models): |
|
|
all_models = trainer.models.copy() |
|
|
trainer.models = {k: v for k, v in all_models.items() if k in selected_models} |
|
|
|
|
|
|
|
|
trainer.feature_selection_k = min(trainer.feature_selection_k, trainer.max_features) |
|
|
|
|
|
logger.info(f"Applied custom configuration with {trainer.cv_folds} CV folds") |
|
|
if trainer.use_enhanced_features: |
|
|
logger.info("Enhanced features enabled via configuration") |
|
|
if trainer.use_ensemble: |
|
|
logger.info("LightGBM ensemble enabled via configuration") |
|
|
|
|
|
except Exception as e: |
|
|
logger.warning(f"Failed to load configuration: {e}, using defaults") |
|
|
|
|
|
|
|
|
logger.info("Final Training Configuration:") |
|
|
logger.info(f" Enhanced Features: {trainer.use_enhanced_features} (Available: {ENHANCED_FEATURES_AVAILABLE})") |
|
|
logger.info(f" LightGBM Ensemble: {trainer.use_ensemble} (Available: {LIGHTGBM_AVAILABLE})") |
|
|
logger.info(f" Models to train: {list(trainer.models.keys())}") |
|
|
logger.info(f" Cross-validation folds: {trainer.cv_folds}") |
|
|
|
|
|
success, message = trainer.train_model(data_path=args.data_path) |
|
|
|
|
|
if success: |
|
|
print(f"✅ {message}") |
|
|
|
|
|
|
|
|
if trainer.use_enhanced_features and trainer.feature_engineer: |
|
|
try: |
|
|
metadata = trainer.feature_engineer.get_feature_metadata() |
|
|
print(f"\n📈 Enhanced Feature Engineering Summary:") |
|
|
print(f"Total features generated: {metadata['total_features']}") |
|
|
for feature_type, count in metadata['feature_types'].items(): |
|
|
print(f" {feature_type}: {count}") |
|
|
except Exception as e: |
|
|
logger.warning(f"Could not display feature summary: {e}") |
|
|
|
|
|
|
|
|
if trainer.use_ensemble and LIGHTGBM_AVAILABLE: |
|
|
try: |
|
|
ensemble_metadata_path = Path("/tmp/ensemble_metadata.json") |
|
|
if ensemble_metadata_path.exists(): |
|
|
with open(ensemble_metadata_path, 'r') as f: |
|
|
ensemble_metadata = json.load(f) |
|
|
|
|
|
print(f"\n🎯 Ensemble Model Summary:") |
|
|
print(f"Ensemble type: {ensemble_metadata.get('ensemble_type', 'unknown')}") |
|
|
print(f"Component models: {', '.join(ensemble_metadata.get('component_models', []))}") |
|
|
else: |
|
|
print(f"\n🎯 Individual Model Selected (Ensemble not used)") |
|
|
except Exception as e: |
|
|
logger.warning(f"Could not display ensemble summary: {e}") |
|
|
|
|
|
else: |
|
|
print(f"❌ {message}") |
|
|
        sys.exit(1)
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |