Ahmedik95316 committed
Commit dbb9a1a
1 Parent(s): ead9c37

Update model/retrain.py

Cross Validation Implementation
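At a high level, the change replaces the single held-out comparison with a cross-validated one: the production and candidate pipelines are scored on the same StratifiedKFold splits, and the paired per-fold scores are tested for a statistically and practically significant improvement before promotion. A minimal sketch of that idea (illustrative only; prod_pipe, cand_pipe, X and y are placeholders, not names from this commit):

    # Sketch: paired, per-fold comparison of two sklearn pipelines on shared splits.
    from scipy import stats
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    prod_scores = cross_val_score(prod_pipe, X, y, cv=cv, scoring="f1_weighted")
    cand_scores = cross_val_score(cand_pipe, X, y, cv=cv, scoring="f1_weighted")

    # Same folds for both models, so the fold scores are paired.
    t_stat, p_value = stats.ttest_rel(cand_scores, prod_scores)
    promote = (cand_scores.mean() - prod_scores.mean() >= 0.01) and (p_value < 0.05)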

Files changed (1)
  1. model/retrain.py +529 -153
model/retrain.py CHANGED
@@ -1,3 +1,6 @@
1
  import pandas as pd
2
  import numpy as np
3
  import joblib
@@ -17,7 +20,9 @@ from sklearn.metrics import (
17
  accuracy_score, precision_score, recall_score, f1_score,
18
  roc_auc_score, confusion_matrix, classification_report
19
  )
20
- from sklearn.model_selection import cross_val_score, StratifiedKFold
 
 
21
  from sklearn.feature_extraction.text import TfidfVectorizer
22
  from sklearn.linear_model import LogisticRegression
23
  from sklearn.ensemble import RandomForestClassifier
@@ -36,13 +41,322 @@ logging.basicConfig(
36
  )
37
  logger = logging.getLogger(__name__)
38

39
  class RobustModelRetrainer:
40
- """Production-ready model retraining with statistical validation and A/B testing"""
41
 
42
  def __init__(self):
43
  self.setup_paths()
44
  self.setup_retraining_config()
45
  self.setup_statistical_tests()
 
46
 
47
  def setup_paths(self):
48
  """Setup all necessary paths"""
@@ -81,7 +395,7 @@ class RobustModelRetrainer:
81
  self.min_new_samples = 50
82
  self.improvement_threshold = 0.01 # 1% improvement required
83
  self.significance_level = 0.05
84
- self.cv_folds = 5
85
  self.test_size = 0.2
86
  self.random_state = 42
87
  self.max_retries = 3
@@ -90,9 +404,9 @@ class RobustModelRetrainer:
90
  def setup_statistical_tests(self):
91
  """Setup statistical test configurations"""
92
  self.statistical_tests = {
93
- 'mcnemar': {'alpha': 0.05, 'name': "McNemar's Test"},
94
  'paired_ttest': {'alpha': 0.05, 'name': "Paired T-Test"},
95
- 'wilcoxon': {'alpha': 0.05, 'name': "Wilcoxon Signed-Rank Test"}
 
96
  }
97
 
98
  def load_existing_metadata(self) -> Optional[Dict]:
@@ -246,146 +560,129 @@ class RobustModelRetrainer:
246
  return pipeline
247
 
248
  def train_candidate_model(self, df: pd.DataFrame) -> Tuple[bool, Optional[Any], Dict]:
249
- """Train candidate model with comprehensive evaluation"""
250
  try:
251
- logger.info("Training candidate model...")
252
 
253
  # Prepare data
254
  X = df['text'].values
255
  y = df['label'].values
256
 
257
- # Train-test split
258
- from sklearn.model_selection import train_test_split
259
- X_train, X_test, y_train, y_test = train_test_split(
260
- X, y, test_size=self.test_size, stratify=y, random_state=self.random_state
261
- )
262
-
263
  # Create and train pipeline
264
  pipeline = self.create_advanced_pipeline()
265
- pipeline.fit(X_train, y_train)
266
 
267
- # Evaluate candidate model
268
- evaluation_results = self.evaluate_model(pipeline, X_test, y_test, X_train, y_train)
 
269
 
270
- # Save candidate model
271
- joblib.dump(pipeline, self.candidate_pipeline_path)
272
- joblib.dump(pipeline.named_steps['model'], self.candidate_model_path)
273
- joblib.dump(pipeline.named_steps['vectorize'], self.candidate_vectorizer_path)
274
 
275
- logger.info(f"Candidate model training completed")
276
- logger.info(f"Candidate F1 Score: {evaluation_results['f1']:.4f}")
277
- logger.info(f"Candidate Accuracy: {evaluation_results['accuracy']:.4f}")
 
278
 
279
- return True, pipeline, evaluation_results
 
280
 
281
- except Exception as e:
282
- error_msg = f"Candidate model training failed: {str(e)}"
283
- logger.error(error_msg)
284
- return False, None, {'error': error_msg}
285
-
286
- def evaluate_model(self, model, X_test, y_test, X_train=None, y_train=None) -> Dict:
287
- """Comprehensive model evaluation"""
288
- try:
289
- # Predictions
290
- y_pred = model.predict(X_test)
291
- y_pred_proba = model.predict_proba(X_test)[:, 1]
292
 
293
- # Basic metrics
294
- metrics = {
295
  'accuracy': float(accuracy_score(y_test, y_pred)),
296
  'precision': float(precision_score(y_test, y_pred, average='weighted')),
297
  'recall': float(recall_score(y_test, y_pred, average='weighted')),
298
  'f1': float(f1_score(y_test, y_pred, average='weighted')),
299
- 'roc_auc': float(roc_auc_score(y_test, y_pred_proba)),
300
- 'confusion_matrix': confusion_matrix(y_test, y_pred).tolist(),
301
- 'evaluation_timestamp': datetime.now().isoformat()
302
  }
303
 
304
- # Cross-validation
305
- if X_train is not None and y_train is not None:
306
- try:
307
- cv_scores = cross_val_score(
308
- model, X_train, y_train,
309
- cv=StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state),
310
- scoring='f1_weighted'
311
- )
312
- metrics['cv_f1_mean'] = float(cv_scores.mean())
313
- metrics['cv_f1_std'] = float(cv_scores.std())
314
- except Exception as e:
315
- logger.warning(f"Cross-validation failed: {e}")
316
 
317
- return metrics
318
 
319
  except Exception as e:
320
- logger.error(f"Model evaluation failed: {str(e)}")
321
- return {'error': str(e)}
 
322
 
323
- def compare_models_statistically(self, prod_model, candidate_model, X_test, y_test) -> Dict:
324
- """Statistical comparison of models"""
 
 
 
325
  try:
326
- logger.info("Performing statistical model comparison...")
327
-
328
- # Get predictions
329
- prod_pred = prod_model.predict(X_test)
330
- candidate_pred = candidate_model.predict(X_test)
331
 
332
- # Calculate accuracies
333
- prod_accuracy = accuracy_score(y_test, prod_pred)
334
- candidate_accuracy = accuracy_score(y_test, candidate_pred)
335
 
336
- comparison_results = {
337
- 'production_accuracy': float(prod_accuracy),
338
- 'candidate_accuracy': float(candidate_accuracy),
339
- 'absolute_improvement': float(candidate_accuracy - prod_accuracy),
340
- 'relative_improvement': float((candidate_accuracy - prod_accuracy) / prod_accuracy * 100),
341
- 'statistical_tests': {}
342
- }
343
-
344
- # McNemar's test for paired predictions
345
- try:
346
- # Create contingency table
347
- prod_correct = (prod_pred == y_test)
348
- candidate_correct = (candidate_pred == y_test)
349
-
350
- both_correct = np.sum(prod_correct & candidate_correct)
351
- prod_only = np.sum(prod_correct & ~candidate_correct)
352
- candidate_only = np.sum(~prod_correct & candidate_correct)
353
- both_wrong = np.sum(~prod_correct & ~candidate_correct)
354
-
355
- # McNemar's test
356
- if prod_only + candidate_only > 0:
357
- mcnemar_stat = (abs(prod_only - candidate_only) - 1) ** 2 / (prod_only + candidate_only)
358
- p_value = 1 - stats.chi2.cdf(mcnemar_stat, 1)
359
-
360
- comparison_results['statistical_tests']['mcnemar'] = {
361
- 'statistic': float(mcnemar_stat),
362
- 'p_value': float(p_value),
363
- 'significant': p_value < self.significance_level,
364
- 'contingency_table': {
365
- 'both_correct': int(both_correct),
366
- 'prod_only': int(prod_only),
367
- 'candidate_only': int(candidate_only),
368
- 'both_wrong': int(both_wrong)
369
- }
370
- }
371
-
372
- except Exception as e:
373
- logger.warning(f"McNemar's test failed: {e}")
374
-
375
- # Practical significance test
376
- comparison_results['practical_significance'] = {
377
- 'meets_threshold': comparison_results['absolute_improvement'] >= self.improvement_threshold,
378
- 'threshold': self.improvement_threshold,
379
- 'recommendation': 'promote' if (
380
- comparison_results['absolute_improvement'] >= self.improvement_threshold and
381
- comparison_results['statistical_tests'].get('mcnemar', {}).get('significant', False)
382
- ) else 'keep_current'
383
  }
384
 
385
- return comparison_results
386
 
387
  except Exception as e:
388
- logger.error(f"Statistical comparison failed: {str(e)}")
389
  return {'error': str(e)}
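For reference on the McNemar computation removed in this hunk: it counts the cases where exactly one of the two models is correct and applies a continuity-corrected chi-square statistic with one degree of freedom. A small worked sketch with invented disagreement counts:

    # Continuity-corrected McNemar statistic, as computed in the removed block.
    from scipy import stats

    prod_only, candidate_only = 12, 25   # hypothetical model-specific correct-only counts
    mcnemar_stat = (abs(prod_only - candidate_only) - 1) ** 2 / (prod_only + candidate_only)
    p_value = 1 - stats.chi2.cdf(mcnemar_stat, 1)   # statistic ~3.89, p ~0.049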
390
 
391
  def create_backup(self) -> bool:
@@ -415,7 +712,7 @@ class RobustModelRetrainer:
415
  return False
416
 
417
  def promote_candidate_model(self, candidate_model, candidate_metrics: Dict, comparison_results: Dict) -> bool:
418
- """Promote candidate model to production"""
419
  try:
420
  logger.info("Promoting candidate model to production...")
421
 
@@ -429,7 +726,7 @@ class RobustModelRetrainer:
429
  shutil.copy2(self.candidate_vectorizer_path, self.prod_vectorizer_path)
430
  shutil.copy2(self.candidate_pipeline_path, self.prod_pipeline_path)
431
 
432
- # Update metadata
433
  metadata = self.load_existing_metadata() or {}
434
 
435
  # Increment version
@@ -443,27 +740,78 @@ class RobustModelRetrainer:
443
  else:
444
  new_version = f"v1.{int(datetime.now().timestamp()) % 1000}"
445
 
446
- # Update metadata
447
  metadata.update({
448
  'model_version': new_version,
449
- 'model_type': 'retrained_pipeline',
450
  'previous_version': old_version,
451
- 'test_accuracy': candidate_metrics['accuracy'],
452
- 'test_f1': candidate_metrics['f1'],
453
- 'test_precision': candidate_metrics['precision'],
454
- 'test_recall': candidate_metrics['recall'],
455
- 'test_roc_auc': candidate_metrics['roc_auc'],
456
- 'improvement_over_previous': comparison_results['absolute_improvement'],
457
- 'statistical_significance': comparison_results['statistical_tests'].get('mcnemar', {}).get('significant', False),
458
  'promotion_timestamp': datetime.now().isoformat(),
459
- 'retrain_trigger': 'scheduled_retrain'
 
 
460
  })
461

462
  # Save updated metadata
463
  with open(self.metadata_path, 'w') as f:
464
  json.dump(metadata, f, indent=2)
465
 
466
  logger.info(f"Model promoted successfully to {new_version}")
 
467
  return True
468
 
469
  except Exception as e:
@@ -471,12 +819,13 @@ class RobustModelRetrainer:
471
  return False
472
 
473
  def log_retraining_session(self, results: Dict):
474
- """Log retraining session results"""
475
  try:
476
  log_entry = {
477
  'timestamp': datetime.now().isoformat(),
478
  'results': results,
479
- 'session_id': hashlib.md5(str(datetime.now()).encode()).hexdigest()[:8]
 
480
  }
481
 
482
  # Load existing logs
@@ -499,13 +848,36 @@ class RobustModelRetrainer:
499
  with open(self.retraining_log_path, 'w') as f:
500
  json.dump(logs, f, indent=2)
501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
502
  except Exception as e:
503
  logger.error(f"Failed to log retraining session: {str(e)}")
504
 
505
  def retrain_model(self) -> Tuple[bool, str]:
506
- """Main retraining function with comprehensive validation"""
507
  try:
508
- logger.info("Starting model retraining process...")
509
 
510
  # Load existing metadata
511
  existing_metadata = self.load_existing_metadata()
@@ -528,22 +900,18 @@ class RobustModelRetrainer:
528
  if len(df) < self.min_new_samples:
529
  return False, f"Insufficient new data: {len(df)} < {self.min_new_samples}"
530
 
531
- # Train candidate model
532
  candidate_success, candidate_model, candidate_metrics = self.train_candidate_model(df)
533
  if not candidate_success:
534
  return False, f"Candidate training failed: {candidate_metrics.get('error', 'Unknown error')}"
535
 
536
- # Prepare test data for comparison
537
  X = df['text'].values
538
  y = df['label'].values
539
- from sklearn.model_selection import train_test_split
540
- _, X_test, _, y_test = train_test_split(
541
- X, y, test_size=self.test_size, stratify=y, random_state=self.random_state
542
- )
543
 
544
- # Compare models
545
- comparison_results = self.compare_models_statistically(
546
- prod_model, candidate_model, X_test, y_test
547
  )
548
 
549
  # Log results
@@ -551,16 +919,15 @@ class RobustModelRetrainer:
551
  'candidate_metrics': candidate_metrics,
552
  'comparison_results': comparison_results,
553
  'data_size': len(df),
554
- 'test_size': len(X_test)
 
555
  }
556
 
557
  self.log_retraining_session(session_results)
558
 
559
- # Decide whether to promote
560
- should_promote = (
561
- comparison_results['absolute_improvement'] >= self.improvement_threshold and
562
- comparison_results.get('statistical_tests', {}).get('mcnemar', {}).get('significant', False)
563
- )
564
 
565
  if should_promote:
566
  # Promote candidate model
@@ -569,10 +936,16 @@ class RobustModelRetrainer:
569
  )
570
 
571
if promotion_success:
572
  success_msg = (
573
- f"Model promoted successfully! "
574
- f"Improvement: {comparison_results['absolute_improvement']:.4f} "
575
- f"(F1: {candidate_metrics['f1']:.4f})"
 
576
  )
577
  logger.info(success_msg)
578
  return True, success_msg
@@ -580,21 +953,24 @@ class RobustModelRetrainer:
580
  return False, "Model promotion failed"
581
  else:
582
  # Keep current model
 
 
 
583
  keep_msg = (
584
- f"Keeping current model. "
585
- f"Improvement: {comparison_results['absolute_improvement']:.4f} "
586
- f"(threshold: {self.improvement_threshold})"
587
  )
588
  logger.info(keep_msg)
589
  return True, keep_msg
590
 
591
  except Exception as e:
592
- error_msg = f"Model retraining failed: {str(e)}"
593
  logger.error(error_msg)
594
  return False, error_msg
595
 
596
  def main():
597
- """Main execution function"""
598
  retrainer = RobustModelRetrainer()
599
  success, message = retrainer.retrain_model()
600
 
 
1
+ # File: model/retrain.py (MODIFIED)
2
+ # Enhanced version with comprehensive cross-validation for retraining
3
+
4
  import pandas as pd
5
  import numpy as np
6
  import joblib
 
20
  accuracy_score, precision_score, recall_score, f1_score,
21
  roc_auc_score, confusion_matrix, classification_report
22
  )
23
+ from sklearn.model_selection import (
24
+ cross_val_score, StratifiedKFold, cross_validate, train_test_split
25
+ )
26
  from sklearn.feature_extraction.text import TfidfVectorizer
27
  from sklearn.linear_model import LogisticRegression
28
  from sklearn.ensemble import RandomForestClassifier
 
41
  )
42
  logger = logging.getLogger(__name__)
43
 
44
+ class CVModelComparator:
45
+ """Advanced model comparison using cross-validation and statistical tests"""
46
+
47
+ def __init__(self, cv_folds: int = 5, random_state: int = 42):
48
+ self.cv_folds = cv_folds
49
+ self.random_state = random_state
50
+
51
+ def create_cv_strategy(self, X, y) -> StratifiedKFold:
52
+ """Create appropriate CV strategy based on data characteristics"""
53
+ n_samples = len(X)
54
+ min_samples_per_fold = 3
55
+ max_folds = n_samples // min_samples_per_fold
56
+
57
+ unique_classes = np.unique(y)
58
+ min_class_count = min([np.sum(y == cls) for cls in unique_classes])
59
+ max_folds_by_class = min_class_count
60
+
61
+ actual_folds = max(2, min(self.cv_folds, max_folds, max_folds_by_class))
62
+
63
+ logger.info(f"Using {actual_folds} CV folds for model comparison")
64
+
65
+ return StratifiedKFold(
66
+ n_splits=actual_folds,
67
+ shuffle=True,
68
+ random_state=self.random_state
69
+ )
70
+
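A quick illustration of the fold-capping logic in create_cv_strategy, with made-up class counts (not data from this repository): with 40 samples and only 3 examples of the minority class, the requested 5 folds are reduced to 3 so every fold can hold at least one minority example.

    # Hypothetical input: 40 samples, minority class of size 3.
    import numpy as np
    y = np.array([0] * 37 + [1] * 3)

    n_samples = len(y)                                  # 40
    max_folds = n_samples // 3                          # 13 (>= 3 samples per fold)
    min_class_count = int(min(np.bincount(y)))          # 3
    actual_folds = max(2, min(5, max_folds, min_class_count))   # -> 3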
71
+ def perform_model_cv_evaluation(self, model, X, y, cv_strategy=None) -> Dict:
72
+ """Perform comprehensive CV evaluation of a model"""
73
+
74
+ if cv_strategy is None:
75
+ cv_strategy = self.create_cv_strategy(X, y)
76
+
77
+ logger.info(f"Performing CV evaluation with {cv_strategy.n_splits} folds...")
78
+
79
+ scoring_metrics = {
80
+ 'accuracy': 'accuracy',
81
+ 'precision': 'precision_weighted',
82
+ 'recall': 'recall_weighted',
83
+ 'f1': 'f1_weighted',
84
+ 'roc_auc': 'roc_auc'
85
+ }
86
+
87
+ try:
88
+ cv_scores = cross_validate(
89
+ model, X, y,
90
+ cv=cv_strategy,
91
+ scoring=scoring_metrics,
92
+ return_train_score=True,
93
+ n_jobs=1,
94
+ verbose=0
95
+ )
96
+
97
+ cv_results = {
98
+ 'n_splits': cv_strategy.n_splits,
99
+ 'test_scores': {},
100
+ 'train_scores': {},
101
+ 'fold_results': []
102
+ }
103
+
104
+ # Process results for each metric
105
+ for metric_name in scoring_metrics.keys():
106
+ test_key = f'test_{metric_name}'
107
+ train_key = f'train_{metric_name}'
108
+
109
+ if test_key in cv_scores:
110
+ test_scores = cv_scores[test_key]
111
+ cv_results['test_scores'][metric_name] = {
112
+ 'mean': float(np.mean(test_scores)),
113
+ 'std': float(np.std(test_scores)),
114
+ 'min': float(np.min(test_scores)),
115
+ 'max': float(np.max(test_scores)),
116
+ 'scores': test_scores.tolist()
117
+ }
118
+
119
+ if train_key in cv_scores:
120
+ train_scores = cv_scores[train_key]
121
+ cv_results['train_scores'][metric_name] = {
122
+ 'mean': float(np.mean(train_scores)),
123
+ 'std': float(np.std(train_scores)),
124
+ 'scores': train_scores.tolist()
125
+ }
126
+
127
+ # Individual fold results
128
+ for fold_idx in range(cv_strategy.n_splits):
129
+ fold_result = {
130
+ 'fold': fold_idx + 1,
131
+ 'test_scores': {},
132
+ 'train_scores': {}
133
+ }
134
+
135
+ for metric_name in scoring_metrics.keys():
136
+ test_key = f'test_{metric_name}'
137
+ train_key = f'train_{metric_name}'
138
+
139
+ if test_key in cv_scores:
140
+ fold_result['test_scores'][metric_name] = float(cv_scores[test_key][fold_idx])
141
+ if train_key in cv_scores:
142
+ fold_result['train_scores'][metric_name] = float(cv_scores[train_key][fold_idx])
143
+
144
+ cv_results['fold_results'].append(fold_result)
145
+
146
+ # Calculate overfitting and stability scores
147
+ if 'accuracy' in cv_results['test_scores'] and 'accuracy' in cv_results['train_scores']:
148
+ train_mean = cv_results['train_scores']['accuracy']['mean']
149
+ test_mean = cv_results['test_scores']['accuracy']['mean']
150
+ cv_results['overfitting_score'] = float(train_mean - test_mean)
151
+
152
+ test_std = cv_results['test_scores']['accuracy']['std']
153
+ cv_results['stability_score'] = float(1 - (test_std / test_mean)) if test_mean > 0 else 0
154
+
155
+ return cv_results
156
+
157
+ except Exception as e:
158
+ logger.error(f"CV evaluation failed: {e}")
159
+ return {'error': str(e), 'n_splits': cv_strategy.n_splits}
160
+
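For readers unfamiliar with cross_validate: given a dict of scorers and return_train_score=True, it returns arrays keyed test_<name> and train_<name>, which is exactly what perform_model_cv_evaluation unpacks above (the stability score is then 1 minus the coefficient of variation of the test accuracy). A self-contained toy example, not using the project's pipeline:

    # Toy demonstration of the cross_validate output consumed above.
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold, cross_validate

    X, y = make_classification(n_samples=200, random_state=0)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_validate(
        LogisticRegression(max_iter=1000), X, y,
        cv=cv, scoring={"f1": "f1_weighted"}, return_train_score=True,
    )
    print(sorted(scores))   # ['fit_time', 'score_time', 'test_f1', 'train_f1']
    print(scores["test_f1"].mean(), scores["test_f1"].std())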
161
+ def compare_models_with_cv(self, model1, model2, X, y, model1_name="Production", model2_name="Candidate") -> Dict:
162
+ """Compare two models using cross-validation and statistical tests"""
163
+
164
+ logger.info(f"Comparing {model1_name} vs {model2_name} models using CV...")
165
+
166
+ try:
167
+ cv_strategy = self.create_cv_strategy(X, y)
168
+
169
+ # Evaluate both models with same CV folds
170
+ results1 = self.perform_model_cv_evaluation(model1, X, y, cv_strategy)
171
+ results2 = self.perform_model_cv_evaluation(model2, X, y, cv_strategy)
172
+
173
+ if 'error' in results1 or 'error' in results2:
174
+ return {
175
+ 'error': 'One or both models failed CV evaluation',
176
+ 'model1_results': results1,
177
+ 'model2_results': results2
178
+ }
179
+
180
+ # Statistical comparison
181
+ comparison_results = {
182
+ 'model1_name': model1_name,
183
+ 'model2_name': model2_name,
184
+ 'cv_folds': cv_strategy.n_splits,
185
+ 'model1_cv_results': results1,
186
+ 'model2_cv_results': results2,
187
+ 'statistical_tests': {},
188
+ 'metric_comparisons': {}
189
+ }
190
+
191
+ # Compare each metric
192
+ for metric in ['accuracy', 'f1', 'precision', 'recall']:
193
+ if (metric in results1['test_scores'] and
194
+ metric in results2['test_scores']):
195
+
196
+ scores1 = results1['test_scores'][metric]['scores']
197
+ scores2 = results2['test_scores'][metric]['scores']
198
+
199
+ metric_comparison = self._compare_metric_scores(
200
+ scores1, scores2, metric, model1_name, model2_name
201
+ )
202
+ comparison_results['metric_comparisons'][metric] = metric_comparison
203
+
204
+ # Overall recommendation
205
+ f1_comparison = comparison_results['metric_comparisons'].get('f1', {})
206
+ accuracy_comparison = comparison_results['metric_comparisons'].get('accuracy', {})
207
+
208
+ # Decision logic for model promotion
209
+ promote_candidate = False
210
+ promotion_reason = ""
211
+
212
+ if f1_comparison.get('significant_improvement', False):
213
+ promote_candidate = True
214
+ promotion_reason = f"Significant F1 improvement: {f1_comparison.get('improvement', 0):.4f}"
215
+ elif (f1_comparison.get('improvement', 0) > 0.01 and
216
+ accuracy_comparison.get('improvement', 0) > 0.01):
217
+ promote_candidate = True
218
+ promotion_reason = "Practical improvement in both F1 and accuracy"
219
+ elif f1_comparison.get('improvement', 0) > 0.02:
220
+ promote_candidate = True
221
+ promotion_reason = f"Large F1 improvement: {f1_comparison.get('improvement', 0):.4f}"
222
+ else:
223
+ promotion_reason = "No significant improvement detected"
224
+
225
+ comparison_results['promotion_decision'] = {
226
+ 'promote_candidate': promote_candidate,
227
+ 'reason': promotion_reason,
228
+ 'confidence': self._calculate_decision_confidence(comparison_results)
229
+ }
230
+
231
+ logger.info(f"Model comparison completed: {promotion_reason}")
232
+ return comparison_results
233
+
234
+ except Exception as e:
235
+ logger.error(f"Model comparison failed: {e}")
236
+ return {'error': str(e)}
237
+
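A sketch of how the comparator is driven end to end; prod_pipeline, candidate_pipeline, X and y are placeholders here, and in the committed code this call is made from RobustModelRetrainer.compare_models_with_cv_validation:

    # Hypothetical standalone usage of CVModelComparator.
    comparator = CVModelComparator(cv_folds=5, random_state=42)
    results = comparator.compare_models_with_cv(
        prod_pipeline, candidate_pipeline, X, y,
        model1_name="Production", model2_name="Candidate",
    )
    decision = results.get("promotion_decision", {})
    print(decision.get("promote_candidate"), decision.get("reason"))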
238
+ def _compare_metric_scores(self, scores1: list, scores2: list, metric: str,
239
+ model1_name: str, model2_name: str) -> Dict:
240
+ """Compare metric scores between two models using statistical tests"""
241
+
242
+ try:
243
+ # Basic statistics
244
+ mean1, mean2 = np.mean(scores1), np.mean(scores2)
245
+ std1, std2 = np.std(scores1), np.std(scores2)
246
+ improvement = mean2 - mean1
247
+
248
+ comparison = {
249
+ 'metric': metric,
250
+ f'{model1_name.lower()}_mean': float(mean1),
251
+ f'{model2_name.lower()}_mean': float(mean2),
252
+ f'{model1_name.lower()}_std': float(std1),
253
+ f'{model2_name.lower()}_std': float(std2),
254
+ 'improvement': float(improvement),
255
+ 'relative_improvement': float(improvement / mean1 * 100) if mean1 > 0 else 0,
256
+ 'tests': {}
257
+ }
258
+
259
+ # Paired t-test
260
+ try:
261
+ t_stat, p_value = stats.ttest_rel(scores2, scores1)
262
+ comparison['tests']['paired_ttest'] = {
263
+ 't_statistic': float(t_stat),
264
+ 'p_value': float(p_value),
265
+ 'significant': p_value < 0.05
266
+ }
267
+ except Exception as e:
268
+ logger.warning(f"Paired t-test failed for {metric}: {e}")
269
+
270
+ # Wilcoxon signed-rank test (non-parametric alternative)
271
+ try:
272
+ w_stat, w_p_value = stats.wilcoxon(scores2, scores1, alternative='greater')
273
+ comparison['tests']['wilcoxon'] = {
274
+ 'statistic': float(w_stat),
275
+ 'p_value': float(w_p_value),
276
+ 'significant': w_p_value < 0.05
277
+ }
278
+ except Exception as e:
279
+ logger.warning(f"Wilcoxon test failed for {metric}: {e}")
280
+
281
+ # Effect size (Cohen's d)
282
+ try:
283
+ pooled_std = np.sqrt(((len(scores1) - 1) * std1**2 + (len(scores2) - 1) * std2**2) /
284
+ (len(scores1) + len(scores2) - 2))
285
+ cohens_d = improvement / pooled_std if pooled_std > 0 else 0
286
+ comparison['effect_size'] = float(cohens_d)
287
+ except Exception:
288
+ comparison['effect_size'] = 0
289
+
290
+ # Practical significance
291
+ practical_threshold = 0.01 # 1% improvement threshold
292
+ comparison['practical_significance'] = abs(improvement) > practical_threshold
293
+ comparison['significant_improvement'] = (
294
+ improvement > practical_threshold and
295
+ comparison['tests'].get('paired_ttest', {}).get('significant', False)
296
+ )
297
+
298
+ return comparison
299
+
300
+ except Exception as e:
301
+ logger.error(f"Metric comparison failed for {metric}: {e}")
302
+ return {'metric': metric, 'error': str(e)}
303
+
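The two tests above act on paired per-fold scores; a tiny worked example with five invented fold scores, where the mean improvement is 0.028:

    # Paired comparison of per-fold F1 scores, as in _compare_metric_scores.
    from scipy import stats

    prod_f1 = [0.80, 0.82, 0.79, 0.81, 0.80]    # hypothetical fold scores
    cand_f1 = [0.83, 0.84, 0.82, 0.84, 0.83]

    t_stat, p_two_sided = stats.ttest_rel(cand_f1, prod_f1)
    w_stat, p_greater = stats.wilcoxon(cand_f1, prod_f1, alternative="greater")
    improvement = sum(cand_f1) / 5 - sum(prod_f1) / 5   # 0.028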
304
+ def _calculate_decision_confidence(self, comparison_results: Dict) -> float:
305
+ """Calculate confidence in the promotion decision"""
306
+
307
+ try:
308
+ confidence_factors = []
309
+
310
+ # Check F1 improvement significance
311
+ f1_comp = comparison_results['metric_comparisons'].get('f1', {})
312
+ if f1_comp.get('significant_improvement', False):
313
+ confidence_factors.append(0.4)
314
+ elif f1_comp.get('improvement', 0) > 0.01:
315
+ confidence_factors.append(0.2)
316
+
317
+ # Check consistency across metrics
318
+ improved_metrics = 0
319
+ total_metrics = 0
320
+ for metric_comp in comparison_results['metric_comparisons'].values():
321
+ if isinstance(metric_comp, dict) and 'improvement' in metric_comp:
322
+ total_metrics += 1
323
+ if metric_comp['improvement'] > 0:
324
+ improved_metrics += 1
325
+
326
+ if total_metrics > 0:
327
+ consistency_score = improved_metrics / total_metrics
328
+ confidence_factors.append(consistency_score * 0.3)
329
+
330
+ # Check effect sizes
331
+ effect_sizes = []
332
+ for metric_comp in comparison_results['metric_comparisons'].values():
333
+ if isinstance(metric_comp, dict) and 'effect_size' in metric_comp:
334
+ effect_sizes.append(abs(metric_comp['effect_size']))
335
+
336
+ if effect_sizes:
337
+ avg_effect_size = np.mean(effect_sizes)
338
+ if avg_effect_size > 0.5: # Large effect
339
+ confidence_factors.append(0.2)
340
+ elif avg_effect_size > 0.2: # Medium effect
341
+ confidence_factors.append(0.1)
342
+
343
+ # Calculate final confidence
344
+ total_confidence = sum(confidence_factors)
345
+ return min(1.0, max(0.0, total_confidence))
346
+
347
+ except Exception as e:
348
+ logger.warning(f"Confidence calculation failed: {e}")
349
+ return 0.5
350
+
351
+
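The additive confidence heuristic above can contribute at most 0.4 (statistically significant F1 gain) plus 0.3 (every compared metric improved) plus 0.2 (large average effect size), i.e. 0.9 before clipping to [0, 1]. For example:

    # Worked example of the confidence heuristic.
    confidence_factors = [0.4,          # significant F1 improvement
                          4 / 4 * 0.3,  # all four compared metrics improved
                          0.2]          # average |Cohen's d| above 0.5
    confidence = min(1.0, max(0.0, sum(confidence_factors)))   # 0.9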
352
  class RobustModelRetrainer:
353
+ """Production-ready model retraining with comprehensive CV and statistical validation"""
354
 
355
  def __init__(self):
356
  self.setup_paths()
357
  self.setup_retraining_config()
358
  self.setup_statistical_tests()
359
+ self.cv_comparator = CVModelComparator()
360
 
361
  def setup_paths(self):
362
  """Setup all necessary paths"""
 
395
  self.min_new_samples = 50
396
  self.improvement_threshold = 0.01 # 1% improvement required
397
  self.significance_level = 0.05
398
+ self.cv_folds = 5 # Increased for better validation
399
  self.test_size = 0.2
400
  self.random_state = 42
401
  self.max_retries = 3
 
404
  def setup_statistical_tests(self):
405
  """Setup statistical test configurations"""
406
  self.statistical_tests = {
 
407
  'paired_ttest': {'alpha': 0.05, 'name': "Paired T-Test"},
408
+ 'wilcoxon': {'alpha': 0.05, 'name': "Wilcoxon Signed-Rank Test"},
409
+ 'mcnemar': {'alpha': 0.05, 'name': "McNemar's Test"}
410
  }
411
 
412
  def load_existing_metadata(self) -> Optional[Dict]:
 
560
  return pipeline
561
 
562
  def train_candidate_model(self, df: pd.DataFrame) -> Tuple[bool, Optional[Any], Dict]:
563
+ """Train candidate model with comprehensive CV evaluation"""
564
  try:
565
+ logger.info("Training candidate model with cross-validation...")
566
 
567
  # Prepare data
568
  X = df['text'].values
569
  y = df['label'].values
570
 
 
 
 
 
 
 
571
  # Create and train pipeline
572
  pipeline = self.create_advanced_pipeline()
 
573
 
574
+ # Perform cross-validation before final training
575
+ logger.info("Performing cross-validation on candidate model...")
576
+ cv_results = self.cv_comparator.perform_model_cv_evaluation(pipeline, X, y)
577
 
578
+ # Train on full dataset for final model
579
+ pipeline.fit(X, y)
 
 
580
 
581
+ # Additional holdout evaluation
582
+ X_train, X_test, y_train, y_test = train_test_split(
583
+ X, y, test_size=self.test_size, stratify=y, random_state=self.random_state
584
+ )
585
 
586
+ pipeline_holdout = self.create_advanced_pipeline()
587
+ pipeline_holdout.fit(X_train, y_train)
588
 
589
+ # Evaluate on holdout
590
+ y_pred = pipeline_holdout.predict(X_test)
591
+ y_pred_proba = pipeline_holdout.predict_proba(X_test)[:, 1]
592
 
593
+ holdout_metrics = {
 
594
  'accuracy': float(accuracy_score(y_test, y_pred)),
595
  'precision': float(precision_score(y_test, y_pred, average='weighted')),
596
  'recall': float(recall_score(y_test, y_pred, average='weighted')),
597
  'f1': float(f1_score(y_test, y_pred, average='weighted')),
598
+ 'roc_auc': float(roc_auc_score(y_test, y_pred_proba))
 
 
599
  }
600
 
601
+ # Combine CV and holdout results
602
+ evaluation_results = {
603
+ 'cross_validation': cv_results,
604
+ 'holdout_evaluation': holdout_metrics,
605
+ 'training_samples': len(X),
606
+ 'test_samples': len(X_test)
607
+ }
608
 
609
+ # Save candidate model
610
+ joblib.dump(pipeline, self.candidate_pipeline_path)
611
+ if hasattr(pipeline, 'named_steps'):
612
+ joblib.dump(pipeline.named_steps['model'], self.candidate_model_path)
613
+ joblib.dump(pipeline.named_steps['vectorize'], self.candidate_vectorizer_path)
614
+
615
+ # Log results
616
+ if 'test_scores' in cv_results and 'f1' in cv_results['test_scores']:
617
+ cv_f1_mean = cv_results['test_scores']['f1']['mean']
618
+ cv_f1_std = cv_results['test_scores']['f1']['std']
619
+ logger.info(f"Candidate model CV F1: {cv_f1_mean:.4f} (±{cv_f1_std:.4f})")
620
+
621
+ logger.info(f"Candidate model holdout F1: {holdout_metrics['f1']:.4f}")
622
+ logger.info(f"Candidate model training completed")
623
+
624
+ return True, pipeline, evaluation_results
625
 
626
  except Exception as e:
627
+ error_msg = f"Candidate model training failed: {str(e)}"
628
+ logger.error(error_msg)
629
+ return False, None, {'error': error_msg}
630
 
631
+ def compare_models_with_cv_validation(self, prod_model, candidate_model, X, y) -> Dict:
632
+ """Compare models using comprehensive cross-validation"""
633
+
634
+ logger.info("Performing comprehensive model comparison with CV...")
635
+
636
  try:
637
+ # Use the CV comparator for detailed analysis
638
+ comparison_results = self.cv_comparator.compare_models_with_cv(
639
+ prod_model, candidate_model, X, y, "Production", "Candidate"
640
+ )
 
641
 
642
+ if 'error' in comparison_results:
643
+ return comparison_results
 
644
 
645
+ # Additional legacy format for backward compatibility
646
+ legacy_comparison = {
647
+ 'production_cv_results': comparison_results['model1_cv_results'],
648
+ 'candidate_cv_results': comparison_results['model2_cv_results'],
649
+ 'statistical_tests': comparison_results['statistical_tests'],
650
+ 'promotion_decision': comparison_results['promotion_decision']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
  }
652
 
653
+ # Extract key metrics for legacy format
654
+ prod_cv = comparison_results['model1_cv_results']
655
+ cand_cv = comparison_results['model2_cv_results']
656
+
657
+ if 'test_scores' in prod_cv and 'test_scores' in cand_cv:
658
+ if 'accuracy' in prod_cv['test_scores'] and 'accuracy' in cand_cv['test_scores']:
659
+ legacy_comparison.update({
660
+ 'production_accuracy': prod_cv['test_scores']['accuracy']['mean'],
661
+ 'candidate_accuracy': cand_cv['test_scores']['accuracy']['mean'],
662
+ 'absolute_improvement': (cand_cv['test_scores']['accuracy']['mean'] -
663
+ prod_cv['test_scores']['accuracy']['mean']),
664
+ 'relative_improvement': ((cand_cv['test_scores']['accuracy']['mean'] -
665
+ prod_cv['test_scores']['accuracy']['mean']) /
666
+ prod_cv['test_scores']['accuracy']['mean'] * 100)
667
+ })
668
+
669
+ # Merge detailed and legacy formats
670
+ final_results = {**comparison_results, **legacy_comparison}
671
+
672
+ # Log summary
673
+ f1_comp = comparison_results.get('metric_comparisons', {}).get('f1', {})
674
+ if f1_comp:
675
+ logger.info(f"F1 improvement: {f1_comp.get('improvement', 0):.4f}")
676
+ logger.info(f"Significant improvement: {f1_comp.get('significant_improvement', False)}")
677
+
678
+ promotion_decision = comparison_results.get('promotion_decision', {})
679
+ logger.info(f"Promotion recommendation: {promotion_decision.get('promote_candidate', False)}")
680
+ logger.info(f"Reason: {promotion_decision.get('reason', 'Unknown')}")
681
+
682
+ return final_results
683
 
684
  except Exception as e:
685
+ logger.error(f"Model comparison failed: {str(e)}")
686
  return {'error': str(e)}
687
 
688
  def create_backup(self) -> bool:
 
712
  return False
713
 
714
  def promote_candidate_model(self, candidate_model, candidate_metrics: Dict, comparison_results: Dict) -> bool:
715
+ """Promote candidate model to production with enhanced metadata"""
716
  try:
717
  logger.info("Promoting candidate model to production...")
718
 
 
726
  shutil.copy2(self.candidate_vectorizer_path, self.prod_vectorizer_path)
727
  shutil.copy2(self.candidate_pipeline_path, self.prod_pipeline_path)
728
 
729
+ # Update metadata with comprehensive CV information
730
  metadata = self.load_existing_metadata() or {}
731
 
732
  # Increment version
 
740
  else:
741
  new_version = f"v1.{int(datetime.now().timestamp()) % 1000}"
742
 
743
+ # Extract metrics from candidate evaluation
744
+ cv_results = candidate_metrics.get('cross_validation', {})
745
+ holdout_results = candidate_metrics.get('holdout_evaluation', {})
746
+
747
+ # Update metadata with comprehensive information
748
  metadata.update({
749
  'model_version': new_version,
750
+ 'model_type': 'retrained_pipeline_cv',
751
'previous_version': old_version,
 
752
  'promotion_timestamp': datetime.now().isoformat(),
753
+ 'retrain_trigger': 'cv_validated_retrain',
754
+ 'training_samples': candidate_metrics.get('training_samples', 'Unknown'),
755
+ 'test_samples': candidate_metrics.get('test_samples', 'Unknown')
756
  })
757
 
758
+ # Add holdout evaluation results
759
+ if holdout_results:
760
+ metadata.update({
761
+ 'test_accuracy': holdout_results.get('accuracy', 'Unknown'),
762
+ 'test_f1': holdout_results.get('f1', 'Unknown'),
763
+ 'test_precision': holdout_results.get('precision', 'Unknown'),
764
+ 'test_recall': holdout_results.get('recall', 'Unknown'),
765
+ 'test_roc_auc': holdout_results.get('roc_auc', 'Unknown')
766
+ })
767
+
768
+ # Add comprehensive CV results
769
+ if cv_results and 'test_scores' in cv_results:
770
+ metadata['cross_validation'] = {
771
+ 'n_splits': cv_results.get('n_splits', self.cv_folds),
772
+ 'test_scores': cv_results['test_scores'],
773
+ 'train_scores': cv_results.get('train_scores', {}),
774
+ 'overfitting_score': cv_results.get('overfitting_score', 'Unknown'),
775
+ 'stability_score': cv_results.get('stability_score', 'Unknown'),
776
+ 'individual_fold_results': cv_results.get('fold_results', [])
777
+ }
778
+
779
+ # Add CV summary statistics
780
+ if 'f1' in cv_results['test_scores']:
781
+ metadata.update({
782
+ 'cv_f1_mean': cv_results['test_scores']['f1']['mean'],
783
+ 'cv_f1_std': cv_results['test_scores']['f1']['std'],
784
+ 'cv_f1_min': cv_results['test_scores']['f1']['min'],
785
+ 'cv_f1_max': cv_results['test_scores']['f1']['max']
786
+ })
787
+
788
+ # Add model comparison results
789
+ promotion_decision = comparison_results.get('promotion_decision', {})
790
+ metadata['promotion_validation'] = {
791
+ 'decision_confidence': promotion_decision.get('confidence', 'Unknown'),
792
+ 'promotion_reason': promotion_decision.get('reason', 'Unknown'),
793
+ 'comparison_method': 'cross_validation_statistical_tests'
794
+ }
795
+
796
+ # Add statistical test results
797
+ metric_comparisons = comparison_results.get('metric_comparisons', {})
798
+ if metric_comparisons:
799
+ metadata['statistical_validation'] = {}
800
+ for metric, comparison in metric_comparisons.items():
801
+ if isinstance(comparison, dict):
802
+ metadata['statistical_validation'][metric] = {
803
+ 'improvement': comparison.get('improvement', 0),
804
+ 'significant_improvement': comparison.get('significant_improvement', False),
805
+ 'effect_size': comparison.get('effect_size', 0),
806
+ 'tests': comparison.get('tests', {})
807
+ }
808
+
809
  # Save updated metadata
810
  with open(self.metadata_path, 'w') as f:
811
  json.dump(metadata, f, indent=2)
812
 
813
  logger.info(f"Model promoted successfully to {new_version}")
814
+ logger.info(f"Promotion reason: {promotion_decision.get('reason', 'CV validation passed')}")
815
  return True
816
 
817
  except Exception as e:
 
819
  return False
820
 
821
  def log_retraining_session(self, results: Dict):
822
+ """Log comprehensive retraining session results"""
823
  try:
824
  log_entry = {
825
  'timestamp': datetime.now().isoformat(),
826
  'results': results,
827
+ 'session_id': hashlib.md5(str(datetime.now()).encode()).hexdigest()[:8],
828
+ 'retraining_type': 'cv_enhanced'
829
  }
830
 
831
  # Load existing logs
 
848
  with open(self.retraining_log_path, 'w') as f:
849
  json.dump(logs, f, indent=2)
850
 
851
+ # Also save detailed comparison results
852
+ if 'comparison_results' in results:
853
+ comparison_logs = []
854
+ if self.comparison_log_path.exists():
855
+ try:
856
+ with open(self.comparison_log_path, 'r') as f:
857
+ comparison_logs = json.load(f)
858
+ except:
859
+ comparison_logs = []
860
+
861
+ comparison_entry = {
862
+ 'timestamp': datetime.now().isoformat(),
863
+ 'session_id': log_entry['session_id'],
864
+ 'comparison_details': results['comparison_results']
865
+ }
866
+
867
+ comparison_logs.append(comparison_entry)
868
+ if len(comparison_logs) > 50:
869
+ comparison_logs = comparison_logs[-50:]
870
+
871
+ with open(self.comparison_log_path, 'w') as f:
872
+ json.dump(comparison_logs, f, indent=2)
873
+
874
  except Exception as e:
875
  logger.error(f"Failed to log retraining session: {str(e)}")
876
 
877
  def retrain_model(self) -> Tuple[bool, str]:
878
+ """Main retraining function with comprehensive CV validation"""
879
  try:
880
+ logger.info("Starting enhanced model retraining with cross-validation...")
881
 
882
  # Load existing metadata
883
  existing_metadata = self.load_existing_metadata()
 
900
  if len(df) < self.min_new_samples:
901
  return False, f"Insufficient new data: {len(df)} < {self.min_new_samples}"
902
 
903
+ # Train candidate model with CV
904
  candidate_success, candidate_model, candidate_metrics = self.train_candidate_model(df)
905
  if not candidate_success:
906
  return False, f"Candidate training failed: {candidate_metrics.get('error', 'Unknown error')}"
907
 
908
+ # Prepare data for model comparison
909
  X = df['text'].values
910
y = df['label'].values
911
 
912
+ # Comprehensive model comparison with CV
913
+ comparison_results = self.compare_models_with_cv_validation(
914
+ prod_model, candidate_model, X, y
915
  )
916
 
917
  # Log results
 
919
  'candidate_metrics': candidate_metrics,
920
  'comparison_results': comparison_results,
921
  'data_size': len(df),
922
+ 'cv_folds': self.cv_folds,
923
+ 'retraining_method': 'cv_enhanced'
924
  }
925
 
926
  self.log_retraining_session(session_results)
927
 
928
+ # Decision based on CV comparison
929
+ promotion_decision = comparison_results.get('promotion_decision', {})
930
+ should_promote = promotion_decision.get('promote_candidate', False)
 
 
931
 
932
  if should_promote:
933
  # Promote candidate model
 
936
  )
937
 
938
  if promotion_success:
939
+ # Extract improvement information
940
+ f1_comp = comparison_results.get('metric_comparisons', {}).get('f1', {})
941
+ improvement = f1_comp.get('improvement', 0)
942
+ confidence = promotion_decision.get('confidence', 0)
943
+
944
  success_msg = (
945
+ f"Model promoted successfully with CV validation! "
946
+ f"F1 improvement: {improvement:.4f}, "
947
+ f"Confidence: {confidence:.2f}, "
948
+ f"Reason: {promotion_decision.get('reason', 'CV validation passed')}"
949
  )
950
  logger.info(success_msg)
951
  return True, success_msg
 
953
  return False, "Model promotion failed"
954
  else:
955
  # Keep current model
956
+ reason = promotion_decision.get('reason', 'No significant improvement detected')
957
+ confidence = promotion_decision.get('confidence', 0)
958
+
959
  keep_msg = (
960
+ f"Keeping current model based on CV analysis. "
961
+ f"Reason: {reason}, "
962
+ f"Confidence: {confidence:.2f}"
963
  )
964
  logger.info(keep_msg)
965
  return True, keep_msg
966
 
967
  except Exception as e:
968
+ error_msg = f"Enhanced model retraining failed: {str(e)}"
969
  logger.error(error_msg)
970
  return False, error_msg
971
 
972
  def main():
973
+ """Main execution function with CV enhancements"""
974
  retrainer = RobustModelRetrainer()
975
  success, message = retrainer.retrain_model()
976