Ahmedik95316 committed on
Commit 6b4cc07 · 1 Parent(s): 98906e6

Create statistical_analysis.py


Advanced Statistical Analysis (see the usage sketch below):
- Bootstrap confidence intervals for all performance metrics
- Feature importance stability analysis with coefficient of variation
- Comprehensive cross-validation with normality testing and overfitting detection
- Pairwise model comparisons with effect size calculations (Cohen's d)
- Statistical significance testing (paired t-tests, Wilcoxon tests)
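
A minimal usage sketch of the new module (a sketch only: it assumes scikit-learn is installed and the package is importable as utils.statistical_analysis; the toy data and the reduced bootstrap count are illustrative, not part of this commit):

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import f1_score
    from utils.statistical_analysis import BootstrapAnalyzer, MLOpsStatisticalAnalyzer

    # Toy binary-classification data (illustrative only)
    rng = np.random.RandomState(0)
    X = rng.randn(300, 8)
    y = (X[:, 0] + X[:, 1] > 0).astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Bootstrap confidence interval for a single held-out metric
    model = LogisticRegression(random_state=42).fit(X_train, y_train)
    ci = BootstrapAnalyzer(n_bootstrap=200).bootstrap_metric(
        y_test, model.predict(X_test),
        lambda yt, yp: f1_score(yt, yp, average="weighted"))
    print(ci.point_estimate, ci.confidence_interval)

    # Full comparative analysis across candidate models
    analyzer = MLOpsStatisticalAnalyzer(n_bootstrap=200)
    report = analyzer.comprehensive_model_analysis(
        {"logistic_regression": LogisticRegression(random_state=42),
         "random_forest": RandomForestClassifier(n_estimators=50, random_state=42)},
        X_train, X_test, y_train, y_test)
    print(report["recommendations"])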

Files changed (1)
  1. utils/statistical_analysis.py +1225 -0
utils/statistical_analysis.py ADDED
@@ -0,0 +1,1225 @@
1
+ # utils/statistical_analysis.py
2
+ # Advanced statistical analysis for Data Science grade enhancement (B+ → A-)
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ from scipy import stats
7
+ from scipy.stats import bootstrap
8
+ import warnings
9
+ from typing import Dict, List, Tuple, Optional, Any, Union, Callable
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+ import json
13
+ from datetime import datetime
14
+ import logging
15
+
16
+ # Import structured logging if available
17
+ try:
18
+ from .structured_logger import StructuredLogger, EventType, MLOpsLoggers
19
+ STRUCTURED_LOGGING_AVAILABLE = True
20
+ except ImportError:
21
+ STRUCTURED_LOGGING_AVAILABLE = False
22
+ import logging
23
+
24
+ warnings.filterwarnings('ignore')
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ @dataclass
30
+ class StatisticalResult:
31
+ """Container for statistical analysis results with uncertainty quantification"""
32
+ point_estimate: float
33
+ confidence_interval: Tuple[float, float]
34
+ confidence_level: float
35
+ method: str
36
+ sample_size: int
37
+ metadata: Dict[str, Any] = None
38
+
39
+ def __post_init__(self):
40
+ if self.metadata is None:
41
+ self.metadata = {}
42
+
43
+ def to_dict(self) -> Dict[str, Any]:
44
+ """Convert to dictionary for serialization"""
45
+ return {
46
+ 'point_estimate': float(self.point_estimate),
47
+ 'confidence_interval': [float(self.confidence_interval[0]), float(self.confidence_interval[1])],
48
+ 'confidence_level': float(self.confidence_level),
49
+ 'method': self.method,
50
+ 'sample_size': int(self.sample_size),
51
+ 'metadata': self.metadata,
52
+ 'timestamp': datetime.now().isoformat()
53
+ }
54
+
55
+ def margin_of_error(self) -> float:
56
+ """Calculate margin of error from confidence interval"""
57
+ return (self.confidence_interval[1] - self.confidence_interval[0]) / 2
58
+
59
+ def is_significant_improvement_over(self, baseline_value: float) -> bool:
60
+ """Check if improvement over baseline is statistically significant"""
61
+ return self.confidence_interval[0] > baseline_value
62
+
63
+
64
+ class BootstrapAnalyzer:
65
+ """Advanced bootstrap analysis for model performance uncertainty quantification"""
66
+
67
+ def __init__(self,
68
+ n_bootstrap: int = 1000,
69
+ confidence_level: float = 0.95,
70
+ random_state: int = 42):
71
+ self.n_bootstrap = n_bootstrap
72
+ self.confidence_level = confidence_level
73
+ self.random_state = random_state
74
+ self.rng = np.random.RandomState(random_state)
75
+
76
+ if STRUCTURED_LOGGING_AVAILABLE:
77
+ self.logger = MLOpsLoggers.get_logger('statistical_analysis')
78
+ else:
79
+ self.logger = logging.getLogger(__name__)
80
+
81
+ def bootstrap_metric(self,
82
+ y_true: np.ndarray,
83
+ y_pred: np.ndarray,
84
+ metric_func: Callable,
85
+ stratify: bool = True) -> StatisticalResult:
86
+ """
87
+ Bootstrap confidence interval for any metric function
88
+
89
+ Args:
90
+ y_true: True labels
91
+ y_pred: Predicted labels or probabilities
92
+ metric_func: Function that takes (y_true, y_pred) and returns metric
93
+ stratify: Whether to use stratified bootstrap sampling
94
+ """
95
+
96
+ n_samples = len(y_true)
97
+ bootstrap_scores = []
98
+
99
+ # Original metric value
100
+ original_score = metric_func(y_true, y_pred)
101
+
102
+ for i in range(self.n_bootstrap):
103
+ # Bootstrap sampling
104
+ if stratify:
105
+ # Stratified bootstrap to maintain class distribution
106
+ indices = self._stratified_bootstrap_indices(y_true)
107
+ else:
108
+ indices = self.rng.choice(n_samples, size=n_samples, replace=True)
109
+
110
+ # Calculate metric on bootstrap sample
111
+ try:
112
+ bootstrap_score = metric_func(y_true[indices], y_pred[indices])
113
+ bootstrap_scores.append(bootstrap_score)
114
+ except Exception as e:
115
+ # Skip invalid bootstrap samples
116
+ continue
117
+
118
+ bootstrap_scores = np.array(bootstrap_scores)
119
+
120
+ # Calculate confidence interval
121
+ alpha = 1 - self.confidence_level
122
+ lower_percentile = (alpha / 2) * 100
123
+ upper_percentile = (1 - alpha / 2) * 100
124
+
125
+ ci_lower = np.percentile(bootstrap_scores, lower_percentile)
126
+ ci_upper = np.percentile(bootstrap_scores, upper_percentile)
127
+
128
+ return StatisticalResult(
129
+ point_estimate=original_score,
130
+ confidence_interval=(ci_lower, ci_upper),
131
+ confidence_level=self.confidence_level,
132
+ method='bootstrap',
133
+ sample_size=n_samples,
134
+ metadata={
135
+ 'n_bootstrap': self.n_bootstrap,
136
+ 'bootstrap_mean': float(np.mean(bootstrap_scores)),
137
+ 'bootstrap_std': float(np.std(bootstrap_scores)),
138
+ 'stratified': stratify,
139
+ 'valid_bootstraps': len(bootstrap_scores)
140
+ }
141
+ )
142
+
143
+ def _stratified_bootstrap_indices(self, y_true: np.ndarray) -> np.ndarray:
144
+ """Generate stratified bootstrap indices maintaining class distribution"""
145
+ indices = []
146
+ unique_classes, class_counts = np.unique(y_true, return_counts=True)
147
+
148
+ for class_label, count in zip(unique_classes, class_counts):
149
+ class_indices = np.where(y_true == class_label)[0]
150
+ bootstrap_indices = self.rng.choice(class_indices, size=count, replace=True)
151
+ indices.extend(bootstrap_indices)
152
+
153
+ return np.array(indices)
154
+
155
+ def bootstrap_model_comparison(self,
156
+ y_true: np.ndarray,
157
+ y_pred_1: np.ndarray,
158
+ y_pred_2: np.ndarray,
159
+ metric_func: Callable,
160
+ model_1_name: str = "Model 1",
161
+ model_2_name: str = "Model 2") -> Dict[str, Any]:
162
+ """
163
+ Bootstrap comparison between two models with statistical significance testing
164
+ """
165
+
166
+ n_samples = len(y_true)
167
+ differences = []
168
+
169
+ # Calculate original difference
170
+ score_1 = metric_func(y_true, y_pred_1)
171
+ score_2 = metric_func(y_true, y_pred_2)
172
+ original_difference = score_2 - score_1
173
+
174
+ # Bootstrap sampling for difference
175
+ for i in range(self.n_bootstrap):
176
+ indices = self.rng.choice(n_samples, size=n_samples, replace=True)
177
+
178
+ try:
179
+ boot_score_1 = metric_func(y_true[indices], y_pred_1[indices])
180
+ boot_score_2 = metric_func(y_true[indices], y_pred_2[indices])
181
+ differences.append(boot_score_2 - boot_score_1)
182
+ except Exception:
183
+ continue
184
+
185
+ differences = np.array(differences)
186
+
187
+ # Calculate confidence interval for difference
188
+ alpha = 1 - self.confidence_level
189
+ ci_lower = np.percentile(differences, (alpha / 2) * 100)
190
+ ci_upper = np.percentile(differences, (1 - alpha / 2) * 100)
191
+
192
+ # Statistical significance test
193
+ p_value_bootstrap = np.mean(differences <= 0) * 2 # Two-tailed test
194
+ is_significant = ci_lower > 0 or ci_upper < 0
195
+
196
+ # Effect size (Cohen's d)
197
+ pooled_std = np.sqrt((np.var(differences)) / 2)
198
+ cohens_d = original_difference / pooled_std if pooled_std > 0 else 0
199
+
200
+ return {
201
+ 'model_1_name': model_1_name,
202
+ 'model_2_name': model_2_name,
203
+ 'model_1_score': StatisticalResult(
204
+ point_estimate=score_1,
205
+ confidence_interval=(score_1 - np.std(differences), score_1 + np.std(differences)),
206
+ confidence_level=self.confidence_level,
207
+ method='bootstrap_individual',
208
+ sample_size=n_samples
209
+ ).to_dict(),
210
+ 'model_2_score': StatisticalResult(
211
+ point_estimate=score_2,
212
+ confidence_interval=(score_2 - np.std(differences), score_2 + np.std(differences)),
213
+ confidence_level=self.confidence_level,
214
+ method='bootstrap_individual',
215
+ sample_size=n_samples
216
+ ).to_dict(),
217
+ 'difference': StatisticalResult(
218
+ point_estimate=original_difference,
219
+ confidence_interval=(ci_lower, ci_upper),
220
+ confidence_level=self.confidence_level,
221
+ method='bootstrap_difference',
222
+ sample_size=n_samples,
223
+ metadata={
224
+ 'p_value_bootstrap': float(p_value_bootstrap),
225
+ 'is_significant': bool(is_significant),
226
+ 'effect_size_cohens_d': float(cohens_d),
227
+ 'bootstrap_mean_difference': float(np.mean(differences)),
228
+ 'bootstrap_std_difference': float(np.std(differences))
229
+ }
230
+ ).to_dict()
231
+ }
232
+
233
+
234
+ class FeatureImportanceAnalyzer:
235
+ """Advanced feature importance analysis with uncertainty quantification"""
236
+
237
+ def __init__(self,
238
+ n_bootstrap: int = 500,
239
+ confidence_level: float = 0.95,
240
+ random_state: int = 42):
241
+ self.n_bootstrap = n_bootstrap
242
+ self.confidence_level = confidence_level
243
+ self.random_state = random_state
244
+ self.rng = np.random.RandomState(random_state)
245
+
246
+ if STRUCTURED_LOGGING_AVAILABLE:
247
+ self.logger = MLOpsLoggers.get_logger('feature_importance')
248
+ else:
249
+ self.logger = logging.getLogger(__name__)
250
+
251
+ def analyze_importance_stability(self,
252
+ model,
253
+ X: np.ndarray,
254
+ y: np.ndarray,
255
+ feature_names: List[str] = None) -> Dict[str, Any]:
256
+ """
257
+ Analyze feature importance stability using bootstrap sampling
258
+ """
259
+
260
+ if feature_names is None:
261
+ feature_names = [f'feature_{i}' for i in range(X.shape[1])]
262
+
263
+ importance_samples = []
264
+
265
+ # Bootstrap sampling for importance stability
266
+ for i in range(self.n_bootstrap):
267
+ # Bootstrap sample
268
+ indices = self.rng.choice(len(X), size=len(X), replace=True)
269
+ X_boot = X[indices]
270
+ y_boot = y[indices]
271
+
272
+ try:
273
+ # Fit model on bootstrap sample
274
+ model_copy = self._clone_model(model)
275
+ model_copy.fit(X_boot, y_boot)
276
+
277
+ # Extract feature importances
278
+ if hasattr(model_copy, 'feature_importances_'):
279
+ importances = model_copy.feature_importances_
280
+ elif hasattr(model_copy, 'coef_'):
281
+ importances = np.abs(model_copy.coef_).flatten()
282
+ else:
283
+ # Use permutation importance as fallback
284
+ from sklearn.inspection import permutation_importance
285
+ perm_importance = permutation_importance(model_copy, X_boot, y_boot, n_repeats=5, random_state=self.random_state)
286
+ importances = perm_importance.importances_mean
287
+
288
+ importance_samples.append(importances)
289
+
290
+ except Exception as e:
291
+ continue
292
+
293
+ importance_samples = np.array(importance_samples)
294
+
295
+ # Calculate statistics for each feature
296
+ feature_stats = {}
297
+
298
+ for i, feature_name in enumerate(feature_names):
299
+ if i < importance_samples.shape[1]:
300
+ feature_importances = importance_samples[:, i]
301
+
302
+ # Calculate confidence interval
303
+ alpha = 1 - self.confidence_level
304
+ ci_lower = np.percentile(feature_importances, (alpha / 2) * 100)
305
+ ci_upper = np.percentile(feature_importances, (1 - alpha / 2) * 100)
306
+
307
+ # Stability metrics
308
+ cv_importance = np.std(feature_importances) / np.mean(feature_importances) if np.mean(feature_importances) > 0 else np.inf
309
+
310
+ feature_stats[feature_name] = StatisticalResult(
311
+ point_estimate=float(np.mean(feature_importances)),
312
+ confidence_interval=(float(ci_lower), float(ci_upper)),
313
+ confidence_level=self.confidence_level,
314
+ method='bootstrap_importance',
315
+ sample_size=len(importance_samples),
316
+ metadata={
317
+ 'coefficient_of_variation': float(cv_importance),
318
+ 'std_importance': float(np.std(feature_importances)),
319
+ 'min_importance': float(np.min(feature_importances)),
320
+ 'max_importance': float(np.max(feature_importances)),
321
+ 'stability_rank': None # Will be filled later
322
+ }
323
+ ).to_dict()
324
+
325
+ # Rank features by stability (lower CV = more stable)
326
+ sorted_features = sorted(
327
+ feature_stats.items(),
328
+ key=lambda x: x[1]['metadata']['coefficient_of_variation']
329
+ )
330
+
331
+ for rank, (feature_name, stats) in enumerate(sorted_features):
332
+ feature_stats[feature_name]['metadata']['stability_rank'] = rank + 1
333
+
334
+ return {
335
+ 'feature_importance_analysis': feature_stats,
336
+ 'stability_ranking': [name for name, _ in sorted_features],
337
+ 'analysis_metadata': {
338
+ 'n_bootstrap_samples': self.n_bootstrap,
339
+ 'confidence_level': self.confidence_level,
340
+ 'n_features_analyzed': len(feature_names),
341
+ 'valid_bootstrap_runs': len(importance_samples)
342
+ }
343
+ }
344
+
345
+ def _clone_model(self, model):
346
+ """Clone model for bootstrap sampling"""
347
+ from sklearn.base import clone
348
+ try:
349
+ return clone(model)
350
+ except Exception:
351
+ # Fallback: create new instance with same parameters
352
+ return type(model)(**model.get_params())
353
+
354
+ def permutation_importance_with_ci(self,
355
+ model,
356
+ X: np.ndarray,
357
+ y: np.ndarray,
358
+ scoring_func: Callable,
359
+ feature_names: List[str] = None,
360
+ n_repeats: int = 10) -> Dict[str, Any]:
361
+ """
362
+ Calculate permutation importance with confidence intervals
363
+ """
364
+
365
+ if feature_names is None:
366
+ feature_names = [f'feature_{i}' for i in range(X.shape[1])]
367
+
368
+ # Baseline score
369
+ baseline_score = scoring_func(model, X, y)
370
+
371
+ feature_importance_scores = {}
372
+
373
+ for feature_idx, feature_name in enumerate(feature_names):
374
+ importance_scores = []
375
+
376
+ # Multiple permutation rounds for each feature
377
+ for _ in range(n_repeats):
378
+ # Permute feature
379
+ X_permuted = X.copy()
380
+ X_permuted[:, feature_idx] = self.rng.permutation(X_permuted[:, feature_idx])
381
+
382
+ # Calculate score with permuted feature
383
+ permuted_score = scoring_func(model, X_permuted, y)
384
+ importance = baseline_score - permuted_score
385
+ importance_scores.append(importance)
386
+
387
+ # Calculate statistics
388
+ importance_scores = np.array(importance_scores)
389
+
390
+ alpha = 1 - self.confidence_level
391
+ ci_lower = np.percentile(importance_scores, (alpha / 2) * 100)
392
+ ci_upper = np.percentile(importance_scores, (1 - alpha / 2) * 100)
393
+
394
+ feature_importance_scores[feature_name] = StatisticalResult(
395
+ point_estimate=float(np.mean(importance_scores)),
396
+ confidence_interval=(float(ci_lower), float(ci_upper)),
397
+ confidence_level=self.confidence_level,
398
+ method='permutation_importance',
399
+ sample_size=n_repeats,
400
+ metadata={
401
+ 'baseline_score': float(baseline_score),
402
+ 'std_importance': float(np.std(importance_scores)),
403
+ 'is_statistically_important': float(ci_lower) > 0
404
+ }
405
+ ).to_dict()
406
+
407
+ return {
408
+ 'permutation_importance': feature_importance_scores,
409
+ 'baseline_score': float(baseline_score),
410
+ 'analysis_metadata': {
411
+ 'n_repeats': n_repeats,
412
+ 'confidence_level': self.confidence_level,
413
+ 'scoring_function': scoring_func.__name__ if hasattr(scoring_func, '__name__') else 'custom'
414
+ }
415
+ }
416
+
417
+
418
+ class AdvancedCrossValidation:
419
+ """Advanced cross-validation with comprehensive statistical reporting"""
420
+
421
+ def __init__(self,
422
+ cv_folds: int = 5,
423
+ n_bootstrap: int = 200,
424
+ confidence_level: float = 0.95,
425
+ random_state: int = 42):
426
+ self.cv_folds = cv_folds
427
+ self.n_bootstrap = n_bootstrap
428
+ self.confidence_level = confidence_level
429
+ self.random_state = random_state
430
+ self.bootstrap_analyzer = BootstrapAnalyzer(n_bootstrap, confidence_level, random_state)
431
+
432
+ if STRUCTURED_LOGGING_AVAILABLE:
433
+ self.logger = MLOpsLoggers.get_logger('cross_validation')
434
+ else:
435
+ self.logger = logging.getLogger(__name__)
436
+
437
+ def comprehensive_cv_analysis(self,
438
+ model,
439
+ X: np.ndarray,
440
+ y: np.ndarray,
441
+ scoring_metrics: Dict[str, Callable]) -> Dict[str, Any]:
442
+ """
443
+ Comprehensive cross-validation analysis with statistical significance testing
444
+ """
445
+
446
+ from sklearn.model_selection import cross_validate, StratifiedKFold
+ from sklearn.metrics import make_scorer
+
+ # Wrap the plain (y_true, y_pred) metric callables into sklearn scorer objects
+ scorers = {name: make_scorer(func) for name, func in scoring_metrics.items()}
447
+
448
+ # Setup CV strategy
449
+ cv_strategy = StratifiedKFold(
450
+ n_splits=self.cv_folds,
451
+ shuffle=True,
452
+ random_state=self.random_state
453
+ )
454
+
455
+ # Perform cross-validation
456
+ cv_results = cross_validate(
457
+ model, X, y,
458
+ cv=cv_strategy,
459
+ scoring=scorers,
460
+ return_train_score=True,
461
+ return_indices=True,
462
+ n_jobs=1
463
+ )
464
+
465
+ analysis_results = {
466
+ 'cv_folds': self.cv_folds,
467
+ 'metrics_analysis': {},
468
+ 'fold_analysis': [],
469
+ 'statistical_tests': {},
470
+ 'confidence_intervals': {}
471
+ }
472
+
473
+ # Analyze each metric
474
+ for metric_name, metric_func in scoring_metrics.items():
475
+ test_scores = cv_results[f'test_{metric_name}']
476
+ train_scores = cv_results[f'train_{metric_name}']
477
+
478
+ # Bootstrap confidence intervals for CV scores
479
+ test_ci = self._bootstrap_cv_scores(test_scores)
480
+ train_ci = self._bootstrap_cv_scores(train_scores)
481
+
482
+ # Statistical tests
483
+ statistical_tests = self._perform_cv_statistical_tests(test_scores, train_scores)
484
+
485
+ analysis_results['metrics_analysis'][metric_name] = {
486
+ 'test_scores': {
487
+ 'mean': float(np.mean(test_scores)),
488
+ 'std': float(np.std(test_scores)),
489
+ 'confidence_interval': test_ci,
490
+ 'scores': test_scores.tolist()
491
+ },
492
+ 'train_scores': {
493
+ 'mean': float(np.mean(train_scores)),
494
+ 'std': float(np.std(train_scores)),
495
+ 'confidence_interval': train_ci,
496
+ 'scores': train_scores.tolist()
497
+ },
498
+ 'overfitting_analysis': {
499
+ 'overfitting_score': float(np.mean(train_scores) - np.mean(test_scores)),
500
+ 'overfitting_ci': self._calculate_overfitting_ci(train_scores, test_scores)
501
+ },
502
+ 'statistical_tests': statistical_tests
503
+ }
504
+
505
+ # Fold-by-fold analysis
506
+ for fold_idx in range(self.cv_folds):
507
+ fold_analysis = {
508
+ 'fold': fold_idx + 1,
509
+ 'metrics': {}
510
+ }
511
+
512
+ for metric_name in scoring_metrics.keys():
513
+ fold_analysis['metrics'][metric_name] = {
514
+ 'test_score': float(cv_results[f'test_{metric_name}'][fold_idx]),
515
+ 'train_score': float(cv_results[f'train_{metric_name}'][fold_idx])
516
+ }
517
+
518
+ analysis_results['fold_analysis'].append(fold_analysis)
519
+
520
+ return analysis_results
521
+
522
+ def _bootstrap_cv_scores(self, scores: np.ndarray) -> Dict[str, float]:
523
+ """Bootstrap confidence interval for CV scores"""
524
+ bootstrap_means = []
525
+
526
+ for _ in range(self.n_bootstrap):
527
+ bootstrap_sample = np.random.choice(scores, size=len(scores), replace=True)
528
+ bootstrap_means.append(np.mean(bootstrap_sample))
529
+
530
+ alpha = 1 - self.confidence_level
531
+ ci_lower = np.percentile(bootstrap_means, (alpha / 2) * 100)
532
+ ci_upper = np.percentile(bootstrap_means, (1 - alpha / 2) * 100)
533
+
534
+ return {
535
+ 'lower': float(ci_lower),
536
+ 'upper': float(ci_upper),
537
+ 'confidence_level': self.confidence_level
538
+ }
539
+
540
+ def _perform_cv_statistical_tests(self, test_scores: np.ndarray, train_scores: np.ndarray) -> Dict[str, Any]:
541
+ """Perform statistical tests on CV results"""
542
+
543
+ tests = {}
544
+
545
+ # Test for overfitting using paired t-test
546
+ try:
547
+ t_stat, p_value = stats.ttest_rel(train_scores, test_scores)
548
+ tests['overfitting_ttest'] = {
549
+ 't_statistic': float(t_stat),
550
+ 'p_value': float(p_value),
551
+ 'significant_overfitting': p_value < 0.05 and t_stat > 0,
552
+ 'interpretation': 'Significant overfitting detected' if (p_value < 0.05 and t_stat > 0) else 'No significant overfitting'
553
+ }
554
+ except Exception as e:
555
+ tests['overfitting_ttest'] = {'error': str(e)}
556
+
557
+ # Normality test for CV scores
558
+ try:
559
+ shapiro_stat, shapiro_p = stats.shapiro(test_scores)
560
+ tests['normality_test'] = {
561
+ 'shapiro_statistic': float(shapiro_stat),
562
+ 'p_value': float(shapiro_p),
563
+ 'normally_distributed': shapiro_p > 0.05,
564
+ 'interpretation': 'CV scores are normally distributed' if shapiro_p > 0.05 else 'CV scores are not normally distributed'
565
+ }
566
+ except Exception as e:
567
+ tests['normality_test'] = {'error': str(e)}
568
+
569
+ # Stability test (coefficient of variation)
570
+ cv_coefficient = np.std(test_scores) / np.mean(test_scores) if np.mean(test_scores) > 0 else np.inf
571
+ tests['stability_analysis'] = {
572
+ 'coefficient_of_variation': float(cv_coefficient),
573
+ 'stability_interpretation': self._interpret_stability(cv_coefficient)
574
+ }
575
+
576
+ return tests
577
+
578
+ def _calculate_overfitting_ci(self, train_scores: np.ndarray, test_scores: np.ndarray) -> Dict[str, float]:
579
+ """Calculate confidence interval for overfitting metric"""
580
+ overfitting_differences = train_scores - test_scores
581
+
582
+ bootstrap_diffs = []
583
+ for _ in range(self.n_bootstrap):
584
+ indices = np.random.choice(len(overfitting_differences), size=len(overfitting_differences), replace=True)
585
+ bootstrap_diffs.append(np.mean(overfitting_differences[indices]))
586
+
587
+ alpha = 1 - self.confidence_level
588
+ ci_lower = np.percentile(bootstrap_diffs, (alpha / 2) * 100)
589
+ ci_upper = np.percentile(bootstrap_diffs, (1 - alpha / 2) * 100)
590
+
591
+ return {
592
+ 'lower': float(ci_lower),
593
+ 'upper': float(ci_upper),
594
+ 'confidence_level': self.confidence_level
595
+ }
596
+
597
+ def _interpret_stability(self, cv_coefficient: float) -> str:
598
+ """Interpret CV stability based on coefficient of variation"""
599
+ if cv_coefficient < 0.1:
600
+ return "Very stable performance across folds"
601
+ elif cv_coefficient < 0.2:
602
+ return "Stable performance across folds"
603
+ elif cv_coefficient < 0.3:
604
+ return "Moderately stable performance across folds"
605
+ else:
606
+ return "Unstable performance across folds - consider data quality or model complexity"
607
+
608
+
609
+ class StatisticalModelComparison:
610
+ """Advanced statistical comparison between models with comprehensive uncertainty analysis"""
611
+
612
+ def __init__(self,
613
+ confidence_level: float = 0.95,
614
+ n_bootstrap: int = 1000,
615
+ random_state: int = 42):
616
+ self.confidence_level = confidence_level
617
+ self.n_bootstrap = n_bootstrap
618
+ self.random_state = random_state
619
+ self.bootstrap_analyzer = BootstrapAnalyzer(n_bootstrap, confidence_level, random_state)
620
+
621
+ if STRUCTURED_LOGGING_AVAILABLE:
622
+ self.logger = MLOpsLoggers.get_logger('model_comparison')
623
+ else:
624
+ self.logger = logging.getLogger(__name__)
625
+
626
+ def comprehensive_model_comparison(self,
627
+ models: Dict[str, Any],
628
+ X: np.ndarray,
629
+ y: np.ndarray,
630
+ metrics: Dict[str, Callable],
631
+ cv_folds: int = 5) -> Dict[str, Any]:
632
+ """
633
+ Comprehensive pairwise model comparison with statistical significance testing
634
+ """
635
+
636
+ from sklearn.model_selection import cross_val_predict, StratifiedKFold
637
+
638
+ cv_strategy = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.random_state)
639
+
640
+ # Get CV predictions for each model
641
+ model_predictions = {}
642
+ model_cv_scores = {}
643
+
644
+ for model_name, model in models.items():
645
+ # Cross-validation predictions
646
+ cv_pred = cross_val_predict(model, X, y, cv=cv_strategy, method='predict_proba')
647
+ if cv_pred.ndim == 2 and cv_pred.shape[1] == 2:
648
+ cv_pred = cv_pred[:, 1] # Binary classification probabilities
649
+
650
+ model_predictions[model_name] = cv_pred
651
+
652
+ # Calculate CV scores for each metric
653
+ model_cv_scores[model_name] = {}
654
+ for metric_name, metric_func in metrics.items():
655
+ try:
656
+ if 'roc_auc' in metric_name.lower():
657
+ scores = [metric_func(y[test], cv_pred[test]) for train, test in cv_strategy.split(X, y)]
658
+ else:
659
+ pred_labels = (cv_pred > 0.5).astype(int)
660
+ scores = [metric_func(y[test], pred_labels[test]) for train, test in cv_strategy.split(X, y)]
661
+
662
+ model_cv_scores[model_name][metric_name] = np.array(scores)
663
+ except Exception as e:
664
+ self.logger.warning(f"Failed to calculate {metric_name} for {model_name}: {e}")
665
+
666
+ # Pairwise comparisons
667
+ comparison_results = {}
668
+ model_names = list(models.keys())
669
+
670
+ for i, model1_name in enumerate(model_names):
671
+ for j, model2_name in enumerate(model_names[i+1:], i+1):
672
+ comparison_key = f"{model1_name}_vs_{model2_name}"
673
+
674
+ comparison_results[comparison_key] = self._pairwise_comparison(
675
+ model1_name, model2_name,
676
+ model_cv_scores[model1_name],
677
+ model_cv_scores[model2_name],
678
+ model_predictions[model1_name],
679
+ model_predictions[model2_name],
680
+ y, metrics
681
+ )
682
+
683
+ # Overall ranking
684
+ ranking = self._rank_models(model_cv_scores, primary_metric='f1')
685
+
686
+ return {
687
+ 'individual_model_results': model_cv_scores,
688
+ 'pairwise_comparisons': comparison_results,
689
+ 'model_ranking': ranking,
690
+ 'analysis_metadata': {
691
+ 'cv_folds': cv_folds,
692
+ 'confidence_level': self.confidence_level,
693
+ 'n_bootstrap': self.n_bootstrap,
694
+ 'models_compared': len(models),
695
+ 'metrics_evaluated': list(metrics.keys())
696
+ }
697
+ }
698
+
699
+ def _pairwise_comparison(self,
700
+ model1_name: str, model2_name: str,
701
+ scores1: Dict[str, np.ndarray],
702
+ scores2: Dict[str, np.ndarray],
703
+ pred1: np.ndarray, pred2: np.ndarray,
704
+ y_true: np.ndarray,
705
+ metrics: Dict[str, Callable]) -> Dict[str, Any]:
706
+ """Detailed pairwise comparison between two models"""
707
+
708
+ comparison = {
709
+ 'models': [model1_name, model2_name],
710
+ 'metric_comparisons': {},
711
+ 'overall_comparison': {}
712
+ }
713
+
714
+ significant_improvements = 0
715
+ total_comparisons = 0
716
+
717
+ # Compare each metric
718
+ for metric_name in scores1.keys():
719
+ if metric_name in scores2:
720
+ metric_comparison = self._compare_metric_scores(
721
+ scores1[metric_name], scores2[metric_name], metric_name
722
+ )
723
+
724
+ comparison['metric_comparisons'][metric_name] = metric_comparison
725
+
726
+ if metric_comparison['significant_improvement']:
727
+ significant_improvements += 1
728
+ total_comparisons += 1
729
+
730
+ # Bootstrap comparison of predictions
731
+ if len(pred1) == len(pred2) == len(y_true):
732
+ bootstrap_comparison = self._bootstrap_prediction_comparison(
733
+ y_true, pred1, pred2, metrics
734
+ )
735
+ comparison['bootstrap_prediction_comparison'] = bootstrap_comparison
736
+
737
+ # Overall decision
738
+ improvement_rate = significant_improvements / total_comparisons if total_comparisons > 0 else 0
739
+
740
+ comparison['overall_comparison'] = {
741
+ 'significant_improvements': significant_improvements,
742
+ 'total_comparisons': total_comparisons,
743
+ 'improvement_rate': float(improvement_rate),
744
+ 'recommendation': self._make_comparison_recommendation(improvement_rate, significant_improvements)
745
+ }
746
+
747
+ return comparison
748
+
749
+ def _compare_metric_scores(self, scores1: np.ndarray, scores2: np.ndarray, metric_name: str) -> Dict[str, Any]:
750
+ """Statistical comparison of metric scores between two models"""
751
+
752
+ # Basic statistics
753
+ mean1, mean2 = np.mean(scores1), np.mean(scores2)
754
+ std1, std2 = np.std(scores1), np.std(scores2)
755
+ improvement = mean2 - mean1
756
+
757
+ # Statistical tests
758
+ statistical_tests = {}
759
+
760
+ # Paired t-test
761
+ try:
762
+ t_stat, p_value = stats.ttest_rel(scores2, scores1)
763
+ statistical_tests['paired_ttest'] = {
764
+ 't_statistic': float(t_stat),
765
+ 'p_value': float(p_value),
766
+ 'significant': p_value < 0.05,
767
+ 'effect_direction': 'improvement' if t_stat > 0 else 'degradation'
768
+ }
769
+ except Exception as e:
770
+ statistical_tests['paired_ttest'] = {'error': str(e)}
771
+
772
+ # Wilcoxon signed-rank test (non-parametric)
773
+ try:
774
+ w_stat, w_p = stats.wilcoxon(scores2, scores1, alternative='two-sided')
775
+ statistical_tests['wilcoxon'] = {
776
+ 'statistic': float(w_stat),
777
+ 'p_value': float(w_p),
778
+ 'significant': w_p < 0.05
779
+ }
780
+ except Exception as e:
781
+ statistical_tests['wilcoxon'] = {'error': str(e)}
782
+
783
+ # Bootstrap confidence interval for difference
784
+ bootstrap_diffs = []
785
+ for _ in range(200): # Reduced for performance
786
+ indices = np.random.choice(len(scores1), size=len(scores1), replace=True)
787
+ diff = np.mean(scores2[indices]) - np.mean(scores1[indices])
788
+ bootstrap_diffs.append(diff)
789
+
790
+ alpha = 1 - self.confidence_level
791
+ ci_lower = np.percentile(bootstrap_diffs, (alpha / 2) * 100)
792
+ ci_upper = np.percentile(bootstrap_diffs, (1 - alpha / 2) * 100)
793
+
794
+ # Effect size (Cohen's d)
795
+ pooled_std = np.sqrt((std1**2 + std2**2) / 2)
796
+ cohens_d = improvement / pooled_std if pooled_std > 0 else 0
797
+
798
+ return {
799
+ 'metric_name': metric_name,
800
+ 'mean_scores': {'model1': float(mean1), 'model2': float(mean2)},
801
+ 'improvement': float(improvement),
802
+ 'relative_improvement_percent': float((improvement / mean1) * 100) if mean1 > 0 else 0,
803
+ 'confidence_interval': {'lower': float(ci_lower), 'upper': float(ci_upper)},
804
+ 'effect_size_cohens_d': float(cohens_d),
805
+ 'statistical_tests': statistical_tests,
806
+ 'significant_improvement': improvement > 0 and ci_lower > 0,
807
+ 'interpretation': self._interpret_effect_size(cohens_d)
808
+ }
809
+
810
+ def _bootstrap_prediction_comparison(self, y_true: np.ndarray, pred1: np.ndarray, pred2: np.ndarray, metrics: Dict[str, Callable]) -> Dict[str, Any]:
811
+ """Bootstrap comparison of model predictions"""
812
+
813
+ bootstrap_results = {}
814
+
815
+ for metric_name, metric_func in metrics.items():
816
+ try:
817
+ # For probabilistic metrics, use probabilities directly
818
+ if 'roc_auc' in metric_name.lower():
819
+ comparison = self.bootstrap_analyzer.bootstrap_model_comparison(
820
+ y_true, pred1, pred2, metric_func, "Model1", "Model2"
821
+ )
822
+ else:
823
+ # For classification metrics, convert to class predictions
824
+ pred1_class = (pred1 > 0.5).astype(int)
825
+ pred2_class = (pred2 > 0.5).astype(int)
826
+ comparison = self.bootstrap_analyzer.bootstrap_model_comparison(
827
+ y_true, pred1_class, pred2_class, metric_func, "Model1", "Model2"
828
+ )
829
+
830
+ bootstrap_results[metric_name] = comparison
831
+
832
+ except Exception as e:
833
+ bootstrap_results[metric_name] = {'error': str(e)}
834
+
835
+ return bootstrap_results
836
+
837
+ def _interpret_effect_size(self, cohens_d: float) -> str:
838
+ """Interpret Cohen's d effect size"""
839
+ abs_d = abs(cohens_d)
840
+ if abs_d < 0.2:
841
+ return "Negligible effect"
842
+ elif abs_d < 0.5:
843
+ return "Small effect"
844
+ elif abs_d < 0.8:
845
+ return "Medium effect"
846
+ else:
847
+ return "Large effect"
848
+
849
+ def _make_comparison_recommendation(self, improvement_rate: float, significant_improvements: int) -> str:
850
+ """Make recommendation based on comparison results"""
851
+ if improvement_rate >= 0.75 and significant_improvements >= 2:
852
+ return "Strong recommendation for model upgrade"
853
+ elif improvement_rate >= 0.5 and significant_improvements >= 1:
854
+ return "Moderate recommendation for model upgrade"
855
+ elif improvement_rate > 0:
856
+ return "Weak recommendation for model upgrade - consider other factors"
857
+ else:
858
+ return "No recommendation for model upgrade"
859
+
860
+ def _rank_models(self, model_cv_scores: Dict[str, Dict[str, np.ndarray]], primary_metric: str = 'f1') -> Dict[str, Any]:
861
+ """Rank models based on CV performance with statistical significance"""
862
+
863
+ # Calculate mean scores for primary metric
864
+ model_means = {}
865
+ for model_name, scores in model_cv_scores.items():
866
+ if primary_metric in scores:
867
+ model_means[model_name] = np.mean(scores[primary_metric])
868
+
869
+ # Sort by mean performance
870
+ sorted_models = sorted(model_means.items(), key=lambda x: x[1], reverse=True)
871
+
872
+ # Statistical significance testing for ranking
873
+ ranking_with_significance = []
874
+ for i, (model_name, mean_score) in enumerate(sorted_models):
875
+ rank_info = {
876
+ 'rank': i + 1,
877
+ 'model_name': model_name,
878
+ 'mean_score': float(mean_score),
879
+ 'significantly_better_than': []
880
+ }
881
+
882
+ # Compare with lower-ranked models
883
+ for j, (other_model, other_score) in enumerate(sorted_models[i+1:], i+1):
884
+ try:
885
+ t_stat, p_value = stats.ttest_rel(
886
+ model_cv_scores[model_name][primary_metric],
887
+ model_cv_scores[other_model][primary_metric]
888
+ )
889
+
890
+ if p_value < 0.05 and t_stat > 0:
891
+ rank_info['significantly_better_than'].append({
892
+ 'model': other_model,
893
+ 'p_value': float(p_value),
894
+ 'rank': j + 1
895
+ })
896
+ except Exception:
897
+ continue
898
+
899
+ ranking_with_significance.append(rank_info)
900
+
901
+ return {
902
+ 'ranking': ranking_with_significance,
903
+ 'primary_metric': primary_metric,
904
+ 'ranking_method': 'mean_cv_score_with_significance_testing'
905
+ }
906
+
907
+
908
+ # Integration utilities for existing codebase
909
+ class MLOpsStatisticalAnalyzer:
910
+ """Comprehensive statistical analyzer for MLOps pipeline"""
911
+
912
+ def __init__(self,
913
+ confidence_level: float = 0.95,
914
+ n_bootstrap: int = 1000,
915
+ random_state: int = 42):
916
+
917
+ self.confidence_level = confidence_level
918
+ self.n_bootstrap = n_bootstrap
919
+ self.random_state = random_state
920
+
921
+ # Initialize analyzers
922
+ self.bootstrap_analyzer = BootstrapAnalyzer(n_bootstrap, confidence_level, random_state)
923
+ self.feature_analyzer = FeatureImportanceAnalyzer(n_bootstrap, confidence_level, random_state)
924
+ self.cv_analyzer = AdvancedCrossValidation(5, n_bootstrap, confidence_level, random_state)
925
+ self.comparison_analyzer = StatisticalModelComparison(confidence_level, n_bootstrap, random_state)
926
+
927
+ if STRUCTURED_LOGGING_AVAILABLE:
928
+ self.logger = MLOpsLoggers.get_logger('statistical_analyzer')
929
+ else:
930
+ self.logger = logging.getLogger(__name__)
931
+
932
+ def comprehensive_model_analysis(self,
933
+ models: Dict[str, Any],
934
+ X_train: np.ndarray,
935
+ X_test: np.ndarray,
936
+ y_train: np.ndarray,
937
+ y_test: np.ndarray,
938
+ feature_names: List[str] = None) -> Dict[str, Any]:
939
+ """
940
+ Perform comprehensive statistical analysis of models including:
941
+ - Bootstrap confidence intervals for performance metrics
942
+ - Feature importance stability analysis
943
+ - Advanced cross-validation with statistical testing
944
+ - Pairwise model comparisons with significance testing
945
+ """
946
+
947
+ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
948
+
949
+ # Define metrics
950
+ def accuracy_func(y_true, y_pred): return accuracy_score(y_true, y_pred)
951
+ def f1_func(y_true, y_pred): return f1_score(y_true, y_pred, average='weighted')
952
+ def precision_func(y_true, y_pred): return precision_score(y_true, y_pred, average='weighted')
953
+ def recall_func(y_true, y_pred): return recall_score(y_true, y_pred, average='weighted')
954
+ def roc_auc_func(y_true, y_pred_proba): return roc_auc_score(y_true, y_pred_proba)
955
+
956
+ metrics = {
957
+ 'accuracy': accuracy_func,
958
+ 'f1': f1_func,
959
+ 'precision': precision_func,
960
+ 'recall': recall_func,
961
+ 'roc_auc': roc_auc_func
962
+ }
963
+
964
+ analysis_results = {
965
+ 'analysis_timestamp': datetime.now().isoformat(),
966
+ 'configuration': {
967
+ 'confidence_level': self.confidence_level,
968
+ 'n_bootstrap': self.n_bootstrap,
969
+ 'models_analyzed': list(models.keys())
970
+ },
971
+ 'individual_model_analysis': {},
972
+ 'comparative_analysis': {},
973
+ 'feature_importance_analysis': {},
974
+ 'recommendations': []
975
+ }
976
+
977
+ # Individual model analysis
978
+ for model_name, model in models.items():
979
+ try:
980
+ # Fit model
981
+ model.fit(X_train, y_train)
982
+
983
+ # Get predictions
984
+ y_pred = model.predict(X_test)
985
+ y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else y_pred
986
+
987
+ # Bootstrap analysis for each metric
988
+ bootstrap_results = {}
989
+ for metric_name, metric_func in metrics.items():
990
+ if metric_name == 'roc_auc':
991
+ result = self.bootstrap_analyzer.bootstrap_metric(
992
+ y_test, y_pred_proba, metric_func
993
+ )
994
+ else:
995
+ result = self.bootstrap_analyzer.bootstrap_metric(
996
+ y_test, y_pred, metric_func
997
+ )
998
+ bootstrap_results[metric_name] = result.to_dict()
999
+
1000
+ # Cross-validation analysis
1001
+ cv_analysis = self.cv_analyzer.comprehensive_cv_analysis(
1002
+ model, X_train, y_train, metrics
1003
+ )
1004
+
1005
+ # Feature importance analysis (if supported)
1006
+ feature_analysis = {}
1007
+ if hasattr(model, 'feature_importances_') or hasattr(model, 'coef_'):
1008
+ try:
1009
+ feature_analysis = self.feature_analyzer.analyze_importance_stability(
1010
+ model, X_train, y_train, feature_names
1011
+ )
1012
+ except Exception as e:
1013
+ feature_analysis = {'error': str(e)}
1014
+
1015
+ analysis_results['individual_model_analysis'][model_name] = {
1016
+ 'bootstrap_metrics': bootstrap_results,
1017
+ 'cross_validation_analysis': cv_analysis,
1018
+ 'feature_importance_analysis': feature_analysis
1019
+ }
1020
+
1021
+ except Exception as e:
1022
+ self.logger.error(f"Analysis failed for model {model_name}: {e}")
1023
+ analysis_results['individual_model_analysis'][model_name] = {'error': str(e)}
1024
+
1025
+ # Comparative analysis
1026
+ if len(models) > 1:
1027
+ try:
1028
+ comparative_results = self.comparison_analyzer.comprehensive_model_comparison(
1029
+ models, X_train, y_train, metrics
1030
+ )
1031
+ analysis_results['comparative_analysis'] = comparative_results
1032
+
1033
+ # Generate recommendations based on comparison
1034
+ recommendations = self._generate_analysis_recommendations(analysis_results)
1035
+ analysis_results['recommendations'].extend(recommendations)
1036
+
1037
+ except Exception as e:
1038
+ analysis_results['comparative_analysis'] = {'error': str(e)}
1039
+
1040
+ return analysis_results
1041
+
1042
+ def _generate_analysis_recommendations(self, analysis_results: Dict[str, Any]) -> List[Dict[str, str]]:
1043
+ """Generate actionable recommendations based on statistical analysis"""
1044
+ recommendations = []
1045
+
1046
+ # Model ranking recommendations
1047
+ if 'model_ranking' in analysis_results.get('comparative_analysis', {}):
1048
+ ranking = analysis_results['comparative_analysis']['model_ranking']['ranking']
1049
+ if len(ranking) > 0:
1050
+ best_model = ranking[0]
1051
+ significantly_better_count = len(best_model.get('significantly_better_than', []))
1052
+
1053
+ if significantly_better_count > 0:
1054
+ recommendations.append({
1055
+ 'type': 'model_selection',
1056
+ 'priority': 'high',
1057
+ 'message': f"Model '{best_model['model_name']}' shows statistically significant improvement over {significantly_better_count} other model(s)",
1058
+ 'action': f"Consider promoting {best_model['model_name']} to production"
1059
+ })
1060
+
1061
+ # Feature importance recommendations
1062
+ for model_name, analysis in analysis_results.get('individual_model_analysis', {}).items():
1063
+ feature_analysis = analysis.get('feature_importance_analysis', {})
1064
+ if 'stability_ranking' in feature_analysis:
1065
+ unstable_features = [
1066
+ name for name, stats in feature_analysis['feature_importance_analysis'].items()
1067
+ if stats['metadata']['coefficient_of_variation'] > 0.5
1068
+ ]
1069
+
1070
+ if unstable_features:
1071
+ recommendations.append({
1072
+ 'type': 'feature_engineering',
1073
+ 'priority': 'medium',
1074
+ 'message': f"Model '{model_name}' has {len(unstable_features)} unstable features with high variance",
1075
+ 'action': "Review feature engineering process and consider feature selection"
1076
+ })
1077
+
1078
+ # Cross-validation recommendations
1079
+ for model_name, analysis in analysis_results.get('individual_model_analysis', {}).items():
1080
+ cv_analysis = analysis.get('cross_validation_analysis', {})
1081
+ for metric_name, metric_analysis in cv_analysis.get('metrics_analysis', {}).items():
1082
+ overfitting_analysis = metric_analysis.get('overfitting_analysis', {})
1083
+ if overfitting_analysis.get('overfitting_score', 0) > 0.1: # 10% overfitting threshold
1084
+ recommendations.append({
1085
+ 'type': 'model_complexity',
1086
+ 'priority': 'medium',
1087
+ 'message': f"Model '{model_name}' shows significant overfitting in {metric_name}",
1088
+ 'action': "Consider regularization or reducing model complexity"
1089
+ })
1090
+
1091
+ return recommendations
1092
+
1093
+ def save_analysis_report(self, analysis_results: Dict[str, Any], file_path: Path = None):
1094
+ """Save comprehensive analysis report"""
1095
+ if file_path is None:
1096
+ file_path = Path("/tmp/logs/statistical_analysis_report.json")
1097
+
1098
+ file_path.parent.mkdir(parents=True, exist_ok=True)
1099
+
1100
+ with open(file_path, 'w') as f:
1101
+ json.dump(analysis_results, f, indent=2, default=str)
1102
+
1103
+ self.logger.info(f"Statistical analysis report saved to {file_path}")
1104
+ return file_path
1105
+
1106
+
1107
+ # Integration functions for existing codebase
1108
+ def integrate_statistical_analysis_with_retrain():
1109
+ """Integration example for retrain.py"""
1110
+ analyzer = MLOpsStatisticalAnalyzer()
1111
+
1112
+ # Example usage in retraining context
1113
+ def enhanced_model_comparison(models_dict, X_train, X_test, y_train, y_test):
1114
+ """Enhanced model comparison with comprehensive statistical analysis"""
1115
+
1116
+ analysis_results = analyzer.comprehensive_model_analysis(
1117
+ models_dict, X_train, X_test, y_train, y_test
1118
+ )
1119
+
1120
+ # Extract promotion decision based on statistical significance
1121
+ comparative_analysis = analysis_results.get('comparative_analysis', {})
1122
+ ranking = comparative_analysis.get('model_ranking', {}).get('ranking', [])
1123
+
1124
+ if ranking:
1125
+ best_model = ranking[0]
1126
+ promotion_confidence = len(best_model.get('significantly_better_than', [])) / (len(ranking) - 1) if len(ranking) > 1 else 1.0
1127
+
1128
+ return {
1129
+ 'recommended_model': best_model['model_name'],
1130
+ 'statistical_confidence': promotion_confidence,
1131
+ 'analysis_results': analysis_results,
1132
+ 'promote_candidate': promotion_confidence > 0.5
1133
+ }
1134
+
1135
+ return {'error': 'No valid model ranking available'}
1136
+
1137
+ return enhanced_model_comparison
1138
+
1139
+ def integrate_statistical_analysis_with_train():
1140
+ """Integration example for train.py"""
1141
+ analyzer = MLOpsStatisticalAnalyzer()
1142
+
1143
+ def enhanced_ensemble_validation(individual_models, ensemble_model, X, y):
1144
+ """Enhanced ensemble validation with bootstrap confidence intervals"""
1145
+
1146
+ models_to_compare = {**individual_models, 'ensemble': ensemble_model}
1147
+
1148
+ # Perform comprehensive statistical analysis
1149
+ from sklearn.model_selection import train_test_split
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
1150
+
1151
+ analysis_results = analyzer.comprehensive_model_analysis(
1152
+ models_to_compare, X_train, X_test, y_train, y_test
1153
+ )
1154
+
1155
+ # Check if ensemble is statistically significantly better
1156
+ comparative_analysis = analysis_results.get('comparative_analysis', {})
1157
+ ensemble_comparisons = {
1158
+ k: v for k, v in comparative_analysis.get('pairwise_comparisons', {}).items()
1159
+ if 'ensemble' in k
1160
+ }
1161
+
1162
+ significant_improvements = 0
1163
+ total_comparisons = len(ensemble_comparisons)
1164
+
1165
+ for comparison in ensemble_comparisons.values():
1166
+ if comparison.get('overall_comparison', {}).get('improvement_rate', 0) > 0.5:
1167
+ significant_improvements += 1
1168
+
1169
+ ensemble_confidence = significant_improvements / total_comparisons if total_comparisons > 0 else 0
1170
+
1171
+ return {
1172
+ 'use_ensemble': ensemble_confidence > 0.5,
1173
+ 'ensemble_confidence': ensemble_confidence,
1174
+ 'statistical_analysis': analysis_results
1175
+ }
1176
+
1177
+ return enhanced_ensemble_validation
1178
+
1179
+
1180
+ if __name__ == "__main__":
1181
+ # Example usage and testing
1182
+ print("Testing advanced statistical analysis system...")
1183
+
1184
+ # Generate sample data for testing
1185
+ np.random.seed(42)
1186
+ X = np.random.randn(200, 10)
1187
+ y = (X[:, 0] + X[:, 1] + np.random.randn(200) * 0.1 > 0).astype(int)
1188
+
1189
+ # Create sample models
1190
+ from sklearn.linear_model import LogisticRegression
1191
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.model_selection import train_test_split
1192
+
1193
+ models = {
1194
+ 'logistic_regression': LogisticRegression(random_state=42),
1195
+ 'random_forest': RandomForestClassifier(n_estimators=50, random_state=42)
1196
+ }
1197
+
1198
+ # Test comprehensive analysis
1199
+ analyzer = MLOpsStatisticalAnalyzer(n_bootstrap=100) # Reduced for testing
1200
+
1201
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
1202
+
1203
+ print("Running comprehensive statistical analysis...")
1204
+ results = analyzer.comprehensive_model_analysis(
1205
+ models, X_train, X_test, y_train, y_test
1206
+ )
1207
+
1208
+ print(f"Analysis completed for {len(models)} models")
1209
+ print(f"Generated {len(results['recommendations'])} recommendations")
1210
+
1211
+ # Test bootstrap analysis
1212
+ bootstrap_analyzer = BootstrapAnalyzer(n_bootstrap=100)
1213
+
1214
+ from sklearn.metrics import f1_score
1215
+ def f1_metric(y_true, y_pred):
1216
+ return f1_score(y_true, y_pred, average='weighted')
1217
+
1218
+ model = LogisticRegression(random_state=42)
1219
+ model.fit(X_train, y_train)
1220
+ y_pred = model.predict(X_test)
1221
+
1222
+ bootstrap_result = bootstrap_analyzer.bootstrap_metric(y_test, y_pred, f1_metric)
1223
+ print(f"Bootstrap F1 confidence interval: {bootstrap_result.confidence_interval}")
1224
+
1225
+ print("Advanced statistical analysis system test completed successfully!")