Ahmedik95316 committed
Commit aff6c9c · 1 Parent(s): 620e5bd

Update monitor/monitor_drift.py


Adding Automated Retraining Triggers

Files changed (1):
  1. monitor/monitor_drift.py  +484 -0
monitor/monitor_drift.py CHANGED
@@ -667,6 +667,490 @@ class AdvancedDriftMonitor:
            logger.error(f"Drift monitoring failed: {e}")
            return None

+    def setup_automation_config(self):
+        """Setup automation-specific configuration"""
+        self.automation_config = {
+            'retraining_thresholds': {
+                'drift_score': 0.2,
+                'consecutive_detections': 3,
+                'performance_drop': 0.05,
+                'data_volume_threshold': 1000,
+                'time_since_last_training': timedelta(days=7)
+            },
+            'monitoring_schedule': {
+                'check_interval': timedelta(hours=6),
+                'force_check_interval': timedelta(days=1),
+                'max_monitoring_failures': 5
+            },
+            'emergency_thresholds': {
+                'critical_drift_score': 0.4,
+                'critical_performance_drop': 0.15,
+                'emergency_action_required': True
+            },
+            'data_quality_thresholds': {
+                'min_samples_for_detection': 100,
+                'min_samples_for_retraining': 500,
+                'data_freshness_hours': 24
+            }
+        }
+
+    def check_retraining_triggers(self, drift_results: Optional[Dict] = None) -> Dict:
+        """Check if retraining should be triggered based on multiple criteria"""
+        try:
+            trigger_results = {
+                'should_retrain': False,
+                'trigger_reason': None,
+                'urgency': 'none',
+                'triggers_detected': [],
+                'data_quality_check': {},
+                'recommendations': []
+            }
+
+            # Perform drift monitoring if results were not provided
+            if drift_results is None:
+                reference_df, current_df = self.load_and_prepare_data()
+                if reference_df is None or current_df is None:
+                    trigger_results['trigger_reason'] = 'insufficient_data'
+                    return trigger_results
+
+                drift_results = self.comprehensive_drift_detection(reference_df, current_df)
+                if 'error' in drift_results:
+                    trigger_results['trigger_reason'] = f"drift_detection_error: {drift_results['error']}"
+                    return trigger_results
+
+            # Check drift-based triggers
+            drift_triggers = self.check_drift_triggers(drift_results)
+            trigger_results['triggers_detected'].extend(drift_triggers)
+
+            # Check data volume triggers
+            volume_triggers = self.check_data_volume_triggers()
+            trigger_results['triggers_detected'].extend(volume_triggers)
+
+            # Check time-based triggers
+            time_triggers = self.check_time_based_triggers()
+            trigger_results['triggers_detected'].extend(time_triggers)
+
+            # Check data quality
+            trigger_results['data_quality_check'] = self.check_data_quality()
+
+            # Determine if retraining should be triggered
+            trigger_results = self.evaluate_retraining_decision(trigger_results, drift_results)
+
+            # Save trigger evaluation
+            self.save_trigger_evaluation(trigger_results)
+
+            return trigger_results
+
+        except Exception as e:
+            logger.error(f"Retraining trigger check failed: {e}")
+            return {
+                'should_retrain': False,
+                'trigger_reason': f'trigger_check_error: {str(e)}',
+                'urgency': 'none',
+                'triggers_detected': [],
+                'error': str(e)
+            }
+
+    def check_drift_triggers(self, drift_results: Dict) -> List[Dict]:
+        """Check drift-based retraining triggers"""
+        triggers = []
+
+        # Overall drift score trigger
+        overall_score = drift_results.get('overall_drift_score', 0)
+        if overall_score > self.automation_config['retraining_thresholds']['drift_score']:
+            triggers.append({
+                'type': 'drift_score',
+                'severity': 'high' if overall_score > self.automation_config['emergency_thresholds']['critical_drift_score'] else 'medium',
+                'value': overall_score,
+                'threshold': self.automation_config['retraining_thresholds']['drift_score'],
+                'message': f"Drift score {overall_score:.3f} exceeds threshold {self.automation_config['retraining_thresholds']['drift_score']}"
+            })
+
+        # Performance degradation trigger
+        perf_results = drift_results.get('individual_methods', {}).get('performance_drift', {})
+        if 'performance_drop' in perf_results:
+            perf_drop = perf_results['performance_drop']
+            if perf_drop > self.automation_config['retraining_thresholds']['performance_drop']:
+                triggers.append({
+                    'type': 'performance_degradation',
+                    'severity': 'critical' if perf_drop > self.automation_config['emergency_thresholds']['critical_performance_drop'] else 'high',
+                    'value': perf_drop,
+                    'threshold': self.automation_config['retraining_thresholds']['performance_drop'],
+                    'message': f"Performance drop {perf_drop:.3f} exceeds threshold"
+                })
+
+        # Consecutive detection trigger
+        consecutive_detections = self.count_consecutive_drift_detections()
+        if consecutive_detections >= self.automation_config['retraining_thresholds']['consecutive_detections']:
+            triggers.append({
+                'type': 'consecutive_detections',
+                'severity': 'medium',
+                'value': consecutive_detections,
+                'threshold': self.automation_config['retraining_thresholds']['consecutive_detections'],
+                'message': f"Drift detected in {consecutive_detections} consecutive monitoring cycles"
+            })
+
+        return triggers
+
+    def check_data_volume_triggers(self) -> List[Dict]:
+        """Check data volume-based triggers"""
+        triggers = []
+
+        try:
+            # Count new data since last training
+            new_data_count = self.count_new_data_since_training()
+
+            if new_data_count >= self.automation_config['retraining_thresholds']['data_volume_threshold']:
+                triggers.append({
+                    'type': 'data_volume',
+                    'severity': 'low',
+                    'value': new_data_count,
+                    'threshold': self.automation_config['retraining_thresholds']['data_volume_threshold'],
+                    'message': f"Accumulated {new_data_count} new samples since last training"
+                })
+
+            return triggers
+
+        except Exception as e:
+            logger.warning(f"Data volume trigger check failed: {e}")
+            return []
+
+    def check_time_based_triggers(self) -> List[Dict]:
+        """Check time-based retraining triggers"""
+        triggers = []
+
+        try:
+            # Get last training time
+            last_training_time = self.get_last_training_time()
+
+            if last_training_time:
+                time_since_training = datetime.now() - last_training_time
+                threshold = self.automation_config['retraining_thresholds']['time_since_last_training']
+
+                if time_since_training > threshold:
+                    triggers.append({
+                        'type': 'time_since_training',
+                        'severity': 'low',
+                        'value': time_since_training.days,
+                        'threshold': threshold.days,
+                        'message': f"Last training was {time_since_training.days} days ago"
+                    })
+
+            return triggers
+
+        except Exception as e:
+            logger.warning(f"Time-based trigger check failed: {e}")
+            return []
+
+    def check_data_quality(self) -> Dict:
+        """Check data quality for retraining"""
+        quality_check = {
+            'sufficient_data': False,
+            'data_freshness': False,
+            'data_balance': False,
+            'overall_quality': 'poor',
+            'issues': []
+        }
+
+        try:
+            # Load current data
+            _, current_df = self.load_and_prepare_data()
+
+            if current_df is None or len(current_df) == 0:
+                quality_check['issues'].append('No current data available')
+                return quality_check
+
+            # Check data volume
+            min_samples = self.automation_config['data_quality_thresholds']['min_samples_for_retraining']
+            if len(current_df) >= min_samples:
+                quality_check['sufficient_data'] = True
+            else:
+                quality_check['issues'].append(f'Insufficient data: {len(current_df)} < {min_samples}')
+
+            # Check data freshness
+            if 'timestamp' in current_df.columns:
+                try:
+                    current_df['timestamp'] = pd.to_datetime(current_df['timestamp'])
+                    latest_data = current_df['timestamp'].max()
+                    freshness_threshold = datetime.now() - timedelta(
+                        hours=self.automation_config['data_quality_thresholds']['data_freshness_hours']
+                    )
+
+                    if latest_data > freshness_threshold:
+                        quality_check['data_freshness'] = True
+                    else:
+                        quality_check['issues'].append('Data is not fresh enough')
+                except Exception:
+                    quality_check['issues'].append('Cannot determine data freshness')
+
+            # Check data balance if labels available
+            if 'label' in current_df.columns:
+                label_counts = current_df['label'].value_counts()
+                if len(label_counts) > 1:
+                    balance_ratio = label_counts.min() / label_counts.max()
+                    if balance_ratio > 0.3:  # At least 30% minority class
+                        quality_check['data_balance'] = True
+                    else:
+                        quality_check['issues'].append(f'Data imbalance: ratio {balance_ratio:.2f}')
+
+            # Overall quality assessment
+            quality_score = sum([
+                quality_check['sufficient_data'],
+                quality_check['data_freshness'],
+                quality_check['data_balance']
+            ])
+
+            if quality_score >= 3:
+                quality_check['overall_quality'] = 'excellent'
+            elif quality_score >= 2:
+                quality_check['overall_quality'] = 'good'
+            elif quality_score >= 1:
+                quality_check['overall_quality'] = 'fair'
+            else:
+                quality_check['overall_quality'] = 'poor'
+
+            return quality_check
+
+        except Exception as e:
+            logger.error(f"Data quality check failed: {e}")
+            quality_check['issues'].append(f'Quality check error: {str(e)}')
+            return quality_check
+
+    def evaluate_retraining_decision(self, trigger_results: Dict, drift_results: Dict) -> Dict:
+        """Evaluate whether retraining should be triggered"""
+
+        triggers = trigger_results['triggers_detected']
+        data_quality = trigger_results['data_quality_check']
+
+        # Count trigger types and severities
+        critical_triggers = [t for t in triggers if t['severity'] == 'critical']
+        high_triggers = [t for t in triggers if t['severity'] == 'high']
+        medium_triggers = [t for t in triggers if t['severity'] == 'medium']
+
+        # Decision logic
+        should_retrain = False
+        urgency = 'none'
+        reason = None
+        recommendations = []
+
+        # Critical triggers - immediate retraining
+        if critical_triggers:
+            should_retrain = True
+            urgency = 'critical'
+            reason = f"Critical triggers detected: {[t['type'] for t in critical_triggers]}"
+            recommendations.extend([
+                "URGENT: Critical model degradation detected",
+                "Stop current model serving if possible",
+                "Initiate emergency retraining immediately"
+            ])
+
+        # High priority triggers - urgent retraining
+        elif high_triggers:
+            if data_quality['overall_quality'] in ['good', 'excellent']:
+                should_retrain = True
+                urgency = 'high'
+                reason = f"High priority triggers with good data quality: {[t['type'] for t in high_triggers]}"
+                recommendations.extend([
+                    "High priority retraining recommended",
+                    "Schedule retraining within 24 hours"
+                ])
+            else:
+                recommendations.extend([
+                    "High priority triggers detected but data quality insufficient",
+                    "Improve data quality before retraining"
+                ])
+
+        # Medium priority triggers - scheduled retraining
+        elif len(medium_triggers) >= 2 or len(triggers) >= 3:
+            if data_quality['overall_quality'] in ['good', 'excellent', 'fair']:
+                should_retrain = True
+                urgency = 'medium'
+                reason = f"Multiple triggers detected: {[t['type'] for t in triggers]}"
+                recommendations.extend([
+                    "Multiple retraining indicators detected",
+                    "Schedule retraining within next maintenance window"
+                ])
+
+        # Single medium or low priority triggers
+        elif triggers:
+            recommendations.extend([
+                "Some retraining indicators detected",
+                "Monitor closely and prepare for retraining",
+                f"Triggers: {[t['type'] for t in triggers]}"
+            ])
+
+        # Update results
+        trigger_results.update({
+            'should_retrain': should_retrain,
+            'urgency': urgency,
+            'trigger_reason': reason,
+            'recommendations': recommendations
+        })
+
+        return trigger_results
+
+    def count_consecutive_drift_detections(self) -> int:
+        """Count consecutive drift detections from historical data"""
+        try:
+            if not self.drift_log_path.exists():
+                return 0
+
+            with open(self.drift_log_path, 'r') as f:
+                logs = json.load(f)
+
+            if not logs:
+                return 0
+
+            # Sort by timestamp and count consecutive detections
+            logs_sorted = sorted(logs, key=lambda x: x.get('timestamp', ''))
+            consecutive_count = 0
+
+            for log_entry in reversed(logs_sorted[-10:]):  # Check last 10 entries
+                if log_entry.get('overall_drift_detected', False):
+                    consecutive_count += 1
+                else:
+                    break
+
+            return consecutive_count
+
+        except Exception as e:
+            logger.warning(f"Failed to count consecutive detections: {e}")
+            return 0
+
+    def count_new_data_since_training(self) -> int:
+        """Count new data samples since last training"""
+        try:
+            last_training_time = self.get_last_training_time()
+            if not last_training_time:
+                return 0
+
+            # Count data from current sources
+            total_count = 0
+
+            for data_path in [self.current_data_path, self.generated_data_path]:
+                if data_path.exists():
+                    df = pd.read_csv(data_path)
+                    if 'timestamp' in df.columns:
+                        df['timestamp'] = pd.to_datetime(df['timestamp'])
+                        new_data = df[df['timestamp'] > last_training_time]
+                        total_count += len(new_data)
+                    else:
+                        # If no timestamp, assume all data is new
+                        total_count += len(df)
+
+            return total_count
+
+        except Exception as e:
+            logger.warning(f"Failed to count new data: {e}")
+            return 0
+
+    def get_last_training_time(self) -> Optional[datetime]:
+        """Get timestamp of last model training"""
+        try:
+            # Check model metadata
+            metadata_path = self.model_dir / "metadata.json"
+            if metadata_path.exists():
+                with open(metadata_path, 'r') as f:
+                    metadata = json.load(f)
+
+                timestamp_str = metadata.get('timestamp')
+                if timestamp_str:
+                    # Normalize to a naive datetime so comparisons with
+                    # datetime.now() never mix aware and naive values
+                    parsed = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
+                    return parsed.replace(tzinfo=None) if parsed.tzinfo else parsed
+
+            # Fallback to model file modification time
+            for model_path in [self.pipeline_path, self.model_path]:
+                if model_path.exists():
+                    return datetime.fromtimestamp(model_path.stat().st_mtime)
+
+            return None
+
+        except Exception as e:
+            logger.warning(f"Failed to get last training time: {e}")
+            return None
+
+    def save_trigger_evaluation(self, trigger_results: Dict):
+        """Save trigger evaluation results"""
+        try:
+            trigger_log_path = self.logs_dir / "retraining_triggers.json"
+
+            # Load existing logs
+            logs = []
+            if trigger_log_path.exists():
+                try:
+                    with open(trigger_log_path, 'r') as f:
+                        logs = json.load(f)
+                except Exception:
+                    logs = []
+
+            # Add timestamp and save
+            trigger_results['evaluation_timestamp'] = datetime.now().isoformat()
+            logs.append(trigger_results)
+
+            # Keep only last 100 evaluations
+            if len(logs) > 100:
+                logs = logs[-100:]
+
+            with open(trigger_log_path, 'w') as f:
+                json.dump(logs, f, indent=2)
+
+            logger.info(f"Trigger evaluation saved to {trigger_log_path}")
+
+        except Exception as e:
+            logger.error(f"Failed to save trigger evaluation: {e}")
+
+    def get_automation_status(self) -> Dict:
+        """Get current automation status and recent trigger evaluations"""
+        try:
+            status = {
+                'automation_active': True,
+                'last_drift_check': None,
+                'last_trigger_evaluation': None,
+                'recent_triggers': [],
+                'data_quality_status': {},
+                'next_scheduled_check': None
+            }
+
+            # Get last drift check
+            if self.drift_log_path.exists():
+                try:
+                    with open(self.drift_log_path, 'r') as f:
+                        logs = json.load(f)
+                    if logs:
+                        status['last_drift_check'] = logs[-1].get('timestamp')
+                except Exception:
+                    pass
+
+            # Get recent trigger evaluations
+            trigger_log_path = self.logs_dir / "retraining_triggers.json"
+            if trigger_log_path.exists():
+                try:
+                    with open(trigger_log_path, 'r') as f:
+                        trigger_logs = json.load(f)
+
+                    if trigger_logs:
+                        status['last_trigger_evaluation'] = trigger_logs[-1].get('evaluation_timestamp')
+                        status['recent_triggers'] = trigger_logs[-5:]  # Last 5 evaluations
+                except Exception:
+                    pass
+
+            # Get current data quality
+            status['data_quality_status'] = self.check_data_quality()
+
+            return status
+
+        except Exception as e:
+            logger.error(f"Failed to get automation status: {e}")
+            return {'automation_active': False, 'error': str(e)}
+
+    # Updated __init__: wires setup_automation_config() into initialization
+    def __init__(self):
+        self.setup_paths()
+        self.setup_drift_config()
+        self.setup_automation_config()
+        self.setup_drift_methods()
+        self.historical_data = self.load_historical_data()
+
+
  def monitor_drift():
      """Main function for external calls"""
      monitor = AdvancedDriftMonitor()
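
For reference, a minimal sketch of how the trigger API added in this commit might be driven from a scheduled job (cron, CI workflow, or similar). Only AdvancedDriftMonitor, check_retraining_triggers(), and get_automation_status() come from monitor/monitor_drift.py; the import path and the run_retraining placeholder are assumptions, not part of the repository.

    # Hypothetical caller for the automation API added in this commit.
    # Assumes monitor/monitor_drift.py is importable as monitor.monitor_drift;
    # run_retraining() is a placeholder for the project's real training entry point.
    import logging

    from monitor.monitor_drift import AdvancedDriftMonitor

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)


    def run_retraining(urgency: str) -> None:
        # Placeholder: swap in the actual retraining pipeline call here.
        logger.info("Retraining requested (urgency=%s)", urgency)


    def scheduled_check() -> None:
        monitor = AdvancedDriftMonitor()

        # Evaluate all trigger types: drift, data volume, time since training, data quality.
        result = monitor.check_retraining_triggers()
        logger.info("Trigger evaluation: %s (urgency=%s)",
                    result.get('trigger_reason'), result.get('urgency'))

        if result.get('should_retrain'):
            run_retraining(result.get('urgency', 'none'))
        else:
            # Surface the current automation state for dashboards or alerts.
            status = monitor.get_automation_status()
            logger.info("No retraining needed; last drift check at %s",
                        status.get('last_drift_check'))


    if __name__ == "__main__":
        scheduled_check()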