Ahmedik95316 committed on
Commit 0616f70 · 1 Parent(s): 0d47abb

Create utils/structured_logger.py


Adding Tests for MLOps Infrastructure Enhancement

Files changed (1)
  1. utils/structured_logger.py +750 -0
utils/structured_logger.py ADDED
@@ -0,0 +1,750 @@
+ # utils/structured_logger.py
+ # Production-ready structured logging system for MLOps-grade enhancement
+
+ import functools
+ import json
+ import logging
+ import os
+ import sys
+ import threading
+ import time
+ import traceback
+ from contextlib import contextmanager
+ from dataclasses import dataclass, asdict
+ from datetime import datetime, timezone
+ from enum import Enum
+ from logging.handlers import RotatingFileHandler
+ from pathlib import Path
+ from typing import Dict, Any, Optional, Union
+
+ import psutil  # third-party; used by monitor_cpu_constraints() below
+
+
+ class LogLevel(Enum):
+     """Standardized log levels with numeric values for filtering"""
+     DEBUG = 10
+     INFO = 20
+     WARNING = 30
+     ERROR = 40
+     CRITICAL = 50
+
+
+ class EventType(Enum):
+     """Standardized event types for structured logging"""
+     # Model lifecycle events
+     MODEL_TRAINING_START = "model.training.start"
+     MODEL_TRAINING_COMPLETE = "model.training.complete"
+     MODEL_TRAINING_ERROR = "model.training.error"
+     MODEL_VALIDATION = "model.validation"
+     MODEL_PROMOTION = "model.promotion"
+     MODEL_BACKUP = "model.backup"
+
+     # Data processing events
+     DATA_LOADING = "data.loading"
+     DATA_VALIDATION = "data.validation"
+     DATA_PREPROCESSING = "data.preprocessing"
+     DATA_QUALITY_CHECK = "data.quality.check"
+
+     # Feature engineering events
+     FEATURE_EXTRACTION = "features.extraction"
+     FEATURE_SELECTION = "features.selection"
+     FEATURE_VALIDATION = "features.validation"
+
+     # Cross-validation and ensemble events
+     CROSS_VALIDATION_START = "cv.start"
+     CROSS_VALIDATION_COMPLETE = "cv.complete"
+     ENSEMBLE_CREATION = "ensemble.creation"
+     ENSEMBLE_VALIDATION = "ensemble.validation"
+     STATISTICAL_COMPARISON = "model.statistical_comparison"
+
+     # System performance events
+     PERFORMANCE_METRIC = "system.performance"
+     RESOURCE_USAGE = "system.resource_usage"
+     CPU_CONSTRAINT_WARNING = "system.cpu_constraint"
+
+     # API and application events
+     API_REQUEST = "api.request"
+     API_RESPONSE = "api.response"
+     API_ERROR = "api.error"
+     PREDICTION_REQUEST = "prediction.request"
+     PREDICTION_RESPONSE = "prediction.response"
+
+     # Monitoring and alerting events
+     DRIFT_DETECTION = "monitor.drift"
+     ALERT_TRIGGERED = "alert.triggered"
+     HEALTH_CHECK = "health.check"
+
+     # Security and access events
+     ACCESS_GRANTED = "security.access_granted"
+     ACCESS_DENIED = "security.access_denied"
+     AUTHENTICATION = "security.authentication"
+
+
+ @dataclass
+ class LogEntry:
+     """Structured log entry with standardized fields"""
+     timestamp: str
+     level: str
+     event_type: str
+     message: str
+     component: str
+     session_id: Optional[str] = None
+     trace_id: Optional[str] = None
+     user_id: Optional[str] = None
+     duration_ms: Optional[float] = None
+     metadata: Optional[Dict[str, Any]] = None
+     tags: Optional[list] = None
+     environment: str = "production"
+     version: str = "1.0"
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert log entry to dictionary for JSON serialization"""
+         return {k: v for k, v in asdict(self).items() if v is not None}
+
+     def to_json(self) -> str:
+         """Convert log entry to JSON string"""
+         return json.dumps(self.to_dict(), default=str, ensure_ascii=False)
+
+
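For orientation, a LogEntry serialized via to_json() is a single JSON line shaped roughly like the following (wrapped here for readability; all values are illustrative, and None-valued fields such as trace_id are dropped by to_dict()):

    {"timestamp": "2025-01-01T12:00:00+00:00", "level": "INFO",
     "event_type": "model.training.start", "message": "Training started",
     "component": "model_trainer", "session_id": "session_20250101_120000_12345",
     "metadata": {}, "tags": [], "environment": "local", "version": "1.0"}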
+ class StructuredLogger:
+     """Production-ready structured logger with performance monitoring"""
+
+     def __init__(self,
+                  name: str,
+                  log_level: LogLevel = LogLevel.INFO,
+                  log_file: Optional[Path] = None,
+                  max_file_size_mb: int = 100,
+                  backup_count: int = 5,
+                  enable_console: bool = True,
+                  enable_json_format: bool = True):
+
+         self.name = name
+         self.log_level = log_level
+         self.enable_json_format = enable_json_format
+         self.session_id = self._generate_session_id()
+         self._local = threading.local()
+
+         # Setup logging infrastructure
+         self.logger = logging.getLogger(name)
+         self.logger.setLevel(log_level.value)
+         self.logger.propagate = False  # don't bubble up to the root logger; avoids duplicate output
+
+         # Clear existing handlers to avoid duplicates
+         self.logger.handlers.clear()
+
+         # Setup file logging if specified
+         if log_file:
+             self._setup_file_handler(log_file, max_file_size_mb, backup_count)
+
+         # Setup console logging if enabled
+         if enable_console:
+             self._setup_console_handler()
+
+     def _generate_session_id(self) -> str:
+         """Generate unique session ID for tracking related events"""
+         timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
+         thread_id = threading.current_thread().ident
+         return f"session_{timestamp}_{thread_id}"
+
+     def _setup_file_handler(self, log_file: Path, max_size_mb: int, backup_count: int):
+         """Setup rotating file handler (RotatingFileHandler imported at module top)"""
+         log_file.parent.mkdir(parents=True, exist_ok=True)
+
+         file_handler = RotatingFileHandler(
+             log_file,
+             maxBytes=max_size_mb * 1024 * 1024,
+             backupCount=backup_count,
+             encoding='utf-8'
+         )
+
+         if self.enable_json_format:
+             file_handler.setFormatter(JsonFormatter())
+         else:
+             file_handler.setFormatter(self._create_standard_formatter())
+
+         self.logger.addHandler(file_handler)
+
+     def _setup_console_handler(self):
+         """Setup console handler with appropriate formatting"""
+         console_handler = logging.StreamHandler(sys.stdout)
+
+         if self.enable_json_format:
+             console_handler.setFormatter(JsonFormatter())
+         else:
+             console_handler.setFormatter(self._create_standard_formatter())
+
+         self.logger.addHandler(console_handler)
+
+     def _create_standard_formatter(self):
+         """Create human-readable formatter for non-JSON output"""
+         return logging.Formatter(
+             '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+             datefmt='%Y-%m-%d %H:%M:%S'
+         )
+
+     def _get_trace_id(self) -> Optional[str]:
+         """Get current trace ID from thread-local storage"""
+         return getattr(self._local, 'trace_id', None)
+
+     def set_trace_id(self, trace_id: str):
+         """Set trace ID for current thread"""
+         self._local.trace_id = trace_id
+
+     def clear_trace_id(self):
+         """Clear trace ID for current thread"""
+         if hasattr(self._local, 'trace_id'):
+             del self._local.trace_id
+
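A minimal sketch of how the thread-local trace ID above is meant to be used by a caller (hypothetical handler; not part of this file):

    logger = StructuredLogger("api_server")

    def handle_request(request_id: str):
        logger.set_trace_id(request_id)   # tag every event on this thread with the request
        try:
            logger.info(EventType.API_REQUEST, "Handling /predict")
        finally:
            logger.clear_trace_id()       # clear so the ID cannot leak into later requests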
+     def _create_log_entry(self,
+                           level: LogLevel,
+                           event_type: EventType,
+                           message: str,
+                           component: str = None,
+                           duration_ms: float = None,
+                           metadata: Dict[str, Any] = None,
+                           tags: list = None,
+                           user_id: str = None) -> LogEntry:
+         """Create structured log entry"""
+
+         return LogEntry(
+             timestamp=datetime.now(timezone.utc).isoformat(),
+             level=level.name,
+             event_type=event_type.value,
+             message=message,
+             component=component or self.name,
+             session_id=self.session_id,
+             trace_id=self._get_trace_id(),
+             user_id=user_id,
+             duration_ms=duration_ms,
+             metadata=metadata or {},
+             tags=tags or [],
+             environment=self._detect_environment(),
+             version=self._get_version()
+         )
+
+     def _detect_environment(self) -> str:
+         """Detect current environment"""
+         if any(env in str(Path.cwd()) for env in ['test', 'pytest']):
+             return 'test'
+         elif 'STREAMLIT_SERVER_PORT' in os.environ:
+             return 'streamlit'
+         elif 'SPACE_ID' in os.environ:
+             return 'huggingface_spaces'
+         elif 'DOCKER_CONTAINER' in os.environ:
+             return 'docker'
+         else:
+             return 'local'
+
+     def _get_version(self) -> str:
+         """Get application version"""
+         # Try to read from metadata or config
+         try:
+             metadata_path = Path("/tmp/metadata.json")
+             if metadata_path.exists():
+                 with open(metadata_path) as f:
+                     metadata = json.load(f)
+                 return metadata.get('model_version', '1.0')
+         except Exception:
+             pass
+         return '1.0'
+
+     def log(self,
+             level: LogLevel,
+             event_type: EventType,
+             message: str,
+             component: str = None,
+             duration_ms: float = None,
+             metadata: Dict[str, Any] = None,
+             tags: list = None,
+             user_id: str = None,
+             exc_info: bool = False):
+         """Core logging method"""
+
+         if level.value < self.log_level.value:
+             return
+
+         # Create structured log entry
+         log_entry = self._create_log_entry(
+             level=level,
+             event_type=event_type,
+             message=message,
+             component=component,
+             duration_ms=duration_ms,
+             metadata=metadata,
+             tags=tags,
+             user_id=user_id
+         )
+
+         # Add exception information if requested (capture exc_info once)
+         if exc_info:
+             exc_type, exc_value, _ = sys.exc_info()
+             log_entry.metadata['exception'] = {
+                 'type': exc_type.__name__ if exc_type else None,
+                 'message': str(exc_value) if exc_value else None,
+                 'traceback': traceback.format_exc() if exc_type else None
+             }
+
+         # Log using Python's logging framework
+         self.logger.log(
+             level.value,
+             log_entry.to_json() if self.enable_json_format else message,
+             extra={'log_entry': log_entry}
+         )
+
+     # Convenience methods for different log levels
+     def debug(self, event_type: EventType, message: str, **kwargs):
+         """Log debug message"""
+         self.log(LogLevel.DEBUG, event_type, message, **kwargs)
+
+     def info(self, event_type: EventType, message: str, **kwargs):
+         """Log info message"""
+         self.log(LogLevel.INFO, event_type, message, **kwargs)
+
+     def warning(self, event_type: EventType, message: str, **kwargs):
+         """Log warning message"""
+         self.log(LogLevel.WARNING, event_type, message, **kwargs)
+
+     def error(self, event_type: EventType, message: str, **kwargs):
+         """Log error message"""
+         self.log(LogLevel.ERROR, event_type, message, exc_info=True, **kwargs)
+
+     def critical(self, event_type: EventType, message: str, **kwargs):
+         """Log critical message"""
+         self.log(LogLevel.CRITICAL, event_type, message, exc_info=True, **kwargs)
+
+     @contextmanager
+     def operation(self,
+                   event_type: EventType,
+                   operation_name: str,
+                   component: str = None,
+                   metadata: Dict[str, Any] = None):
+         """Context manager for timing operations"""
+
+         start_time = time.time()
+         trace_id = f"{operation_name}_{int(start_time * 1000)}"
+
+         # Set trace ID for operation
+         self.set_trace_id(trace_id)
+
+         # Log operation start
+         self.info(
+             event_type,
+             f"Starting {operation_name}",
+             component=component,
+             metadata={**(metadata or {}), 'operation': operation_name, 'status': 'started'}
+         )
+
+         try:
+             yield self
+
+             # Log successful completion
+             duration = (time.time() - start_time) * 1000
+             self.info(
+                 event_type,
+                 f"Completed {operation_name}",
+                 component=component,
+                 duration_ms=duration,
+                 metadata={**(metadata or {}), 'operation': operation_name, 'status': 'completed'}
+             )
+
+         except Exception as e:
+             # Log error (note: failures are reported under MODEL_TRAINING_ERROR
+             # regardless of the operation's own event type)
+             duration = (time.time() - start_time) * 1000
+             self.error(
+                 EventType.MODEL_TRAINING_ERROR,
+                 f"Failed {operation_name}: {str(e)}",
+                 component=component,
+                 duration_ms=duration,
+                 metadata={**(metadata or {}), 'operation': operation_name, 'status': 'failed'}
+             )
+             raise
+
+         finally:
+             # Clear trace ID
+             self.clear_trace_id()
+
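Wrapping a block in operation() emits paired events that share one trace_id; a short sketch, assuming the definitions above (run_cross_validation is a hypothetical helper):

    logger = StructuredLogger("model_trainer")
    with logger.operation(EventType.CROSS_VALIDATION_START, "five_fold_cv"):
        run_cross_validation()
    # emits "Starting five_fold_cv" (status: started), then
    # "Completed five_fold_cv" (status: completed, duration_ms set)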
+     def log_performance_metrics(self,
+                                 component: str,
+                                 metrics: Dict[str, Union[int, float]],
+                                 tags: list = None):
+         """Log performance metrics"""
+         self.info(
+             EventType.PERFORMANCE_METRIC,
+             f"Performance metrics for {component}",
+             component=component,
+             metadata={'metrics': metrics},
+             tags=tags or []
+         )
+
+     def log_model_metrics(self,
+                           model_name: str,
+                           metrics: Dict[str, float],
+                           dataset_size: int = None,
+                           cv_folds: int = None,
+                           metadata: Dict[str, Any] = None):
+         """Log model performance metrics"""
+         model_metadata = {
+             'model_name': model_name,
+             'metrics': metrics,
+             **(metadata or {})
+         }
+
+         # Explicit None checks so legitimate zero values are still recorded
+         if dataset_size is not None:
+             model_metadata['dataset_size'] = dataset_size
+         if cv_folds is not None:
+             model_metadata['cv_folds'] = cv_folds
+
+         self.info(
+             EventType.MODEL_VALIDATION,
+             f"Model validation completed for {model_name}",
+             component="model_trainer",
+             metadata=model_metadata,
+             tags=['model_validation', 'metrics']
+         )
+
+     def log_cpu_constraint_warning(self,
+                                    component: str,
+                                    operation: str,
+                                    resource_usage: Dict[str, Any] = None):
+         """Log CPU constraint warnings for HuggingFace Spaces"""
+         self.warning(
+             EventType.CPU_CONSTRAINT_WARNING,
+             f"CPU constraint detected in {component} during {operation}",
+             component=component,
+             metadata={
+                 'operation': operation,
+                 'resource_usage': resource_usage or {},
+                 'optimization_applied': True,
+                 'environment': 'huggingface_spaces'
+             },
+             tags=['cpu_constraint', 'optimization', 'hfs']
+         )
+
+
+ class JsonFormatter(logging.Formatter):
+     """JSON formatter for structured logging"""
+
+     def format(self, record):
+         """Format log record as JSON"""
+         if hasattr(record, 'log_entry'):
+             return record.log_entry.to_json()
+
+         # Fallback for non-structured logs
+         log_data = {
+             'timestamp': datetime.now(timezone.utc).isoformat(),
+             'level': record.levelname,
+             'message': record.getMessage(),
+             'component': record.name,
+             'environment': 'unknown'
+         }
+
+         if record.exc_info:
+             log_data['exception'] = {
+                 'type': record.exc_info[0].__name__ if record.exc_info[0] else None,
+                 'message': str(record.exc_info[1]) if record.exc_info[1] else None,
+                 'traceback': self.formatException(record.exc_info)
+             }
+
+         return json.dumps(log_data, default=str, ensure_ascii=False)
+
+
+ # Singleton logger instances for different components
+ class MLOpsLoggers:
+     """Centralized logger management for MLOps components"""
+
+     _loggers: Dict[str, StructuredLogger] = {}
+
+     @classmethod
+     def get_logger(cls,
+                    component: str,
+                    log_level: LogLevel = LogLevel.INFO,
+                    log_file: Optional[Path] = None) -> StructuredLogger:
+         """Get or create logger for component"""
+         if component not in cls._loggers:
+             if log_file is None:
+                 log_file = Path("/tmp/logs") / f"{component}.log"
+
+             cls._loggers[component] = StructuredLogger(
+                 name=component,
+                 log_level=log_level,
+                 log_file=log_file,
+                 enable_console=True,
+                 enable_json_format=True
+             )
+
+         return cls._loggers[component]
+
+     @classmethod
+     def get_model_trainer_logger(cls) -> StructuredLogger:
+         """Get logger for model training components"""
+         return cls.get_logger("model_trainer", LogLevel.INFO)
+
+     @classmethod
+     def get_retraining_logger(cls) -> StructuredLogger:
+         """Get logger for retraining components"""
+         return cls.get_logger("model_retrainer", LogLevel.INFO)
+
+     @classmethod
+     def get_api_logger(cls) -> StructuredLogger:
+         """Get logger for API components"""
+         return cls.get_logger("api_server", LogLevel.INFO)
+
+     @classmethod
+     def get_monitoring_logger(cls) -> StructuredLogger:
+         """Get logger for monitoring components"""
+         return cls.get_logger("monitoring", LogLevel.INFO)
+
+     @classmethod
+     def get_data_logger(cls) -> StructuredLogger:
+         """Get logger for data processing components"""
+         return cls.get_logger("data_processing", LogLevel.INFO)
+
+
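Usage sketch (component name illustrative). Note that get_logger caches per component, so log_level and log_file only take effect on the first call for a given name:

    data_logger = MLOpsLoggers.get_logger("data_processing", LogLevel.DEBUG)
    data_logger.info(EventType.DATA_LOADING, "Loaded training data",
                     metadata={'rows': 10000, 'source': 'train.csv'})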
+ # Performance monitoring utilities
+ class PerformanceMonitor:
+     """Monitor and log performance metrics for CPU-constrained environments"""
+
+     def __init__(self, logger: StructuredLogger):
+         self.logger = logger
+
+     def monitor_training_performance(self,
+                                      model_name: str,
+                                      dataset_size: int,
+                                      training_time: float,
+                                      memory_usage_mb: float = None):
+         """Monitor and log training performance"""
+
+         # Calculate performance metrics
+         samples_per_second = dataset_size / training_time if training_time > 0 else 0
+
+         performance_metrics = {
+             'training_time_seconds': training_time,
+             'dataset_size': dataset_size,
+             'samples_per_second': samples_per_second,
+             'model_name': model_name
+         }
+
+         if memory_usage_mb is not None:
+             performance_metrics['memory_usage_mb'] = memory_usage_mb
+
+         # Log performance
+         self.logger.log_performance_metrics(
+             component="model_trainer",
+             metrics=performance_metrics,
+             tags=['training_performance', 'cpu_optimized']
+         )
+
+         # Check for performance issues
+         if training_time > 300:  # 5 minutes
+             self.logger.log_cpu_constraint_warning(
+                 component="model_trainer",
+                 operation="model_training",
+                 resource_usage={'training_time': training_time, 'dataset_size': dataset_size}
+             )
+
+     def monitor_cv_performance(self,
+                                cv_folds: int,
+                                total_cv_time: float,
+                                models_evaluated: int):
+         """Monitor cross-validation performance"""
+
+         avg_fold_time = total_cv_time / cv_folds if cv_folds > 0 else 0
+         avg_model_time = total_cv_time / models_evaluated if models_evaluated > 0 else 0
+
+         cv_metrics = {
+             'cv_folds': cv_folds,
+             'total_cv_time_seconds': total_cv_time,
+             'avg_fold_time_seconds': avg_fold_time,
+             'models_evaluated': models_evaluated,
+             'avg_model_time_seconds': avg_model_time
+         }
+
+         self.logger.log_performance_metrics(
+             component="cross_validation",
+             metrics=cv_metrics,
+             tags=['cv_performance', 'statistical_validation']
+         )
+
+     def monitor_ensemble_performance(self,
+                                      individual_models_count: int,
+                                      ensemble_training_time: float,
+                                      statistical_test_time: float):
+         """Monitor ensemble creation and validation performance"""
+
+         ensemble_metrics = {
+             'individual_models_count': individual_models_count,
+             'ensemble_training_time_seconds': ensemble_training_time,
+             'statistical_test_time_seconds': statistical_test_time,
+             'total_ensemble_time_seconds': ensemble_training_time + statistical_test_time
+         }
+
+         self.logger.log_performance_metrics(
+             component="ensemble_manager",
+             metrics=ensemble_metrics,
+             tags=['ensemble_performance', 'statistical_tests']
+         )
+
+
+ # Integration helpers for existing codebase
+ def setup_mlops_logging():
+     """Setup structured logging for MLOps components"""
+     # Ensure log directory exists
+     log_dir = Path("/tmp/logs")
+     log_dir.mkdir(exist_ok=True)
+
+     # Configure root logger to avoid interference
+     root_logger = logging.getLogger()
+     root_logger.setLevel(logging.INFO)
+
+     # Clear any existing handlers
+     for handler in root_logger.handlers[:]:
+         root_logger.removeHandler(handler)
+
+     return MLOpsLoggers
+
+
+ def get_component_logger(component_name: str) -> StructuredLogger:
+     """Get logger for specific component (backwards compatibility)"""
+     return MLOpsLoggers.get_logger(component_name)
+
+
+ # Decorators for automatic logging
+ def log_function_call(event_type: EventType, component: str = None):
+     """Decorator to automatically log function calls"""
+     def decorator(func):
+         @functools.wraps(func)  # preserve the wrapped function's name and docstring
+         def wrapper(*args, **kwargs):
+             logger = MLOpsLoggers.get_logger(component or func.__module__)
+
+             with logger.operation(
+                 event_type=event_type,
+                 operation_name=func.__name__,
+                 component=component,
+                 metadata={'function': func.__name__, 'args_count': len(args), 'kwargs_count': len(kwargs)}
+             ):
+                 return func(*args, **kwargs)
+
+         return wrapper
+     return decorator
+
+
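A usage sketch for the decorator (function and component names are hypothetical); each call is then logged as started/completed/failed with timing via operation():

    @log_function_call(EventType.FEATURE_EXTRACTION, component="feature_pipeline")
    def extract_features(df):
        return df.dropna()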
+ # Example usage functions for integration
+ def integrate_with_retrain_py():
+     """Example integration with retrain.py"""
+     logger = MLOpsLoggers.get_retraining_logger()
+
+     # Example: Log retraining session start
+     logger.info(
+         EventType.MODEL_TRAINING_START,
+         "Enhanced retraining session started with LightGBM and ensemble",
+         component="retrain",
+         metadata={
+             'models': ['logistic_regression', 'random_forest', 'lightgbm'],
+             'ensemble_enabled': True,
+             'enhanced_features': True
+         },
+         tags=['retraining', 'lightgbm', 'ensemble']
+     )
+
+     return logger
+
+
+ def integrate_with_train_py():
+     """Example integration with train.py"""
+     logger = MLOpsLoggers.get_model_trainer_logger()
+
+     # Example: Log training session start
+     logger.info(
+         EventType.MODEL_TRAINING_START,
+         "Enhanced training session started with comprehensive CV",
+         component="train",
+         metadata={
+             'models': ['logistic_regression', 'random_forest', 'lightgbm'],
+             'cv_folds': 5,
+             'ensemble_enabled': True
+         },
+         tags=['training', 'cv', 'ensemble']
+     )
+
+     return logger
+
+
+ # CPU constraint monitoring (os and psutil are imported at the top of the module)
+ def monitor_cpu_constraints():
+     """Monitor CPU usage and memory for HuggingFace Spaces constraints"""
+     logger = MLOpsLoggers.get_monitoring_logger()
+
+     try:
+         # Get system metrics
+         cpu_percent = psutil.cpu_percent(interval=1)
+         memory = psutil.virtual_memory()
+         process = psutil.Process()
+
+         resource_metrics = {
+             'cpu_percent': cpu_percent,
+             'memory_percent': memory.percent,
+             'memory_used_mb': memory.used / 1024 / 1024,
+             'memory_available_mb': memory.available / 1024 / 1024,
+             'process_memory_mb': process.memory_info().rss / 1024 / 1024,
+             'process_cpu_percent': process.cpu_percent()
+         }
+
+         # Log resource usage
+         logger.log_performance_metrics(
+             component="system_monitor",
+             metrics=resource_metrics,
+             tags=['resource_monitoring', 'hfs_constraints']
+         )
+
+         # Alert on high usage (HFS constraints)
+         if cpu_percent > 80 or memory.percent > 85:
+             logger.log_cpu_constraint_warning(
+                 component="system_monitor",
+                 operation="resource_monitoring",
+                 resource_usage=resource_metrics
+             )
+
+         return resource_metrics
+
+     except Exception as e:
+         logger.error(
+             EventType.PERFORMANCE_METRIC,
+             f"Failed to monitor CPU constraints: {str(e)}",
+             component="system_monitor"
+         )
+         return None
+
+
+ if __name__ == "__main__":
+     # Example usage and testing
+     setup_mlops_logging()
+
+     # Test structured logging
+     logger = MLOpsLoggers.get_model_trainer_logger()
+
+     # Test basic logging
+     logger.info(
+         EventType.MODEL_TRAINING_START,
+         "Testing structured logging system",
+         metadata={'test': True, 'version': '1.0'},
+         tags=['test', 'structured_logging']
+     )
+
+     # Test operation timing
+     with logger.operation(
+         EventType.MODEL_VALIDATION,
+         "test_operation",
+         metadata={'test_data': 'example'}
+     ):
+         time.sleep(0.1)  # Simulate work
+
+     # Test performance monitoring
+     perf_monitor = PerformanceMonitor(logger)
+     perf_monitor.monitor_training_performance(
+         model_name="test_model",
+         dataset_size=1000,
+         training_time=5.0,
+         memory_usage_mb=150.0
+     )
+
+     # Test CPU monitoring
+     monitor_cpu_constraints()
+
+     print("Structured logging system test completed successfully!")