Commit 0616f70
Parent(s): 0d47abb
Create utils/structured_logger.py

Adding Tests for MLOps Infrastructure Enhancement

- utils/structured_logger.py +750 -0
utils/structured_logger.py
ADDED
@@ -0,0 +1,750 @@
# utils/structured_logger.py
# Production-ready structured logging system for MLOps grade enhancement

import functools
import json
import logging
import os
import sys
import threading
import time
import traceback
from contextlib import contextmanager
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Dict, Any, Optional, Union

import psutil  # third-party dependency; used for CPU/memory monitoring

class LogLevel(Enum):
    """Standardized log levels with numeric values for filtering"""
    DEBUG = 10
    INFO = 20
    WARNING = 30
    ERROR = 40
    CRITICAL = 50


class EventType(Enum):
    """Standardized event types for structured logging"""
    # Model lifecycle events
    MODEL_TRAINING_START = "model.training.start"
    MODEL_TRAINING_COMPLETE = "model.training.complete"
    MODEL_TRAINING_ERROR = "model.training.error"
    MODEL_VALIDATION = "model.validation"
    MODEL_PROMOTION = "model.promotion"
    MODEL_BACKUP = "model.backup"

    # Data processing events
    DATA_LOADING = "data.loading"
    DATA_VALIDATION = "data.validation"
    DATA_PREPROCESSING = "data.preprocessing"
    DATA_QUALITY_CHECK = "data.quality.check"

    # Feature engineering events
    FEATURE_EXTRACTION = "features.extraction"
    FEATURE_SELECTION = "features.selection"
    FEATURE_VALIDATION = "features.validation"

    # Cross-validation and ensemble events
    CROSS_VALIDATION_START = "cv.start"
    CROSS_VALIDATION_COMPLETE = "cv.complete"
    ENSEMBLE_CREATION = "ensemble.creation"
    ENSEMBLE_VALIDATION = "ensemble.validation"
    STATISTICAL_COMPARISON = "model.statistical_comparison"

    # System performance events
    PERFORMANCE_METRIC = "system.performance"
    RESOURCE_USAGE = "system.resource_usage"
    CPU_CONSTRAINT_WARNING = "system.cpu_constraint"

    # API and application events
    API_REQUEST = "api.request"
    API_RESPONSE = "api.response"
    API_ERROR = "api.error"
    PREDICTION_REQUEST = "prediction.request"
    PREDICTION_RESPONSE = "prediction.response"

    # Monitoring and alerting events
    DRIFT_DETECTION = "monitor.drift"
    ALERT_TRIGGERED = "alert.triggered"
    HEALTH_CHECK = "health.check"

    # Security and access events
    ACCESS_GRANTED = "security.access_granted"
    ACCESS_DENIED = "security.access_denied"
    AUTHENTICATION = "security.authentication"


@dataclass
class LogEntry:
    """Structured log entry with standardized fields"""
    timestamp: str
    level: str
    event_type: str
    message: str
    component: str
    session_id: Optional[str] = None
    trace_id: Optional[str] = None
    user_id: Optional[str] = None
    duration_ms: Optional[float] = None
    metadata: Optional[Dict[str, Any]] = None
    tags: Optional[list] = None
    environment: str = "production"
    version: str = "1.0"

    def to_dict(self) -> Dict[str, Any]:
        """Convert log entry to dictionary for JSON serialization"""
        return {k: v for k, v in asdict(self).items() if v is not None}

    def to_json(self) -> str:
        """Convert log entry to JSON string"""
        return json.dumps(self.to_dict(), default=str, ensure_ascii=False)

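# Usage sketch for LogEntry (values are illustrative; in normal use entries
# are built by StructuredLogger below rather than by hand):
#
#     entry = LogEntry(
#         timestamp=datetime.now(timezone.utc).isoformat(),
#         level="INFO",
#         event_type=EventType.DATA_LOADING.value,
#         message="Loaded training data",
#         component="data_processing",
#         metadata={"rows": 1000},
#     )
#     print(entry.to_json())  # None-valued fields are dropped from the JSON
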
class StructuredLogger:
    """Production-ready structured logger with performance monitoring"""

    def __init__(self,
                 name: str,
                 log_level: LogLevel = LogLevel.INFO,
                 log_file: Optional[Path] = None,
                 max_file_size_mb: int = 100,
                 backup_count: int = 5,
                 enable_console: bool = True,
                 enable_json_format: bool = True):

        self.name = name
        self.log_level = log_level
        self.enable_json_format = enable_json_format
        self.session_id = self._generate_session_id()
        self._local = threading.local()

        # Set up logging infrastructure
        self.logger = logging.getLogger(name)
        self.logger.setLevel(log_level.value)

        # Clear existing handlers to avoid duplicates
        self.logger.handlers.clear()

        # Set up file logging if specified
        if log_file:
            self._setup_file_handler(log_file, max_file_size_mb, backup_count)

        # Set up console logging if enabled
        if enable_console:
            self._setup_console_handler()

    def _generate_session_id(self) -> str:
        """Generate unique session ID for tracking related events"""
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        thread_id = threading.current_thread().ident
        return f"session_{timestamp}_{thread_id}"

    def _setup_file_handler(self, log_file: Path, max_size_mb: int, backup_count: int):
        """Set up rotating file handler"""
        from logging.handlers import RotatingFileHandler

        log_file.parent.mkdir(parents=True, exist_ok=True)

        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=max_size_mb * 1024 * 1024,
            backupCount=backup_count,
            encoding='utf-8'
        )

        if self.enable_json_format:
            file_handler.setFormatter(JsonFormatter())
        else:
            file_handler.setFormatter(self._create_standard_formatter())

        self.logger.addHandler(file_handler)

    def _setup_console_handler(self):
        """Set up console handler with appropriate formatting"""
        console_handler = logging.StreamHandler(sys.stdout)

        if self.enable_json_format:
            console_handler.setFormatter(JsonFormatter())
        else:
            console_handler.setFormatter(self._create_standard_formatter())

        self.logger.addHandler(console_handler)

    def _create_standard_formatter(self):
        """Create human-readable formatter for non-JSON output"""
        return logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )

    def _get_trace_id(self) -> Optional[str]:
        """Get current trace ID from thread-local storage"""
        return getattr(self._local, 'trace_id', None)

    def set_trace_id(self, trace_id: str):
        """Set trace ID for current thread"""
        self._local.trace_id = trace_id

    def clear_trace_id(self):
        """Clear trace ID for current thread"""
        if hasattr(self._local, 'trace_id'):
            del self._local.trace_id

    def _create_log_entry(self,
                          level: LogLevel,
                          event_type: EventType,
                          message: str,
                          component: Optional[str] = None,
                          duration_ms: Optional[float] = None,
                          metadata: Optional[Dict[str, Any]] = None,
                          tags: Optional[list] = None,
                          user_id: Optional[str] = None) -> LogEntry:
        """Create structured log entry"""

        return LogEntry(
            timestamp=datetime.now(timezone.utc).isoformat(),
            level=level.name,
            event_type=event_type.value,
            message=message,
            component=component or self.name,
            session_id=self.session_id,
            trace_id=self._get_trace_id(),
            user_id=user_id,
            duration_ms=duration_ms,
            metadata=metadata or {},
            tags=tags or [],
            environment=self._detect_environment(),
            version=self._get_version()
        )

    def _detect_environment(self) -> str:
        """Detect current environment"""
        if any(env in str(Path.cwd()) for env in ['test', 'pytest']):
            return 'test'
        elif 'STREAMLIT_SERVER_PORT' in os.environ:
            return 'streamlit'
        elif 'SPACE_ID' in os.environ:
            return 'huggingface_spaces'
        elif 'DOCKER_CONTAINER' in os.environ:
            return 'docker'
        else:
            return 'local'

    def _get_version(self) -> str:
        """Get application version"""
        # Try to read from metadata or config
        try:
            metadata_path = Path("/tmp/metadata.json")
            if metadata_path.exists():
                with open(metadata_path) as f:
                    metadata = json.load(f)
                return metadata.get('model_version', '1.0')
        except Exception:
            pass
        return '1.0'

    def log(self,
            level: LogLevel,
            event_type: EventType,
            message: str,
            component: Optional[str] = None,
            duration_ms: Optional[float] = None,
            metadata: Optional[Dict[str, Any]] = None,
            tags: Optional[list] = None,
            user_id: Optional[str] = None,
            exc_info: bool = False):
        """Core logging method"""

        if level.value < self.log_level.value:
            return

        # Create structured log entry
        log_entry = self._create_log_entry(
            level=level,
            event_type=event_type,
            message=message,
            component=component,
            duration_ms=duration_ms,
            metadata=metadata,
            tags=tags,
            user_id=user_id
        )

        # Add exception information if requested
        if exc_info:
            log_entry.metadata['exception'] = {
                'type': sys.exc_info()[0].__name__ if sys.exc_info()[0] else None,
                'message': str(sys.exc_info()[1]) if sys.exc_info()[1] else None,
                'traceback': traceback.format_exc() if sys.exc_info()[0] else None
            }

        # Log using Python's logging framework
        self.logger.log(
            level.value,
            log_entry.to_json() if self.enable_json_format else message,
            extra={'log_entry': log_entry}
        )

    # Convenience methods for different log levels
    def debug(self, event_type: EventType, message: str, **kwargs):
        """Log debug message"""
        self.log(LogLevel.DEBUG, event_type, message, **kwargs)

    def info(self, event_type: EventType, message: str, **kwargs):
        """Log info message"""
        self.log(LogLevel.INFO, event_type, message, **kwargs)

    def warning(self, event_type: EventType, message: str, **kwargs):
        """Log warning message"""
        self.log(LogLevel.WARNING, event_type, message, **kwargs)

    def error(self, event_type: EventType, message: str, **kwargs):
        """Log error message"""
        self.log(LogLevel.ERROR, event_type, message, exc_info=True, **kwargs)

    def critical(self, event_type: EventType, message: str, **kwargs):
        """Log critical message"""
        self.log(LogLevel.CRITICAL, event_type, message, exc_info=True, **kwargs)

    @contextmanager
    def operation(self,
                  event_type: EventType,
                  operation_name: str,
                  component: Optional[str] = None,
                  metadata: Optional[Dict[str, Any]] = None):
        """Context manager for timing operations"""

        start_time = time.time()
        trace_id = f"{operation_name}_{int(start_time * 1000)}"

        # Set trace ID for operation
        self.set_trace_id(trace_id)

        # Log operation start
        self.info(
            event_type,
            f"Starting {operation_name}",
            component=component,
            metadata={**(metadata or {}), 'operation': operation_name, 'status': 'started'}
        )

        try:
            yield self

            # Log successful completion
            duration = (time.time() - start_time) * 1000
            self.info(
                event_type,
                f"Completed {operation_name}",
                component=component,
                duration_ms=duration,
                metadata={**(metadata or {}), 'operation': operation_name, 'status': 'completed'}
            )

        except Exception as e:
            # Log failure under the operation's own event type so errors stay
            # correlated with the event stream they belong to
            duration = (time.time() - start_time) * 1000
            self.error(
                event_type,
                f"Failed {operation_name}: {str(e)}",
                component=component,
                duration_ms=duration,
                metadata={**(metadata or {}), 'operation': operation_name, 'status': 'failed'}
            )
            raise

        finally:
            # Clear trace ID
            self.clear_trace_id()

    def log_performance_metrics(self,
                                component: str,
                                metrics: Dict[str, Union[int, float]],
                                tags: Optional[list] = None):
        """Log performance metrics"""
        self.info(
            EventType.PERFORMANCE_METRIC,
            f"Performance metrics for {component}",
            component=component,
            metadata={'metrics': metrics},
            tags=tags or []
        )

    def log_model_metrics(self,
                          model_name: str,
                          metrics: Dict[str, float],
                          dataset_size: Optional[int] = None,
                          cv_folds: Optional[int] = None,
                          metadata: Optional[Dict[str, Any]] = None):
        """Log model performance metrics"""
        model_metadata = {
            'model_name': model_name,
            'metrics': metrics,
            **(metadata or {})
        }

        if dataset_size is not None:
            model_metadata['dataset_size'] = dataset_size
        if cv_folds is not None:
            model_metadata['cv_folds'] = cv_folds

        self.info(
            EventType.MODEL_VALIDATION,
            f"Model validation completed for {model_name}",
            component="model_trainer",
            metadata=model_metadata,
            tags=['model_validation', 'metrics']
        )

    def log_cpu_constraint_warning(self,
                                   component: str,
                                   operation: str,
                                   resource_usage: Optional[Dict[str, Any]] = None):
        """Log CPU constraint warnings for HuggingFace Spaces"""
        self.warning(
            EventType.CPU_CONSTRAINT_WARNING,
            f"CPU constraint detected in {component} during {operation}",
            component=component,
            metadata={
                'operation': operation,
                'resource_usage': resource_usage or {},
                'optimization_applied': True,
                'environment': 'huggingface_spaces'
            },
            tags=['cpu_constraint', 'optimization', 'hfs']
        )

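# Usage sketch for operation(): start, completion (with duration_ms), and
# failure events are emitted automatically; scale_features is a hypothetical
# helper used only for illustration.
#
#     logger = StructuredLogger("demo")
#     with logger.operation(EventType.DATA_PREPROCESSING, "feature_scaling"):
#         scale_features()
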
class JsonFormatter(logging.Formatter):
    """JSON formatter for structured logging"""

    def format(self, record):
        """Format log record as JSON"""
        if hasattr(record, 'log_entry'):
            return record.log_entry.to_json()

        # Fallback for non-structured logs
        log_data = {
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'level': record.levelname,
            'message': record.getMessage(),
            'component': record.name,
            'environment': 'unknown'
        }

        if record.exc_info:
            log_data['exception'] = {
                'type': record.exc_info[0].__name__ if record.exc_info[0] else None,
                'message': str(record.exc_info[1]) if record.exc_info[1] else None,
                'traceback': self.formatException(record.exc_info)
            }

        return json.dumps(log_data, default=str, ensure_ascii=False)

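# JsonFormatter can also be attached to an ordinary stdlib logger; records
# without an attached LogEntry go through the fallback dictionary above
# (sketch, logger name is illustrative):
#
#     plain = logging.getLogger("third_party")
#     handler = logging.StreamHandler(sys.stdout)
#     handler.setFormatter(JsonFormatter())
#     plain.addHandler(handler)
#     plain.warning("emitted as JSON via the fallback path")
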
# Singleton logger instances for different components
class MLOpsLoggers:
    """Centralized logger management for MLOps components"""

    _loggers: Dict[str, StructuredLogger] = {}

    @classmethod
    def get_logger(cls,
                   component: str,
                   log_level: LogLevel = LogLevel.INFO,
                   log_file: Optional[Path] = None) -> StructuredLogger:
        """Get or create logger for component"""
        if component not in cls._loggers:
            if log_file is None:
                log_file = Path("/tmp/logs") / f"{component}.log"

            cls._loggers[component] = StructuredLogger(
                name=component,
                log_level=log_level,
                log_file=log_file,
                enable_console=True,
                enable_json_format=True
            )

        return cls._loggers[component]

    @classmethod
    def get_model_trainer_logger(cls) -> StructuredLogger:
        """Get logger for model training components"""
        return cls.get_logger("model_trainer", LogLevel.INFO)

    @classmethod
    def get_retraining_logger(cls) -> StructuredLogger:
        """Get logger for retraining components"""
        return cls.get_logger("model_retrainer", LogLevel.INFO)

    @classmethod
    def get_api_logger(cls) -> StructuredLogger:
        """Get logger for API components"""
        return cls.get_logger("api_server", LogLevel.INFO)

    @classmethod
    def get_monitoring_logger(cls) -> StructuredLogger:
        """Get logger for monitoring components"""
        return cls.get_logger("monitoring", LogLevel.INFO)

    @classmethod
    def get_data_logger(cls) -> StructuredLogger:
        """Get logger for data processing components"""
        return cls.get_logger("data_processing", LogLevel.INFO)

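# Usage sketch: lookups are cached, so each component shares one logger
# instance (and therefore one session_id) across the process:
#
#     api_logger = MLOpsLoggers.get_api_logger()
#     assert api_logger is MLOpsLoggers.get_logger("api_server")
#     api_logger.info(EventType.API_REQUEST, "GET /predict")
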
# Performance monitoring utilities
class PerformanceMonitor:
    """Monitor and log performance metrics for CPU-constrained environments"""

    def __init__(self, logger: StructuredLogger):
        self.logger = logger

    def monitor_training_performance(self,
                                     model_name: str,
                                     dataset_size: int,
                                     training_time: float,
                                     memory_usage_mb: Optional[float] = None):
        """Monitor and log training performance"""

        # Calculate performance metrics; guard against zero-duration runs
        samples_per_second = dataset_size / training_time if training_time > 0 else 0

        performance_metrics = {
            'training_time_seconds': training_time,
            'dataset_size': dataset_size,
            'samples_per_second': samples_per_second,
            'model_name': model_name
        }

        if memory_usage_mb is not None:
            performance_metrics['memory_usage_mb'] = memory_usage_mb

        # Log performance
        self.logger.log_performance_metrics(
            component="model_trainer",
            metrics=performance_metrics,
            tags=['training_performance', 'cpu_optimized']
        )

        # Check for performance issues
        if training_time > 300:  # 5 minutes
            self.logger.log_cpu_constraint_warning(
                component="model_trainer",
                operation="model_training",
                resource_usage={'training_time': training_time, 'dataset_size': dataset_size}
            )

    def monitor_cv_performance(self,
                               cv_folds: int,
                               total_cv_time: float,
                               models_evaluated: int):
        """Monitor cross-validation performance"""

        avg_fold_time = total_cv_time / cv_folds if cv_folds > 0 else 0
        avg_model_time = total_cv_time / models_evaluated if models_evaluated > 0 else 0

        cv_metrics = {
            'cv_folds': cv_folds,
            'total_cv_time_seconds': total_cv_time,
            'avg_fold_time_seconds': avg_fold_time,
            'models_evaluated': models_evaluated,
            'avg_model_time_seconds': avg_model_time
        }

        self.logger.log_performance_metrics(
            component="cross_validation",
            metrics=cv_metrics,
            tags=['cv_performance', 'statistical_validation']
        )

    def monitor_ensemble_performance(self,
                                     individual_models_count: int,
                                     ensemble_training_time: float,
                                     statistical_test_time: float):
        """Monitor ensemble creation and validation performance"""

        ensemble_metrics = {
            'individual_models_count': individual_models_count,
            'ensemble_training_time_seconds': ensemble_training_time,
            'statistical_test_time_seconds': statistical_test_time,
            'total_ensemble_time_seconds': ensemble_training_time + statistical_test_time
        }

        self.logger.log_performance_metrics(
            component="ensemble_manager",
            metrics=ensemble_metrics,
            tags=['ensemble_performance', 'statistical_tests']
        )

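# Usage sketch (timing values are made up for demonstration):
#
#     monitor = PerformanceMonitor(MLOpsLoggers.get_model_trainer_logger())
#     monitor.monitor_cv_performance(cv_folds=5, total_cv_time=42.0,
#                                    models_evaluated=3)
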
# Integration helpers for existing codebase
def setup_mlops_logging():
    """Set up structured logging for MLOps components"""
    # Ensure log directory exists
    log_dir = Path("/tmp/logs")
    log_dir.mkdir(parents=True, exist_ok=True)

    # Configure root logger to avoid interference
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    # Clear any existing handlers
    for handler in root_logger.handlers[:]:
        root_logger.removeHandler(handler)

    return MLOpsLoggers

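# Typical entry-point wiring (sketch): call once at startup, then fetch
# component loggers from the returned registry.
#
#     loggers = setup_mlops_logging()
#     train_logger = loggers.get_model_trainer_logger()
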
def get_component_logger(component_name: str) -> StructuredLogger:
    """Get logger for specific component (backwards compatibility)"""
    return MLOpsLoggers.get_logger(component_name)


# Decorators for automatic logging
def log_function_call(event_type: EventType, component: Optional[str] = None):
    """Decorator to automatically log function calls"""
    def decorator(func):
        @functools.wraps(func)  # preserve the wrapped function's name/docstring
        def wrapper(*args, **kwargs):
            logger = MLOpsLoggers.get_logger(component or func.__module__)

            with logger.operation(
                event_type=event_type,
                operation_name=func.__name__,
                component=component,
                metadata={'function': func.__name__, 'args_count': len(args), 'kwargs_count': len(kwargs)}
            ):
                return func(*args, **kwargs)

        return wrapper
    return decorator

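# Decorator usage sketch (load_dataset is a hypothetical function): each call
# emits paired start/complete events with duration_ms via logger.operation().
#
#     @log_function_call(EventType.DATA_LOADING, component="data_processing")
#     def load_dataset(path):
#         ...
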
# Example usage functions for integration
def integrate_with_retrain_py():
    """Example integration with retrain.py"""
    logger = MLOpsLoggers.get_retraining_logger()

    # Example: Log retraining session start
    logger.info(
        EventType.MODEL_TRAINING_START,
        "Enhanced retraining session started with LightGBM and ensemble",
        component="retrain",
        metadata={
            'models': ['logistic_regression', 'random_forest', 'lightgbm'],
            'ensemble_enabled': True,
            'enhanced_features': True
        },
        tags=['retraining', 'lightgbm', 'ensemble']
    )

    return logger


def integrate_with_train_py():
    """Example integration with train.py"""
    logger = MLOpsLoggers.get_model_trainer_logger()

    # Example: Log training session start
    logger.info(
        EventType.MODEL_TRAINING_START,
        "Enhanced training session started with comprehensive CV",
        component="train",
        metadata={
            'models': ['logistic_regression', 'random_forest', 'lightgbm'],
            'cv_folds': 5,
            'ensemble_enabled': True
        },
        tags=['training', 'cv', 'ensemble']
    )

    return logger


# CPU constraint monitoring
def monitor_cpu_constraints():
    """Monitor CPU usage and memory for HuggingFace Spaces constraints"""
    logger = MLOpsLoggers.get_monitoring_logger()

    try:
        # Get system metrics
        cpu_percent = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory()
        process = psutil.Process()

        resource_metrics = {
            'cpu_percent': cpu_percent,
            'memory_percent': memory.percent,
            'memory_used_mb': memory.used / 1024 / 1024,
            'memory_available_mb': memory.available / 1024 / 1024,
            'process_memory_mb': process.memory_info().rss / 1024 / 1024,
            'process_cpu_percent': process.cpu_percent()
        }

        # Log resource usage
        logger.log_performance_metrics(
            component="system_monitor",
            metrics=resource_metrics,
            tags=['resource_monitoring', 'hfs_constraints']
        )

        # Alert on high usage (HFS constraints)
        if cpu_percent > 80 or memory.percent > 85:
            logger.log_cpu_constraint_warning(
                component="system_monitor",
                operation="resource_monitoring",
                resource_usage=resource_metrics
            )

        return resource_metrics

    except Exception as e:
        logger.error(
            EventType.PERFORMANCE_METRIC,
            f"Failed to monitor CPU constraints: {str(e)}",
            component="system_monitor"
        )
        return None

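# One plausible way to sample constraints periodically in the background;
# the helper, interval, and daemon-thread choice are assumptions, not part
# of this module:
#
#     def _sample_forever(interval_s=60):
#         while True:
#             monitor_cpu_constraints()
#             time.sleep(interval_s)
#
#     threading.Thread(target=_sample_forever, daemon=True).start()
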
if __name__ == "__main__":
    # Example usage and testing
    setup_mlops_logging()

    # Test structured logging
    logger = MLOpsLoggers.get_model_trainer_logger()

    # Test basic logging
    logger.info(
        EventType.MODEL_TRAINING_START,
        "Testing structured logging system",
        metadata={'test': True, 'version': '1.0'},
        tags=['test', 'structured_logging']
    )

    # Test operation timing
    with logger.operation(
        EventType.MODEL_VALIDATION,
        "test_operation",
        metadata={'test_data': 'example'}
    ):
        time.sleep(0.1)  # Simulate work

    # Test performance monitoring
    perf_monitor = PerformanceMonitor(logger)
    perf_monitor.monitor_training_performance(
        model_name="test_model",
        dataset_size=1000,
        training_time=5.0,
        memory_usage_mb=150.0
    )

    # Test CPU monitoring
    monitor_cpu_constraints()

    print("Structured logging system test completed successfully!")