Update initialize_system.py

initialize_system.py (+17 -88)
```diff
@@ -1,19 +1,10 @@
 import os
 import sys
-import json
-import joblib
 import shutil
 import pandas as pd
+import json
 from pathlib import Path
 from datetime import datetime
-from sklearn.pipeline import Pipeline
-from model.train import EnhancedModelTrainer
-from sklearn.model_selection import cross_validate
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, f1_score
-from sklearn.feature_extraction.text import TfidfVectorizer
-

 # Import the new path manager
 try:
@@ -188,9 +179,6 @@ def create_minimal_dataset():
         return False


-
-
-
 def run_initial_training():
     """Run basic model training"""
     log_step("Starting initial model training...")
@@ -236,7 +224,12 @@ def run_initial_training():
         return True

     # Import required libraries
-
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import accuracy_score, f1_score
+    from sklearn.pipeline import Pipeline
+    import joblib

     # Load dataset
     dataset_path = path_manager.get_combined_dataset_path()
```
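The sklearn and joblib imports dropped from the module header in the first hunk come back here as function-local imports, so the heavy dependencies load only when training actually runs. A minimal sketch of that deferred-import pattern; the `light_step()` helper is illustrative, not from the commit:

```python
# Deferred-import sketch: the module can be imported (and light steps run)
# even on an environment without scikit-learn; the heavy imports execute
# only when run_initial_training() is called.

def light_step():
    # Directory/log setup needs no ML dependencies.
    print("ok without sklearn")

def run_initial_training():
    # Import required libraries (function-local, as in the diff above)
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LogisticRegression
    import joblib
    print("heavy dependencies loaded on demand")

light_step()  # works even if scikit-learn is not installed
```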
```diff
@@ -281,67 +274,22 @@ def run_initial_training():
         ))
     ])

-    # Train model
-    log_step("Training model
-
-    # Perform cross-validation before final training
-    cv_results = cross_validate(
-        pipeline, X_train, y_train,
-        cv=3,
-        scoring=['accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted'],
-        return_train_score=True
-    )
-
-    # Train final model on all training data
+    # Train model
+    log_step("Training model...")
     pipeline.fit(X_train, y_train)
-
-    # Evaluate
+
+    # Evaluate
     y_pred = pipeline.predict(X_test)
     accuracy = accuracy_score(y_test, y_pred)
     f1 = f1_score(y_test, y_pred, average='weighted')
-
-    # Save CV results for API access
-    cv_data = {
-        "n_splits": 3,
-        "test_scores": {
-            "accuracy": {
-                "mean": float(cv_results['test_accuracy'].mean()),
-                "std": float(cv_results['test_accuracy'].std()),
-                "scores": cv_results['test_accuracy'].tolist()
-            },
-            "f1": {
-                "mean": float(cv_results['test_f1_weighted'].mean()),
-                "std": float(cv_results['test_f1_weighted'].std()),
-                "scores": cv_results['test_f1_weighted'].tolist()
-            }
-        },
-        "train_scores": {
-            "accuracy": {
-                "mean": float(cv_results['train_accuracy'].mean()),
-                "std": float(cv_results['train_accuracy'].std()),
-                "scores": cv_results['train_accuracy'].tolist()
-            },
-            "f1": {
-                "mean": float(cv_results['train_f1_weighted'].mean()),
-                "std": float(cv_results['train_f1_weighted'].std()),
-                "scores": cv_results['train_f1_weighted'].tolist()
-            }
-        }
-    }
-
-    # Save CV results to file
-    cv_results_path = path_manager.get_logs_path("cv_results.json")
-    with open(cv_results_path, 'w') as f:
-        json.dump(cv_data, f, indent=2)
-    log_step(f"Saved CV results to: {cv_results_path}")
-
+
     # Ensure model directory exists
     model_path.parent.mkdir(parents=True, exist_ok=True)
-
+
     # Save complete pipeline FIRST (this is the priority)
     log_step(f"Saving pipeline to: {pipeline_path}")
     joblib.dump(pipeline, pipeline_path)
-
+
     # Verify pipeline was saved
     if pipeline_path.exists():
         log_step(f"✅ Pipeline saved successfully to {pipeline_path}")
```
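With cross-validation gone, training reduces to a single fit on the training split, a hold-out evaluation, and one pipeline dump. A self-contained sketch of that flow; the toy data, hyperparameters, and the `models/pipeline.joblib` path are illustrative, not taken from the repository:

```python
# Sketch of the simplified flow after this commit: fit once, evaluate on a
# hold-out split, persist the whole pipeline, then round-trip it to check
# that the saved artifact predicts without refitting.
from pathlib import Path

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Toy stand-in for the combined dataset loaded via path_manager.
texts = ["good product", "bad service", "great support", "awful delay"] * 10
labels = ["pos", "neg", "pos", "neg"] * 10

X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.25, random_state=42, stratify=labels
)

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),                # hypothetical params
    ("clf", LogisticRegression(max_iter=1000)),  # hypothetical params
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("accuracy:", accuracy_score(y_test, y_pred))
print("weighted f1:", f1_score(y_test, y_pred, average="weighted"))

pipeline_path = Path("models/pipeline.joblib")  # hypothetical path
pipeline_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, pipeline_path)

# Round-trip check: the reloaded pipeline should agree with the live one.
reloaded = joblib.load(pipeline_path)
assert list(reloaded.predict(X_test[:2])) == list(y_pred[:2])
```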
|
@@ -378,12 +326,7 @@ def run_initial_training():
|
|
| 378 |
"data_path": str(dataset_path),
|
| 379 |
"class_distribution": class_counts.to_dict(),
|
| 380 |
"pipeline_created": pipeline_path.exists(),
|
| 381 |
-
"individual_components_created": model_path.exists() and vectorizer_path.exists()
|
| 382 |
-
# Add CV results to metadata
|
| 383 |
-
"cv_f1_mean": float(cv_results['test_f1_weighted'].mean()),
|
| 384 |
-
"cv_f1_std": float(cv_results['test_f1_weighted'].std()),
|
| 385 |
-
"cv_accuracy_mean": float(cv_results['test_accuracy'].mean()),
|
| 386 |
-
"cv_accuracy_std": float(cv_results['test_accuracy'].std())
|
| 387 |
}
|
| 388 |
|
| 389 |
metadata_path = path_manager.get_metadata_path()
|
|
@@ -445,20 +388,6 @@ def create_initial_logs():
|
|
| 445 |
json.dump([], f)
|
| 446 |
log_step(f"✅ Created {log_file}")
|
| 447 |
|
| 448 |
-
# Create monitoring directory structure
|
| 449 |
-
monitor_dir = path_manager.get_logs_path("monitor")
|
| 450 |
-
monitor_dir.mkdir(parents=True, exist_ok=True)
|
| 451 |
-
log_step(f"✅ Created monitoring directory: {monitor_dir}")
|
| 452 |
-
|
| 453 |
-
# Create empty monitoring log files
|
| 454 |
-
monitor_files = ["predictions.json", "metrics.json", "alerts.json"]
|
| 455 |
-
for monitor_file in monitor_files:
|
| 456 |
-
monitor_path = monitor_dir / monitor_file
|
| 457 |
-
if not monitor_path.exists():
|
| 458 |
-
with open(monitor_path, 'w') as f:
|
| 459 |
-
json.dump([], f)
|
| 460 |
-
log_step(f"✅ Created {monitor_file}")
|
| 461 |
-
|
| 462 |
return True
|
| 463 |
|
| 464 |
except Exception as e:
|
|
@@ -591,7 +520,7 @@ def main():
|
|
| 591 |
return len(failed_steps) == 0
|
| 592 |
|
| 593 |
|
| 594 |
-
|
| 595 |
if __name__ == "__main__":
|
| 596 |
success = main()
|
| 597 |
-
|
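The new tail gives the script a meaningful exit status: a failed step now surfaces as a non-zero return code instead of a silent pass, so wrappers can gate on it. A condensed sketch of the contract, with `failed_steps` standing in for the real step bookkeeping:

```python
# Exit-status sketch: main() reports success as a bool, and the entry
# point converts failure into exit code 1 for shells and CI to detect.
import sys

def main() -> bool:
    failed_steps = []  # the real script appends each step that fails
    return len(failed_steps) == 0

if __name__ == "__main__":
    success = main()
    if not success:
        sys.exit(1)
```

Before this change the process always exited 0, so a chained `&&` in a deploy script would keep going even after a failed initialization.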