Update initialize_system.py
Browse files- initialize_system.py +23 -5
initialize_system.py
CHANGED
|
@@ -7,13 +7,16 @@ import pandas as pd
|
|
| 7 |
from pathlib import Path
|
| 8 |
from datetime import datetime
|
| 9 |
from sklearn.pipeline import Pipeline
|
|
|
|
| 10 |
from sklearn.model_selection import cross_validate
|
| 11 |
from sklearn.linear_model import LogisticRegression
|
|
|
|
| 12 |
from sklearn.model_selection import train_test_split
|
| 13 |
from sklearn.metrics import accuracy_score, f1_score
|
| 14 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 15 |
|
| 16 |
|
|
|
|
| 17 |
# Import the new path manager
|
| 18 |
try:
|
| 19 |
from path_config import path_manager
|
|
@@ -210,13 +213,28 @@ def run_initial_training():
|
|
| 210 |
log_step("Creating pipeline from existing components...")
|
| 211 |
try:
|
| 212 |
# Load existing components
|
| 213 |
-
model = joblib.load(model_path)
|
| 214 |
vectorizer = joblib.load(vectorizer_path)
|
| 215 |
-
|
| 216 |
# Create pipeline
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
pipeline = Pipeline([
|
| 218 |
-
('vectorizer',
|
| 219 |
-
('model', model
|
| 220 |
])
|
| 221 |
|
| 222 |
# Save pipeline
|
|
@@ -370,7 +388,7 @@ def run_initial_training():
|
|
| 370 |
# Save metadata
|
| 371 |
metadata = {
|
| 372 |
"model_version": "v1.0_init",
|
| 373 |
-
"model_type": "logistic_regression_pipeline",
|
| 374 |
"test_accuracy": float(accuracy),
|
| 375 |
"test_f1": float(f1),
|
| 376 |
"train_size": len(X_train),
|
|
|
|
| 7 |
from pathlib import Path
|
| 8 |
from datetime import datetime
|
| 9 |
from sklearn.pipeline import Pipeline
|
| 10 |
+
from sklearn.ensemble import VotingClassifier
|
| 11 |
from sklearn.model_selection import cross_validate
|
| 12 |
from sklearn.linear_model import LogisticRegression
|
| 13 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 14 |
from sklearn.model_selection import train_test_split
|
| 15 |
from sklearn.metrics import accuracy_score, f1_score
|
| 16 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 17 |
|
| 18 |
|
| 19 |
+
|
| 20 |
# Import the new path manager
|
| 21 |
try:
|
| 22 |
from path_config import path_manager
|
|
|
|
| 213 |
log_step("Creating pipeline from existing components...")
|
| 214 |
try:
|
| 215 |
# Load existing components
|
| 216 |
+
# model = joblib.load(model_path)
|
| 217 |
vectorizer = joblib.load(vectorizer_path)
|
| 218 |
+
|
| 219 |
# Create pipeline
|
| 220 |
+
# pipeline = Pipeline([
|
| 221 |
+
# ('vectorizer', vectorizer),
|
| 222 |
+
# ('model', model)
|
| 223 |
+
# ])
|
| 224 |
+
|
| 225 |
+
# Create ensemble method pipeline
|
| 226 |
+
# Initialize ensemble model
|
| 227 |
+
ensemble_model = VotingClassifier(
|
| 228 |
+
estimators=[
|
| 229 |
+
('logistic', LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')),
|
| 230 |
+
('random_forest', RandomForestClassifier(n_estimators=50, random_state=42, class_weight='balanced'))
|
| 231 |
+
],
|
| 232 |
+
voting='soft'
|
| 233 |
+
)
|
| 234 |
+
|
| 235 |
pipeline = Pipeline([
|
| 236 |
+
('vectorizer', TfidfVectorizer(...)),
|
| 237 |
+
('model', ensemble_model) # Use ensemble instead of single model
|
| 238 |
])
|
| 239 |
|
| 240 |
# Save pipeline
|
|
|
|
| 388 |
# Save metadata
|
| 389 |
metadata = {
|
| 390 |
"model_version": "v1.0_init",
|
| 391 |
+
"model_type": "ensemble_voting_pipeline", # "logistic_regression_pipeline",
|
| 392 |
"test_accuracy": float(accuracy),
|
| 393 |
"test_f1": float(f1),
|
| 394 |
"train_size": len(X_train),
|