Commit · 9702556
1 Parent(s): b9a8a05
Update initialize_system.py

initialize_system.py  CHANGED  (+91 -17)
@@ -184,13 +184,43 @@ def run_initial_training():
     log_step("Starting initial model training...")

     try:
-        #
+        # Get all the paths
         model_path = path_manager.get_model_file_path()
         vectorizer_path = path_manager.get_vectorizer_path()
         pipeline_path = path_manager.get_pipeline_path()

+        log_step(f"Model path: {model_path}")
+        log_step(f"Vectorizer path: {vectorizer_path}")
+        log_step(f"Pipeline path: {pipeline_path}")
+
+        # Check if model already exists
         if pipeline_path.exists() or (model_path.exists() and vectorizer_path.exists()):
-            log_step("✅ Model files already exist")
+            log_step("✅ Model files already exist, checking if pipeline needs to be created...")
+
+            # If individual components exist but pipeline doesn't, create pipeline
+            if model_path.exists() and vectorizer_path.exists() and not pipeline_path.exists():
+                log_step("Creating pipeline from existing components...")
+                try:
+                    import joblib
+                    from sklearn.pipeline import Pipeline
+
+                    # Load existing components
+                    model = joblib.load(model_path)
+                    vectorizer = joblib.load(vectorizer_path)
+
+                    # Create pipeline
+                    pipeline = Pipeline([
+                        ('vectorizer', vectorizer),
+                        ('model', model)
+                    ])
+
+                    # Save pipeline
+                    joblib.dump(pipeline, pipeline_path)
+                    log_step(f"✅ Created pipeline from existing components: {pipeline_path}")
+
+                except Exception as e:
+                    log_step(f"⚠️ Failed to create pipeline from existing components: {e}")
+
             return True

         # Import required libraries

@@ -253,14 +283,34 @@ def run_initial_training():
         accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')

-        #
+        # Ensure model directory exists
+        model_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Save complete pipeline FIRST (this is the priority)
+        log_step(f"Saving pipeline to: {pipeline_path}")
         joblib.dump(pipeline, pipeline_path)
-
+
+        # Verify pipeline was saved
+        if pipeline_path.exists():
+            log_step(f"✅ Pipeline saved successfully to {pipeline_path}")
+
+            # Test loading the pipeline
+            try:
+                test_pipeline = joblib.load(pipeline_path)
+                test_pred = test_pipeline.predict(["This is a test"])
+                log_step(f"✅ Pipeline verification successful: {test_pred}")
+            except Exception as e:
+                log_step(f"⚠️ Pipeline verification failed: {e}")
+        else:
+            log_step(f"❌ Pipeline was not saved to {pipeline_path}")

         # Save individual components for backward compatibility
-
-
-
+        try:
+            joblib.dump(pipeline.named_steps['model'], model_path)
+            joblib.dump(pipeline.named_steps['vectorizer'], vectorizer_path)
+            log_step(f"✅ Saved individual components")
+        except Exception as e:
+            log_step(f"⚠️ Failed to save individual components: {e}")

         # Save metadata
         metadata = {

@@ -274,7 +324,9 @@ def run_initial_training():
             "training_method": "initialization",
             "environment": path_manager.environment,
             "data_path": str(dataset_path),
-            "class_distribution": class_counts.to_dict()
+            "class_distribution": class_counts.to_dict(),
+            "pipeline_created": pipeline_path.exists(),
+            "individual_components_created": model_path.exists() and vectorizer_path.exists()
         }

         metadata_path = path_manager.get_metadata_path()

@@ -284,9 +336,9 @@ def run_initial_training():
         log_step(f"✅ Training completed successfully")
         log_step(f" Accuracy: {accuracy:.4f}")
         log_step(f" F1 Score: {f1:.4f}")
+        log_step(f" Pipeline saved: {pipeline_path.exists()}")
         log_step(f" Model saved to: {model_path}")
         log_step(f" Vectorizer saved to: {vectorizer_path}")
-        log_step(f" Pipeline saved to: {pipeline_path}")

         return True

@@ -352,6 +404,7 @@ def verify_system():
         (path_manager.get_combined_dataset_path(), "Combined dataset"),
         (path_manager.get_model_file_path(), "Model file"),
         (path_manager.get_vectorizer_path(), "Vectorizer file"),
+        (path_manager.get_pipeline_path(), "Pipeline file"),
         (path_manager.get_metadata_path(), "Metadata file"),
         (path_manager.get_activity_log_path(), "Activity log")
     ]

@@ -362,24 +415,31 @@ def verify_system():
             log_step(f"✅ {description}: {file_path}")
         else:
             log_step(f"❌ Missing {description}: {file_path}")
-
+            if description == "Pipeline file":
+                # Pipeline is critical, mark as not all good
+                all_good = False

-    # Test model loading
+    # Test model loading - prioritize pipeline
     try:
         import joblib
         pipeline_path = path_manager.get_pipeline_path()
         if pipeline_path.exists():
             pipeline = joblib.load(pipeline_path)
             test_pred = pipeline.predict(["This is a test text"])
-            log_step(f"✅
+            log_step(f"✅ Pipeline test prediction successful: {test_pred}")
         else:
+            log_step("⚠️ Pipeline not available, testing individual components...")
             model_path = path_manager.get_model_file_path()
             vectorizer_path = path_manager.get_vectorizer_path()
-
-
-
-
-
+            if model_path.exists() and vectorizer_path.exists():
+                model = joblib.load(model_path)
+                vectorizer = joblib.load(vectorizer_path)
+                test_text_vec = vectorizer.transform(["This is a test text"])
+                test_pred = model.predict(test_text_vec)
+                log_step(f"✅ Individual components test prediction successful: {test_pred}")
+            else:
+                log_step("❌ No working model components found")
+                all_good = False
     except Exception as e:
         log_step(f"❌ Model test failed: {e}")
         all_good = False

@@ -441,6 +501,20 @@ def main():
     log_step(f" Available datasets: {sum(env_info['available_datasets'].values())}")
     log_step(f" Available models: {sum(env_info['available_models'].values())}")

+    # Final pipeline check
+    pipeline_path = path_manager.get_pipeline_path()
+    log_step(f"\n🎯 Final Pipeline Check:")
+    log_step(f" Pipeline path: {pipeline_path}")
+    log_step(f" Pipeline exists: {pipeline_path.exists()}")
+    if pipeline_path.exists():
+        try:
+            import joblib
+            pipeline = joblib.load(pipeline_path)
+            log_step(f" Pipeline loadable: ✅")
+            log_step(f" Pipeline steps: {list(pipeline.named_steps.keys())}")
+        except Exception as e:
+            log_step(f" Pipeline load error: {e}")
+
     log_step("\n🎯 System ready for use!")

     return len(failed_steps) == 0
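The net effect of the commit is a save-then-verify workflow: the fitted pipeline is persisted first, its individual steps are saved separately for backward compatibility, and every load path ends in a smoke-test prediction. The sketch below shows that pattern in isolation; it is not the script itself — the paths, the toy training data, and the TfidfVectorizer/LogisticRegression stand-ins are illustrative assumptions, whereas initialize_system.py resolves its paths through path_manager and trains on its own dataset.

from pathlib import Path

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Illustrative locations; the real script gets these from path_manager.
model_dir = Path("models")
model_dir.mkdir(parents=True, exist_ok=True)
pipeline_path = model_dir / "pipeline.joblib"
model_path = model_dir / "model.joblib"
vectorizer_path = model_dir / "vectorizer.joblib"

# Stand-in training step: a tiny text-classification pipeline.
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    ("model", LogisticRegression(max_iter=1000)),
])
texts = ["great product", "terrible product", "really great", "really terrible"]
labels = [1, 0, 1, 0]
pipeline.fit(texts, labels)

# Save the pipeline first, then the individual steps for backward compatibility.
joblib.dump(pipeline, pipeline_path)
joblib.dump(pipeline.named_steps["model"], model_path)
joblib.dump(pipeline.named_steps["vectorizer"], vectorizer_path)

# Verify by reloading and running a smoke-test prediction, as the commit does.
reloaded = joblib.load(pipeline_path)
print("pipeline prediction:", reloaded.predict(["This is a test text"]))

# Fallback the commit adds: rebuild the pipeline from the separate components
# when only the model and vectorizer files exist.
rebuilt = Pipeline([
    ("vectorizer", joblib.load(vectorizer_path)),
    ("model", joblib.load(model_path)),
])
print("rebuilt prediction:", rebuilt.predict(["This is a test text"]))

Treating the single pipeline file as the primary artifact while keeping the split model/vectorizer files as a compatibility fallback is what lets older loading code keep working without making the pipeline optional.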