Ahmedik95316 committed
Commit 9702556 · 1 Parent(s): b9a8a05

Update initialize_system.py

Files changed (1):
  1. initialize_system.py  +91 -17
initialize_system.py CHANGED
@@ -184,13 +184,43 @@ def run_initial_training():
     log_step("Starting initial model training...")
 
     try:
-        # Check if model already exists
+        # Get all the paths
         model_path = path_manager.get_model_file_path()
         vectorizer_path = path_manager.get_vectorizer_path()
         pipeline_path = path_manager.get_pipeline_path()
 
+        log_step(f"Model path: {model_path}")
+        log_step(f"Vectorizer path: {vectorizer_path}")
+        log_step(f"Pipeline path: {pipeline_path}")
+
+        # Check if model already exists
         if pipeline_path.exists() or (model_path.exists() and vectorizer_path.exists()):
-            log_step("✅ Model files already exist")
+            log_step("✅ Model files already exist, checking if pipeline needs to be created...")
+
+            # If individual components exist but pipeline doesn't, create pipeline
+            if model_path.exists() and vectorizer_path.exists() and not pipeline_path.exists():
+                log_step("Creating pipeline from existing components...")
+                try:
+                    import joblib
+                    from sklearn.pipeline import Pipeline
+
+                    # Load existing components
+                    model = joblib.load(model_path)
+                    vectorizer = joblib.load(vectorizer_path)
+
+                    # Create pipeline
+                    pipeline = Pipeline([
+                        ('vectorizer', vectorizer),
+                        ('model', model)
+                    ])
+
+                    # Save pipeline
+                    joblib.dump(pipeline, pipeline_path)
+                    log_step(f"✅ Created pipeline from existing components: {pipeline_path}")
+
+                except Exception as e:
+                    log_step(f"⚠️ Failed to create pipeline from existing components: {e}")
+
             return True
 
         # Import required libraries
@@ -253,14 +283,34 @@ def run_initial_training():
         accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')
 
-        # Save complete pipeline
+        # Ensure model directory exists
+        model_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Save complete pipeline FIRST (this is the priority)
+        log_step(f"Saving pipeline to: {pipeline_path}")
         joblib.dump(pipeline, pipeline_path)
-        log_step(f"✅ Saved pipeline to {pipeline_path}")
+
+        # Verify pipeline was saved
+        if pipeline_path.exists():
+            log_step(f"✅ Pipeline saved successfully to {pipeline_path}")
+
+            # Test loading the pipeline
+            try:
+                test_pipeline = joblib.load(pipeline_path)
+                test_pred = test_pipeline.predict(["This is a test"])
+                log_step(f"✅ Pipeline verification successful: {test_pred}")
+            except Exception as e:
+                log_step(f"⚠️ Pipeline verification failed: {e}")
+        else:
+            log_step(f"❌ Pipeline was not saved to {pipeline_path}")
 
         # Save individual components for backward compatibility
-        joblib.dump(pipeline.named_steps['model'], model_path)
-        joblib.dump(pipeline.named_steps['vectorizer'], vectorizer_path)
-        log_step(f"✅ Saved individual components")
+        try:
+            joblib.dump(pipeline.named_steps['model'], model_path)
+            joblib.dump(pipeline.named_steps['vectorizer'], vectorizer_path)
+            log_step(f"✅ Saved individual components")
+        except Exception as e:
+            log_step(f"⚠️ Failed to save individual components: {e}")
 
         # Save metadata
         metadata = {
@@ -274,7 +324,9 @@ def run_initial_training():
             "training_method": "initialization",
             "environment": path_manager.environment,
             "data_path": str(dataset_path),
-            "class_distribution": class_counts.to_dict()
+            "class_distribution": class_counts.to_dict(),
+            "pipeline_created": pipeline_path.exists(),
+            "individual_components_created": model_path.exists() and vectorizer_path.exists()
         }
 
         metadata_path = path_manager.get_metadata_path()
@@ -284,9 +336,9 @@ def run_initial_training():
         log_step(f"✅ Training completed successfully")
         log_step(f"  Accuracy: {accuracy:.4f}")
         log_step(f"  F1 Score: {f1:.4f}")
+        log_step(f"  Pipeline saved: {pipeline_path.exists()}")
         log_step(f"  Model saved to: {model_path}")
         log_step(f"  Vectorizer saved to: {vectorizer_path}")
-        log_step(f"  Pipeline saved to: {pipeline_path}")
 
         return True
 
@@ -352,6 +404,7 @@ def verify_system():
        (path_manager.get_combined_dataset_path(), "Combined dataset"),
        (path_manager.get_model_file_path(), "Model file"),
        (path_manager.get_vectorizer_path(), "Vectorizer file"),
+       (path_manager.get_pipeline_path(), "Pipeline file"),
        (path_manager.get_metadata_path(), "Metadata file"),
        (path_manager.get_activity_log_path(), "Activity log")
     ]
@@ -362,24 +415,31 @@ def verify_system():
             log_step(f"✅ {description}: {file_path}")
         else:
             log_step(f"❌ Missing {description}: {file_path}")
-            all_good = False
+            if description == "Pipeline file":
+                # Pipeline is critical, mark as not all good
+                all_good = False
 
-    # Test model loading
+    # Test model loading - prioritize pipeline
     try:
         import joblib
         pipeline_path = path_manager.get_pipeline_path()
         if pipeline_path.exists():
             pipeline = joblib.load(pipeline_path)
             test_pred = pipeline.predict(["This is a test text"])
-            log_step(f"✅ Model test prediction successful: {test_pred}")
+            log_step(f"✅ Pipeline test prediction successful: {test_pred}")
         else:
+            log_step("⚠️ Pipeline not available, testing individual components...")
             model_path = path_manager.get_model_file_path()
             vectorizer_path = path_manager.get_vectorizer_path()
-            model = joblib.load(model_path)
-            vectorizer = joblib.load(vectorizer_path)
-            test_text_vec = vectorizer.transform(["This is a test text"])
-            test_pred = model.predict(test_text_vec)
-            log_step(f"✅ Model component test prediction successful: {test_pred}")
+            if model_path.exists() and vectorizer_path.exists():
+                model = joblib.load(model_path)
+                vectorizer = joblib.load(vectorizer_path)
+                test_text_vec = vectorizer.transform(["This is a test text"])
+                test_pred = model.predict(test_text_vec)
+                log_step(f"✅ Individual components test prediction successful: {test_pred}")
+            else:
+                log_step("❌ No working model components found")
+                all_good = False
     except Exception as e:
         log_step(f"❌ Model test failed: {e}")
         all_good = False
@@ -441,6 +501,20 @@ def main():
     log_step(f"  Available datasets: {sum(env_info['available_datasets'].values())}")
     log_step(f"  Available models: {sum(env_info['available_models'].values())}")
 
+    # Final pipeline check
+    pipeline_path = path_manager.get_pipeline_path()
+    log_step(f"\n🎯 Final Pipeline Check:")
+    log_step(f"  Pipeline path: {pipeline_path}")
+    log_step(f"  Pipeline exists: {pipeline_path.exists()}")
+    if pipeline_path.exists():
+        try:
+            import joblib
+            pipeline = joblib.load(pipeline_path)
+            log_step(f"  Pipeline loadable: ✅")
+            log_step(f"  Pipeline steps: {list(pipeline.named_steps.keys())}")
+        except Exception as e:
+            log_step(f"  Pipeline load error: {e}")
+
     log_step("\n🎯 System ready for use!")
 
     return len(failed_steps) == 0
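
For reference, a minimal standalone sketch of the rebuild-and-verify pattern these hunks implement, assuming scikit-learn and joblib are installed; the file names below are hypothetical placeholders, not the paths returned by path_manager:

import joblib
from sklearn.pipeline import Pipeline

# Load previously saved components (hypothetical file names)
vectorizer = joblib.load("vectorizer.pkl")
model = joblib.load("model.pkl")

# Wrap them in a single pipeline and persist it
pipeline = Pipeline([("vectorizer", vectorizer), ("model", model)])
joblib.dump(pipeline, "pipeline.pkl")

# Reload and run a smoke-test prediction, mirroring the verification step above
reloaded = joblib.load("pipeline.pkl")
print(reloaded.predict(["This is a test text"]))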