Ahmedik95316 committed on
Commit
44dceca
·
verified ·
1 Parent(s): f984f56

Update initialize_system.py

Browse files
Files changed (1) hide show
  1. initialize_system.py +17 -88
initialize_system.py CHANGED
@@ -1,19 +1,10 @@
1
  import os
2
  import sys
3
- import json
4
- import joblib
5
  import shutil
6
  import pandas as pd
 
7
  from pathlib import Path
8
  from datetime import datetime
9
- from sklearn.pipeline import Pipeline
10
- from model.train import EnhancedModelTrainer
11
- from sklearn.model_selection import cross_validate
12
- from sklearn.linear_model import LogisticRegression
13
- from sklearn.model_selection import train_test_split
14
- from sklearn.metrics import accuracy_score, f1_score
15
- from sklearn.feature_extraction.text import TfidfVectorizer
16
-
17
 
18
  # Import the new path manager
19
  try:
@@ -188,9 +179,6 @@ def create_minimal_dataset():
188
  return False
189
 
190
 
191
-
192
-
193
-
194
  def run_initial_training():
195
  """Run basic model training"""
196
  log_step("Starting initial model training...")
@@ -236,7 +224,12 @@ def run_initial_training():
236
  return True
237
 
238
  # Import required libraries
239
-
 
 
 
 
 
240
 
241
  # Load dataset
242
  dataset_path = path_manager.get_combined_dataset_path()
@@ -281,67 +274,22 @@ def run_initial_training():
281
  ))
282
  ])
283
 
284
- # Train model with cross-validation
285
- log_step("Training model with cross-validation...")
286
-
287
- # Perform cross-validation before final training
288
- cv_results = cross_validate(
289
- pipeline, X_train, y_train,
290
- cv=3,
291
- scoring=['accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted'],
292
- return_train_score=True
293
- )
294
-
295
- # Train final model on all training data
296
  pipeline.fit(X_train, y_train)
297
-
298
- # Evaluate on test set
299
  y_pred = pipeline.predict(X_test)
300
  accuracy = accuracy_score(y_test, y_pred)
301
  f1 = f1_score(y_test, y_pred, average='weighted')
302
-
303
- # Save CV results for API access
304
- cv_data = {
305
- "n_splits": 3,
306
- "test_scores": {
307
- "accuracy": {
308
- "mean": float(cv_results['test_accuracy'].mean()),
309
- "std": float(cv_results['test_accuracy'].std()),
310
- "scores": cv_results['test_accuracy'].tolist()
311
- },
312
- "f1": {
313
- "mean": float(cv_results['test_f1_weighted'].mean()),
314
- "std": float(cv_results['test_f1_weighted'].std()),
315
- "scores": cv_results['test_f1_weighted'].tolist()
316
- }
317
- },
318
- "train_scores": {
319
- "accuracy": {
320
- "mean": float(cv_results['train_accuracy'].mean()),
321
- "std": float(cv_results['train_accuracy'].std()),
322
- "scores": cv_results['train_accuracy'].tolist()
323
- },
324
- "f1": {
325
- "mean": float(cv_results['train_f1_weighted'].mean()),
326
- "std": float(cv_results['train_f1_weighted'].std()),
327
- "scores": cv_results['train_f1_weighted'].tolist()
328
- }
329
- }
330
- }
331
-
332
- # Save CV results to file
333
- cv_results_path = path_manager.get_logs_path("cv_results.json")
334
- with open(cv_results_path, 'w') as f:
335
- json.dump(cv_data, f, indent=2)
336
- log_step(f"Saved CV results to: {cv_results_path}")
337
-
338
  # Ensure model directory exists
339
  model_path.parent.mkdir(parents=True, exist_ok=True)
340
-
341
  # Save complete pipeline FIRST (this is the priority)
342
  log_step(f"Saving pipeline to: {pipeline_path}")
343
  joblib.dump(pipeline, pipeline_path)
344
-
345
  # Verify pipeline was saved
346
  if pipeline_path.exists():
347
  log_step(f"✅ Pipeline saved successfully to {pipeline_path}")
@@ -378,12 +326,7 @@ def run_initial_training():
378
  "data_path": str(dataset_path),
379
  "class_distribution": class_counts.to_dict(),
380
  "pipeline_created": pipeline_path.exists(),
381
- "individual_components_created": model_path.exists() and vectorizer_path.exists(),
382
- # Add CV results to metadata
383
- "cv_f1_mean": float(cv_results['test_f1_weighted'].mean()),
384
- "cv_f1_std": float(cv_results['test_f1_weighted'].std()),
385
- "cv_accuracy_mean": float(cv_results['test_accuracy'].mean()),
386
- "cv_accuracy_std": float(cv_results['test_accuracy'].std())
387
  }
388
 
389
  metadata_path = path_manager.get_metadata_path()
@@ -445,20 +388,6 @@ def create_initial_logs():
445
  json.dump([], f)
446
  log_step(f"✅ Created {log_file}")
447
 
448
- # Create monitoring directory structure
449
- monitor_dir = path_manager.get_logs_path("monitor")
450
- monitor_dir.mkdir(parents=True, exist_ok=True)
451
- log_step(f"✅ Created monitoring directory: {monitor_dir}")
452
-
453
- # Create empty monitoring log files
454
- monitor_files = ["predictions.json", "metrics.json", "alerts.json"]
455
- for monitor_file in monitor_files:
456
- monitor_path = monitor_dir / monitor_file
457
- if not monitor_path.exists():
458
- with open(monitor_path, 'w') as f:
459
- json.dump([], f)
460
- log_step(f"✅ Created {monitor_file}")
461
-
462
  return True
463
 
464
  except Exception as e:
@@ -591,7 +520,7 @@ def main():
591
  return len(failed_steps) == 0
592
 
593
 
594
-
595
  if __name__ == "__main__":
596
  success = main()
597
- sys.exit(0)
 
 
1
  import os
2
  import sys
 
 
3
  import shutil
4
  import pandas as pd
5
+ import json
6
  from pathlib import Path
7
  from datetime import datetime
 
 
 
 
 
 
 
 
8
 
9
  # Import the new path manager
10
  try:
 
179
  return False
180
 
181
 
 
 
 
182
  def run_initial_training():
183
  """Run basic model training"""
184
  log_step("Starting initial model training...")
 
224
  return True
225
 
226
  # Import required libraries
227
+ from sklearn.feature_extraction.text import TfidfVectorizer
228
+ from sklearn.linear_model import LogisticRegression
229
+ from sklearn.model_selection import train_test_split
230
+ from sklearn.metrics import accuracy_score, f1_score
231
+ from sklearn.pipeline import Pipeline
232
+ import joblib
233
 
234
  # Load dataset
235
  dataset_path = path_manager.get_combined_dataset_path()
 
274
  ))
275
  ])
276
 
277
+ # Train model
278
+ log_step("Training model...")
 
 
 
 
 
 
 
 
 
 
279
  pipeline.fit(X_train, y_train)
280
+
281
+ # Evaluate
282
  y_pred = pipeline.predict(X_test)
283
  accuracy = accuracy_score(y_test, y_pred)
284
  f1 = f1_score(y_test, y_pred, average='weighted')
285
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  # Ensure model directory exists
287
  model_path.parent.mkdir(parents=True, exist_ok=True)
288
+
289
  # Save complete pipeline FIRST (this is the priority)
290
  log_step(f"Saving pipeline to: {pipeline_path}")
291
  joblib.dump(pipeline, pipeline_path)
292
+
293
  # Verify pipeline was saved
294
  if pipeline_path.exists():
295
  log_step(f"✅ Pipeline saved successfully to {pipeline_path}")
 
326
  "data_path": str(dataset_path),
327
  "class_distribution": class_counts.to_dict(),
328
  "pipeline_created": pipeline_path.exists(),
329
+ "individual_components_created": model_path.exists() and vectorizer_path.exists()
 
 
 
 
 
330
  }
331
 
332
  metadata_path = path_manager.get_metadata_path()
 
388
  json.dump([], f)
389
  log_step(f"✅ Created {log_file}")
390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  return True
392
 
393
  except Exception as e:
 
520
  return len(failed_steps) == 0
521
 
522
 
 
523
  if __name__ == "__main__":
524
  success = main()
525
+ if not success:
526
+ sys.exit(1)