Ahmedik95316 committed
Commit c678ee1 · 1 Parent(s): 3a989cc

Update initialize_system.py


Update to run model training during initialization so the trained pipeline is available from the first start.
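For context: the commit saves the trained scikit-learn pipeline to /tmp/pipeline.pkl (with /tmp/model.pkl and /tmp/vectorizer.pkl also saved as separate components for compatibility), so downstream code can serve predictions as soon as initialization finishes. A minimal consumer-side sketch (illustrative only, not part of this commit), assuming initialization has run and the artifacts were written successfully:

import joblib

# Load the complete pipeline written by run_initial_training() in the diff
# below. It bundles text preprocessing, TF-IDF vectorization, chi2 feature
# selection, and the classifier, so raw strings go straight into predict().
pipeline = joblib.load("/tmp/pipeline.pkl")

labels = pipeline.predict(["This is a test news article"])
print(labels[0])  # 0 = real news, 1 = fake news, per the dataset's labels

This mirrors the joblib.load and predict() calls that the new validate_installation() performs at the end of the diff. Note that the separately saved model.pkl and vectorizer.pkl do not form a complete prediction path on their own here, since the pipeline also applies preprocessing and SelectKBest between those two steps; loading the full pipeline.pkl is the reliable route.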

Files changed (1): initialize_system.py (+336 -57)
initialize_system.py CHANGED
@@ -12,14 +12,55 @@ def log_step(message):
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
 
 
+def check_model_exists():
+    """Check if trained model already exists"""
+    model_files = [
+        Path("/tmp/pipeline.pkl"),
+        Path("/tmp/model.pkl"),
+        Path("/tmp/vectorizer.pkl"),
+        Path("/tmp/metadata.json")
+    ]
+
+    existing_files = [f for f in model_files if f.exists()]
+
+    if len(existing_files) >= 2:  # At least pipeline + metadata OR model + vectorizer
+        log_step(f"✅ Found {len(existing_files)} existing model files")
+        return True, existing_files
+    else:
+        log_step(f"❌ Missing model files - only found {len(existing_files)}")
+        return False, existing_files
+
+
+def check_training_data_exists():
+    """Check if training data is available"""
+    data_files = [
+        Path("/tmp/data/combined_dataset.csv"),
+        Path("/app/data/combined_dataset.csv"),
+        Path("/tmp/data/kaggle/Fake.csv"),
+        Path("/tmp/data/kaggle/True.csv")
+    ]
+
+    existing_data = [f for f in data_files if f.exists()]
+
+    if existing_data:
+        log_step(f"✅ Found training data: {[str(f) for f in existing_data]}")
+        return True, existing_data
+    else:
+        log_step("❌ No training data found")
+        return False, []
+
+
 def create_directories():
     """Create necessary directories"""
     log_step("Creating directory structure...")
 
     directories = [
         "/tmp/data",
+        "/tmp/data/kaggle",
         "/tmp/model",
-        "/tmp/logs"
+        "/tmp/logs",
+        "/tmp/results",
+        "/tmp/backups"
     ]
 
     for dir_path in directories:
@@ -34,7 +75,10 @@ def copy_original_datasets():
     source_files = [
         ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
         ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
-        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv")
+        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
+        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
+        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
+        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
     ]
 
     copied_count = 0
@@ -60,47 +104,92 @@ def create_minimal_dataset():
         log_step("✅ Combined dataset already exists")
         return True
 
-    # Create minimal training data
+    # Create minimal training data with more samples for better training
     minimal_data = pd.DataFrame({
         'text': [
-            'Scientists discover new species in Amazon rainforest',
-            'SHOCKING: Aliens spotted in Area 51, government confirms existence',
-            'Local authorities report increase in renewable energy adoption',
-            'You won\'t believe what happens when you eat this miracle fruit',
-            'Economic indicators show steady growth in manufacturing sector',
-            'EXCLUSIVE: Celebrity caught in secret alien communication scandal',
-            'Research shows positive effects of meditation on mental health',
-            'Government hiding truth about flat earth, conspiracy theorists claim',
-            'New study reveals benefits of regular exercise for elderly',
-            'BREAKING: Time travel confirmed by underground scientists'
+            # Real news samples
+            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
+            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
+            'Local authorities report significant improvements in air quality following new environmental regulations',
+            'Research published in Nature journal shows promising results for renewable energy storage technology',
+            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
+            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
+            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
+            'Transportation department announces infrastructure improvements to major highways across the region',
+            'Educational institutions implement new digital learning platforms to enhance student engagement',
+            'Agricultural studies reveal improved crop yields through sustainable farming practices',
+            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
+            'Municipal government approves budget for public transportation expansion project in urban areas',
+            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
+            'International trade agreements show positive impact on local businesses and job creation',
+            'Environmental protection agency releases report on water quality improvements in major rivers',
+
+            # Fake news samples
+            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
+            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
+            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
+            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
+            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
+            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
+            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
+            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
+            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
+            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
+            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
+            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
+            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
+            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
+            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
         ],
-        'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]  # 0=Real, 1=Fake
+        'label': [
+            # Real news labels (0)
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            # Fake news labels (1)
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+        ]
     })
 
     minimal_data.to_csv(combined_path, index=False)
-    log_step(f"✅ Created minimal dataset with {len(minimal_data)} samples")
+    log_step(f"✅ Created enhanced minimal dataset with {len(minimal_data)} samples")
+    log_step(f"   - Real news samples: {sum(minimal_data['label'] == 0)}")
+    log_step(f"   - Fake news samples: {sum(minimal_data['label'] == 1)}")
     return True
 
 
 def run_initial_training():
-    """Run basic model training"""
-    log_step("Starting initial model training...")
+    """Run comprehensive model training for first-time setup"""
+    log_step("🚀 Starting comprehensive model training for first-time setup...")
 
     try:
-        # Check if model already exists
-        model_path = Path("/tmp/model.pkl")
-        vectorizer_path = Path("/tmp/vectorizer.pkl")
-
-        if model_path.exists() and vectorizer_path.exists():
-            log_step("✅ Model files already exist")
-            return True
-
-        # Import required libraries
+        # Import training modules
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
-        from sklearn.model_selection import train_test_split
-        from sklearn.metrics import accuracy_score
+        from sklearn.ensemble import RandomForestClassifier
+        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
+        from sklearn.pipeline import Pipeline
+        from sklearn.feature_selection import SelectKBest, chi2
+        from sklearn.preprocessing import FunctionTransformer
+        from sklearn.metrics import accuracy_score, f1_score, classification_report
         import joblib
+        import re
+
+        # Text preprocessing function (same as in train.py)
+        def preprocess_text_function(texts):
+            def clean_single_text(text):
+                text = str(text)
+                text = re.sub(r'http\S+|www\S+|https\S+', '', text)
+                text = re.sub(r'\S+@\S+', '', text)
+                text = re.sub(r'[!]{2,}', '!', text)
+                text = re.sub(r'[?]{2,}', '?', text)
+                text = re.sub(r'[.]{3,}', '...', text)
+                text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
+                text = re.sub(r'\s+', ' ', text)
+                return text.strip().lower()
+
+            processed = []
+            for text in texts:
+                processed.append(clean_single_text(text))
+            return processed
 
         # Load dataset
         dataset_path = Path("/tmp/data/combined_dataset.csv")
@@ -109,7 +198,14 @@ def run_initial_training():
             return False
 
         df = pd.read_csv(dataset_path)
-        log_step(f"Loaded dataset with {len(df)} samples")
+        log_step(f"📊 Loaded dataset with {len(df)} samples")
+
+        # Data validation and cleaning
+        df = df.dropna(subset=['text', 'label'])
+        df = df[df['text'].astype(str).str.len() > 10]
+
+        log_step(f"📊 After cleaning: {len(df)} samples")
+        log_step(f"📊 Class distribution: {df['label'].value_counts().to_dict()}")
 
         # Prepare data
         X = df['text'].values
@@ -120,46 +216,125 @@ def run_initial_training():
             X, y, test_size=0.2, random_state=42, stratify=y
        )
 
-        # Vectorization
+        log_step(f"📊 Data split: {len(X_train)} train, {len(X_test)} test")
+
+        # Create comprehensive pipeline
+        text_preprocessor = FunctionTransformer(
+            func=preprocess_text_function,
+            validate=False
+        )
+
        vectorizer = TfidfVectorizer(
            max_features=5000,
+            min_df=1,
+            max_df=0.95,
+            ngram_range=(1, 2),
            stop_words='english',
-            ngram_range=(1, 2)
+            sublinear_tf=True,
+            norm='l2'
+        )
+
+        feature_selector = SelectKBest(
+            score_func=chi2,
+            k=2000
        )
-        X_train_vec = vectorizer.fit_transform(X_train)
-        X_test_vec = vectorizer.transform(X_test)
 
-        # Train model
-        model = LogisticRegression(max_iter=1000, random_state=42)
-        model.fit(X_train_vec, y_train)
+        # Create pipeline with Logistic Regression
+        pipeline = Pipeline([
+            ('preprocess', text_preprocessor),
+            ('vectorize', vectorizer),
+            ('feature_select', feature_selector),
+            ('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
+        ])
+
+        log_step("🔧 Training model with optimized pipeline...")
+
+        # Hyperparameter tuning for datasets with sufficient samples
+        if len(X_train) >= 20:
+            log_step("⚙️ Performing hyperparameter tuning...")
+            param_grid = {
+                'model__C': [0.1, 1, 10],
+                'model__penalty': ['l2']
+            }
+
+            cv_folds = max(2, min(3, len(X_train) // 10))
+            grid_search = GridSearchCV(
+                pipeline,
+                param_grid,
+                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
+                scoring='f1_weighted',
+                n_jobs=1
+            )
+
+            grid_search.fit(X_train, y_train)
+            best_pipeline = grid_search.best_estimator_
+
+            log_step(f"✅ Best parameters: {grid_search.best_params_}")
+            log_step(f"✅ Best CV score: {grid_search.best_score_:.4f}")
+        else:
+            log_step("⚙️ Using simple training for small dataset...")
+            pipeline.fit(X_train, y_train)
+            best_pipeline = pipeline
 
-        # Evaluate
-        y_pred = model.predict(X_test_vec)
+        # Evaluate model
+        y_pred = best_pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
+        f1 = f1_score(y_test, y_pred, average='weighted')
 
-        # Save model
-        joblib.dump(model, "/tmp/model.pkl")
-        joblib.dump(vectorizer, "/tmp/vectorizer.pkl")
+        log_step(f"📈 Model Performance:")
+        log_step(f"   - Accuracy: {accuracy:.4f}")
+        log_step(f"   - F1 Score: {f1:.4f}")
 
-        # Save metadata
+        # Save model artifacts
+        log_step("💾 Saving model artifacts...")
+
+        # Save the complete pipeline
+        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
+        log_step("✅ Saved complete pipeline")
+
+        # Save individual components for compatibility
+        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
+        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
+        log_step("✅ Saved individual model components")
+
+        # Generate comprehensive metadata
        metadata = {
-            "model_version": "v1.0_init",
-            "test_accuracy": float(accuracy),
+            "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+            "model_type": "logistic_regression",
+            "training_method": "initial_setup",
+            "dataset_size": len(df),
            "train_size": len(X_train),
            "test_size": len(X_test),
+            "test_accuracy": float(accuracy),
+            "test_f1": float(f1),
+            "hyperparameter_tuning": len(X_train) >= 20,
+            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
+            "class_distribution": df['label'].value_counts().to_dict(),
+            "training_config": {
+                "max_features": 5000,
+                "ngram_range": [1, 2],
+                "feature_selection_k": 2000,
+                "test_size": 0.2
+            },
            "timestamp": datetime.now().isoformat(),
-            "training_method": "initialization"
+            "initialization_notes": "Model trained during system initialization",
+            "ready_for_production": True
        }
 
+        # Save metadata
        with open("/tmp/metadata.json", 'w') as f:
            json.dump(metadata, f, indent=2)
 
-        log_step(
-            f"✅ Training completed successfully, accuracy: {accuracy:.4f}")
+        log_step("✅ Saved comprehensive metadata")
+        log_step(f"🎉 Initial model training completed successfully!")
+        log_step(f"📊 Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
+
        return True
 
    except Exception as e:
        log_step(f"❌ Training failed: {str(e)}")
+        import traceback
+        log_step(f"🔍 Error details: {traceback.format_exc()}")
        return False
 
 
@@ -171,16 +346,24 @@ def create_initial_logs():
    # Activity log
    activity_log = [{
        "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
-        "event": "System initialized successfully"
+        "event": "System initialized successfully with trained model",
+        "level": "INFO"
    }]
 
    with open("/tmp/activity_log.json", 'w') as f:
        json.dump(activity_log, f, indent=2)
 
    # Create empty monitoring logs
+    log_dirs = ["/tmp/logs"]
+    for log_dir in log_dirs:
+        Path(log_dir).mkdir(parents=True, exist_ok=True)
+
    with open("/tmp/logs/monitoring_log.json", 'w') as f:
        json.dump([], f)
 
+    with open("/tmp/logs/scheduler_execution.json", 'w') as f:
+        json.dump([], f)
+
    log_step("✅ Initial log files created")
    return True
 
@@ -189,22 +372,98 @@ def create_initial_logs():
        return False
 
 
+def validate_installation():
+    """Validate that the system is properly set up"""
+    log_step("🔍 Validating system installation...")
+
+    validation_checks = []
+
+    # Check model files
+    model_exists, model_files = check_model_exists()
+    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))
+
+    # Check data files
+    data_exists, data_files = check_training_data_exists()
+    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))
+
+    # Check directories
+    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
+    dirs_exist = all(Path(d).exists() for d in required_dirs)
+    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))
+
+    # Check logs
+    log_exists = Path("/tmp/activity_log.json").exists()
+    validation_checks.append(("Log Files", log_exists, "Activity log created"))
+
+    # Test model loading
+    model_loadable = False
+    try:
+        import joblib
+        pipeline = joblib.load("/tmp/pipeline.pkl")
+        test_prediction = pipeline.predict(["This is a test news article"])
+        model_loadable = True
+        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
+    except Exception as e:
+        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))
+
+    # Print validation results
+    log_step("📋 Validation Results:")
+    all_passed = True
+    for check_name, passed, details in validation_checks:
+        status = "✅ PASS" if passed else "❌ FAIL"
+        log_step(f"   {status} {check_name}: {details}")
+        if not passed:
+            all_passed = False
+
+    return all_passed, validation_checks
+
+
 def main():
-    """Main initialization function"""
-    log_step("🚀 Starting system initialization...")
+    """Main initialization function with smart training logic"""
+    log_step("🚀 Starting intelligent system initialization...")
+
+    # Check if model already exists
+    model_exists, existing_model_files = check_model_exists()
+
+    if model_exists:
+        log_step("🎯 EXISTING INSTALLATION DETECTED")
+        log_step("📄 Found existing model files - skipping training")
+
+        # Load existing metadata to show info
+        try:
+            with open("/tmp/metadata.json", 'r') as f:
+                metadata = json.load(f)
+
+            log_step(f"📊 Existing Model Info:")
+            log_step(f"   - Version: {metadata.get('model_version', 'Unknown')}")
+            log_step(f"   - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
+            log_step(f"   - F1 Score: {metadata.get('test_f1', 'Unknown')}")
+            log_step(f"   - Created: {metadata.get('timestamp', 'Unknown')}")
+
+        except Exception as e:
+            log_step(f"⚠️ Could not read existing metadata: {e}")
+
+    else:
+        log_step("🆕 FIRST-TIME INSTALLATION DETECTED")
+        log_step("🔧 No existing model found - will train new model")
 
+    # Run initialization steps
    steps = [
        ("Directory Creation", create_directories),
        ("Dataset Copy", copy_original_datasets),
-        ("Minimal Dataset", create_minimal_dataset),
-        ("Model Training", run_initial_training),
+        ("Dataset Preparation", create_minimal_dataset),
        ("Log Creation", create_initial_logs)
    ]
 
+    # Add training step only if model doesn't exist
+    if not model_exists:
+        steps.insert(-1, ("🤖 Model Training", run_initial_training))
+
    failed_steps = []
 
    for step_name, step_function in steps:
        try:
+            log_step(f"▶️ Starting: {step_name}")
            if step_function():
                log_step(f"✅ {step_name} completed")
            else:
@@ -214,15 +473,35 @@ def main():
            log_step(f"❌ {step_name} failed: {str(e)}")
            failed_steps.append(step_name)
 
+    # Final validation
+    log_step("🔍 Running final system validation...")
+    validation_passed, validation_results = validate_installation()
+
+    # Summary
+    log_step("=" * 60)
    if failed_steps:
-        log_step(
-            f"⚠️ Initialization completed with {len(failed_steps)} failed steps")
-        log_step(f"Failed: {', '.join(failed_steps)}")
+        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
+        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
    else:
        log_step("🎉 System initialization completed successfully!")
 
-    log_step("System ready for use!")
+    if validation_passed:
+        log_step("✅ All validation checks passed!")
+        log_step("🚀 System is ready for use!")
+
+        if not model_exists:
+            log_step("🤖 NEW MODEL TRAINED AND READY")
+            log_step("📊 You can now start making predictions!")
+        else:
+            log_step("🔄 EXISTING MODEL VALIDATED AND READY")
+            log_step("📊 System restored from previous installation!")
+
+    else:
+        log_step("❌ Some validation checks failed")
+        log_step("🔧 Manual intervention may be required")
+
+    log_step("=" * 60)
 
 
 if __name__ == "__main__":
-    main()
+    main()
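A usage note on the new preprocess_text_function step: it normalizes exactly the sensationalist patterns the fake-news samples lean on (URLs, email addresses, runs of exclamation marks, long ellipses). A self-contained sketch of the same cleaning logic, copied from the hunk above with an illustrative input so its behavior can be checked in isolation:

import re

def preprocess_text_function(texts):
    """Standalone copy of the cleaning helper added in run_initial_training()."""
    def clean_single_text(text):
        text = str(text)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # strip URLs
        text = re.sub(r'\S+@\S+', '', text)                  # strip email addresses
        text = re.sub(r'[!]{2,}', '!', text)                 # collapse !!! runs
        text = re.sub(r'[?]{2,}', '?', text)                 # collapse ??? runs
        text = re.sub(r'[.]{3,}', '...', text)               # normalize long ellipses
        text = re.sub(r'[^a-zA-Z\s.!?]', '', text)           # drop digits and other symbols
        text = re.sub(r'\s+', ' ', text)                     # collapse whitespace
        return text.strip().lower()
    return [clean_single_text(t) for t in texts]

print(preprocess_text_function(
    ["SHOCKING!!! Aliens at http://example.com, contact me@ufo.org"]
))
# -> ['shocking! aliens at contact']

One design consideration: in the commit the helper is defined inside run_initial_training() and then wrapped in a FunctionTransformer inside the pickled pipeline. The standard pickler generally cannot serialize functions defined inside another function, so a module-level copy like the sketch above is the usual way to keep a FunctionTransformer-based pipeline dumpable and reloadable across processes.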