Ahmedik95316 committed on
Commit e847844 Β· 1 Parent(s): 9666aeb

Update initialize_system.py

Files changed (1)
  1. initialize_system.py +297 -374
initialize_system.py CHANGED
@@ -6,88 +6,109 @@ import json
 from pathlib import Path
 from datetime import datetime
 
 
 def log_step(message):
     """Log initialization steps"""
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
 
 
-def check_model_exists():
-    """Check if trained model already exists"""
-    model_files = [
-        Path("/tmp/pipeline.pkl"),
-        Path("/tmp/model.pkl"),
-        Path("/tmp/vectorizer.pkl"),
-        Path("/tmp/metadata.json")
-    ]
-
-    existing_files = [f for f in model_files if f.exists()]
-
-    if len(existing_files) >= 2:  # At least pipeline + metadata OR model + vectorizer
-        log_step(f"βœ… Found {len(existing_files)} existing model files")
-        return True, existing_files
-    else:
-        log_step(f"❌ Missing model files - only found {len(existing_files)}")
-        return False, existing_files
-
-
-def check_training_data_exists():
-    """Check if training data is available"""
-    data_files = [
-        Path("/tmp/data/combined_dataset.csv"),
-        Path("/app/data/combined_dataset.csv"),
-        Path("/tmp/data/kaggle/Fake.csv"),
-        Path("/tmp/data/kaggle/True.csv")
-    ]
-
-    existing_data = [f for f in data_files if f.exists()]
-
-    if existing_data:
-        log_step(f"βœ… Found training data: {[str(f) for f in existing_data]}")
-        return True, existing_data
-    else:
-        log_step("❌ No training data found")
-        return False, []
-
-
 def create_directories():
     """Create necessary directories"""
     log_step("Creating directory structure...")
 
     directories = [
-        "/tmp/data",
-        "/tmp/data/kaggle",
-        "/tmp/model",
-        "/tmp/logs",
-        "/tmp/results",
-        "/tmp/backups"
     ]
 
     for dir_path in directories:
-        Path(dir_path).mkdir(parents=True, exist_ok=True)
-        log_step(f"βœ… Created {dir_path}")
 
 
-def copy_original_datasets():
-    """Copy original datasets from /app to /tmp"""
-    log_step("Copying original datasets...")
 
-    source_files = [
-        ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
-        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
-        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
-        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
-        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
-        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
     ]
 
     copied_count = 0
-    for source, dest in source_files:
-        if Path(source).exists():
-            Path(dest).parent.mkdir(parents=True, exist_ok=True)
-            shutil.copy(source, dest)
-            log_step(f"βœ… Copied {source} to {dest}")
-            copied_count += 1
         else:
             log_step(f"⚠️ Source file not found: {source}")
 
@@ -95,268 +116,184 @@ def copy_original_datasets():
 
 
 def create_minimal_dataset():
-    """Create a minimal dataset if original doesn't exist"""
     log_step("Creating minimal dataset...")
 
-    combined_path = Path("/tmp/data/combined_dataset.csv")
 
     if combined_path.exists():
-        log_step("βœ… Combined dataset already exists")
         return True
 
-    # Create minimal training data with more samples for better training
-    minimal_data = pd.DataFrame({
-        'text': [
-            # Real news samples
-            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
-            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
-            'Local authorities report significant improvements in air quality following new environmental regulations',
-            'Research published in Nature journal shows promising results for renewable energy storage technology',
-            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
-            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
-            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
-            'Transportation department announces infrastructure improvements to major highways across the region',
-            'Educational institutions implement new digital learning platforms to enhance student engagement',
-            'Agricultural studies reveal improved crop yields through sustainable farming practices',
-            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
-            'Municipal government approves budget for public transportation expansion project in urban areas',
-            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
-            'International trade agreements show positive impact on local businesses and job creation',
-            'Environmental protection agency releases report on water quality improvements in major rivers',
 
-            # Fake news samples
-            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
-            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
-            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
-            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
-            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
-            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
-            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
-            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
-            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
-            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
-            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
-            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
-            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
-            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
-            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
-        ],
-        'label': [
-            # Real news labels (0)
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            # Fake news labels (1)
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-        ]
-    })
-
-    minimal_data.to_csv(combined_path, index=False)
-    log_step(f"βœ… Created enhanced minimal dataset with {len(minimal_data)} samples")
-    log_step(f" - Real news samples: {sum(minimal_data['label'] == 0)}")
-    log_step(f" - Fake news samples: {sum(minimal_data['label'] == 1)}")
-    return True
 
 
 def run_initial_training():
-    """Run comprehensive model training for first-time setup"""
-    log_step("πŸš€ Starting comprehensive model training for first-time setup...")
 
     try:
-        # Import training modules
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
-        from sklearn.ensemble import RandomForestClassifier
-        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
         from sklearn.pipeline import Pipeline
-        from sklearn.feature_selection import SelectKBest, chi2
-        from sklearn.preprocessing import FunctionTransformer
-        from sklearn.metrics import accuracy_score, f1_score, classification_report
         import joblib
-        import re
-
-        # Text preprocessing function (same as in train.py)
-        def preprocess_text_function(texts):
-            def clean_single_text(text):
-                text = str(text)
-                text = re.sub(r'http\S+|www\S+|https\S+', '', text)
-                text = re.sub(r'\S+@\S+', '', text)
-                text = re.sub(r'[!]{2,}', '!', text)
-                text = re.sub(r'[?]{2,}', '?', text)
-                text = re.sub(r'[.]{3,}', '...', text)
-                text = re.sub(r'[^a-zA-Z\s.!?]', '', text)
-                text = re.sub(r'\s+', ' ', text)
-                return text.strip().lower()
-
-            processed = []
-            for text in texts:
-                processed.append(clean_single_text(text))
-            return processed
 
         # Load dataset
-        dataset_path = Path("/tmp/data/combined_dataset.csv")
         if not dataset_path.exists():
             log_step("❌ No dataset available for training")
             return False
 
         df = pd.read_csv(dataset_path)
-        log_step(f"πŸ“Š Loaded dataset with {len(df)} samples")
 
-        # Data validation and cleaning
-        df = df.dropna(subset=['text', 'label'])
-        df = df[df['text'].astype(str).str.len() > 10]
-
-        log_step(f"πŸ“Š After cleaning: {len(df)} samples")
-        log_step(f"πŸ“Š Class distribution: {df['label'].value_counts().to_dict()}")
 
         # Prepare data
         X = df['text'].values
         y = df['label'].values
 
         # Train-test split
         X_train, X_test, y_train, y_test = train_test_split(
-            X, y, test_size=0.2, random_state=42, stratify=y
-        )
-
-        log_step(f"πŸ“Š Data split: {len(X_train)} train, {len(X_test)} test")
-
-        # Create comprehensive pipeline
-        text_preprocessor = FunctionTransformer(
-            func=preprocess_text_function,
-            validate=False
         )
 
-        vectorizer = TfidfVectorizer(
-            max_features=5000,
-            min_df=1,
-            max_df=0.95,
-            ngram_range=(1, 2),
-            stop_words='english',
-            sublinear_tf=True,
-            norm='l2'
-        )
-
-        feature_selector = SelectKBest(
-            score_func=chi2,
-            k=2000
-        )
-
-        # Create pipeline with Logistic Regression
         pipeline = Pipeline([
-            ('vectorize', TfidfVectorizer(
-                max_features=10000,
-                min_df=2,
-                max_df=0.95,
-                ngram_range=(1, 2),
                 stop_words='english',
-                lowercase=True,
-                strip_accents='ascii'
             )),
             ('model', LogisticRegression(
-                max_iter=1000,
-                class_weight='balanced',
-                random_state=42
             ))
         ])
-
-        # Fit and save
         pipeline.fit(X_train, y_train)
-
         # Evaluate
         y_pred = pipeline.predict(X_test)
         accuracy = accuracy_score(y_test, y_pred)
-
-        # Save artifacts
-        joblib.dump(pipeline, "/tmp/pipeline.pkl")
-        joblib.dump(pipeline.named_steps['model'], "/tmp/model.pkl")
-        joblib.dump(pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
-
-        log_step("πŸ”§ Training model with optimized pipeline...")
-
-        # Hyperparameter tuning for datasets with sufficient samples
-        if len(X_train) >= 20:
-            log_step("βš™οΈ Performing hyperparameter tuning...")
-            param_grid = {
-                'model__C': [0.1, 1, 10],
-                'model__penalty': ['l2']
-            }
-
-            cv_folds = max(2, min(3, len(X_train) // 10))
-            grid_search = GridSearchCV(
-                pipeline,
-                param_grid,
-                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
-                scoring='f1_weighted',
-                n_jobs=1
-            )
-
-            grid_search.fit(X_train, y_train)
-            best_pipeline = grid_search.best_estimator_
-
-            log_step(f"βœ… Best parameters: {grid_search.best_params_}")
-            log_step(f"βœ… Best CV score: {grid_search.best_score_:.4f}")
-        else:
-            log_step("βš™οΈ Using simple training for small dataset...")
-            pipeline.fit(X_train, y_train)
-            best_pipeline = pipeline
-
-        # Evaluate model
-        y_pred = best_pipeline.predict(X_test)
-        accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')
 
-        log_step(f"πŸ“ˆ Model Performance:")
-        log_step(f" - Accuracy: {accuracy:.4f}")
-        log_step(f" - F1 Score: {f1:.4f}")
-
-        # Save model artifacts
-        log_step("πŸ’Ύ Saving model artifacts...")
 
-        # Save the complete pipeline
-        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
-        log_step("βœ… Saved complete pipeline")
 
-        # Save individual components for compatibility
-        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
-        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
-        log_step("βœ… Saved individual model components")
-
-        # Generate comprehensive metadata
         metadata = {
-            "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
-            "model_type": "logistic_regression",
-            "training_method": "initial_setup",
-            "dataset_size": len(df),
-            "train_size": len(X_train),
-            "test_size": len(X_test),
            "test_accuracy": float(accuracy),
            "test_f1": float(f1),
-            "hyperparameter_tuning": len(X_train) >= 20,
-            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
-            "class_distribution": df['label'].value_counts().to_dict(),
-            "training_config": {
-                "max_features": 5000,
-                "ngram_range": [1, 2],
-                "feature_selection_k": 2000,
-                "test_size": 0.2
-            },
            "timestamp": datetime.now().isoformat(),
-            "initialization_notes": "Model trained during system initialization",
-            "ready_for_production": True
         }
 
-        # Save metadata
-        with open("/tmp/metadata.json", 'w') as f:
             json.dump(metadata, f, indent=2)
 
-        log_step("βœ… Saved comprehensive metadata")
-        log_step(f"πŸŽ‰ Initial model training completed successfully!")
-        log_step(f"πŸ“Š Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
-
        return True
 
    except Exception as e:
        log_step(f"❌ Training failed: {str(e)}")
        import traceback
-        log_step(f"πŸ” Error details: {traceback.format_exc()}")
        return False
 
 
@@ -368,25 +305,37 @@ def create_initial_logs():
         # Activity log
         activity_log = [{
             "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
-            "event": "System initialized successfully with trained model",
-            "level": "INFO"
         }]
 
-        with open("/tmp/activity_log.json", 'w') as f:
             json.dump(activity_log, f, indent=2)
 
         # Create empty monitoring logs
-        log_dirs = ["/tmp/logs"]
-        for log_dir in log_dirs:
-            Path(log_dir).mkdir(parents=True, exist_ok=True)
-
-        with open("/tmp/logs/monitoring_log.json", 'w') as f:
             json.dump([], f)
 
-        with open("/tmp/logs/scheduler_execution.json", 'w') as f:
-            json.dump([], f)
 
-        log_step("βœ… Initial log files created")
         return True
 
     except Exception as e:
@@ -394,100 +343,76 @@ def create_initial_logs():
         return False
 
 
-def validate_installation():
-    """Validate that the system is properly set up"""
-    log_step("πŸ” Validating system installation...")
-
-    validation_checks = []
-
-    # Check model files
-    model_exists, model_files = check_model_exists()
-    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))
-
-    # Check data files
-    data_exists, data_files = check_training_data_exists()
-    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))
-
-    # Check directories
-    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
-    dirs_exist = all(Path(d).exists() for d in required_dirs)
-    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))
-
-    # Check logs
-    log_exists = Path("/tmp/activity_log.json").exists()
-    validation_checks.append(("Log Files", log_exists, "Activity log created"))
-
     # Test model loading
-    model_loadable = False
     try:
         import joblib
-        pipeline = joblib.load("/tmp/pipeline.pkl")
-        test_prediction = pipeline.predict(["This is a test news article"])
-        model_loadable = True
-        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
     except Exception as e:
-        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))
-
-    # Print validation results
-    log_step("πŸ“‹ Validation Results:")
-    all_passed = True
-    for check_name, passed, details in validation_checks:
-        status = "βœ… PASS" if passed else "❌ FAIL"
-        log_step(f" {status} {check_name}: {details}")
-        if not passed:
-            all_passed = False
-
-    return all_passed, validation_checks
 
 
 def main():
-    """Main initialization function with smart training logic"""
-    log_step("πŸš€ Starting intelligent system initialization...")
-
-    # Check if model already exists
-    model_exists, existing_model_files = check_model_exists()
-
-    if model_exists:
-        log_step("🎯 EXISTING INSTALLATION DETECTED")
-        log_step("πŸ“„ Found existing model files - skipping training")
-
-        # Load existing metadata to show info
-        try:
-            with open("/tmp/metadata.json", 'r') as f:
-                metadata = json.load(f)
-
-            log_step(f"πŸ“Š Existing Model Info:")
-            log_step(f" - Version: {metadata.get('model_version', 'Unknown')}")
-            log_step(f" - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
-            log_step(f" - F1 Score: {metadata.get('test_f1', 'Unknown')}")
-            log_step(f" - Created: {metadata.get('timestamp', 'Unknown')}")
-
-        except Exception as e:
-            log_step(f"⚠️ Could not read existing metadata: {e}")
-
-    else:
-        log_step("πŸ†• FIRST-TIME INSTALLATION DETECTED")
-        log_step("πŸ”§ No existing model found - will train new model")
 
-    # Run initialization steps
     steps = [
         ("Directory Creation", create_directories),
-        ("Dataset Copy", copy_original_datasets),
-        ("Dataset Preparation", create_minimal_dataset),
-        ("Log Creation", create_initial_logs)
     ]
 
-    # Add training step only if model doesn't exist
-    if not model_exists:
-        steps.insert(-1, ("πŸ€– Model Training", run_initial_training))
-
     failed_steps = []
 
     for step_name, step_function in steps:
         try:
-            log_step(f"▢️ Starting: {step_name}")
             if step_function():
                 log_step(f"βœ… {step_name} completed")
             else:
                 log_step(f"❌ {step_name} failed")
                 failed_steps.append(step_name)
@@ -495,35 +420,33 @@ def main():
             log_step(f"❌ {step_name} failed: {str(e)}")
             failed_steps.append(step_name)
 
-    # Final validation
-    log_step("πŸ” Running final system validation...")
-    validation_passed, validation_results = validate_installation()
-
     # Summary
-    log_step("=" * 60)
     if failed_steps:
-        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
-        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
     else:
         log_step("πŸŽ‰ System initialization completed successfully!")
 
-    if validation_passed:
-        log_step("βœ… All validation checks passed!")
-        log_step("πŸš€ System is ready for use!")
-
-        if not model_exists:
-            log_step("πŸ€– NEW MODEL TRAINED AND READY")
-            log_step("πŸ“Š You can now start making predictions!")
-        else:
-            log_step("πŸ”„ EXISTING MODEL VALIDATED AND READY")
-            log_step("πŸ“Š System restored from previous installation!")
-
-    else:
-        log_step("❌ Some validation checks failed")
-        log_step("πŸ”§ Manual intervention may be required")
 
-    log_step("=" * 60)
 
 
 if __name__ == "__main__":
-    main()
 
 from pathlib import Path
 from datetime import datetime
 
+# Import the new path manager
+try:
+    from path_config import path_manager
+except ImportError:
+    # Add current directory to path
+    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+    from path_config import path_manager
+
 
 def log_step(message):
     """Log initialization steps"""
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
 
 
 def create_directories():
     """Create necessary directories"""
     log_step("Creating directory structure...")
 
+    # Directories are already created by path_manager initialization
     directories = [
+        path_manager.get_data_path(),
+        path_manager.get_model_path(),
+        path_manager.get_logs_path(),
+        path_manager.get_cache_path(),
+        path_manager.get_temp_path()
     ]
 
     for dir_path in directories:
+        if dir_path.exists():
+            log_step(f"βœ… Directory exists: {dir_path}")
+        else:
+            try:
+                dir_path.mkdir(parents=True, exist_ok=True)
+                log_step(f"βœ… Created directory: {dir_path}")
+            except Exception as e:
+                log_step(f"⚠️ Failed to create {dir_path}: {e}")
+                return False
+
+    # Create kaggle subdirectory
+    kaggle_dir = path_manager.get_data_path('kaggle')
+    kaggle_dir.mkdir(parents=True, exist_ok=True)
+    log_step(f"βœ… Created kaggle directory: {kaggle_dir}")
 
+    return True
 
 
+def check_existing_datasets():
+    """Check for existing datasets in the project structure"""
+    log_step("Checking for existing datasets...")
+
+    # Check for datasets in the current project structure
+    base_dir = path_manager.base_paths['base']
+
+    # Possible source locations
+    source_locations = [
+        base_dir / "data" / "kaggle" / "Fake.csv",
+        base_dir / "data" / "kaggle" / "True.csv",
+        base_dir / "data" / "combined_dataset.csv"
     ]
+
+    found_files = []
+    for source_file in source_locations:
+        if source_file.exists():
+            found_files.append(source_file)
+            log_step(f"βœ… Found existing dataset: {source_file}")
+
+    return found_files
+
 
+def copy_existing_datasets():
+    """Copy existing datasets if they're not in the target location"""
+    log_step("Copying existing datasets to target locations...")
+
+    base_dir = path_manager.base_paths['base']
+    target_data_dir = path_manager.get_data_path()
+
+    # Define source-target pairs
+    copy_operations = [
+        (base_dir / "data" / "kaggle" / "Fake.csv", target_data_dir / "kaggle" / "Fake.csv"),
+        (base_dir / "data" / "kaggle" / "True.csv", target_data_dir / "kaggle" / "True.csv"),
+        (base_dir / "data" / "combined_dataset.csv", target_data_dir / "combined_dataset.csv")
+    ]
+
     copied_count = 0
+    for source, target in copy_operations:
+        # Skip if source and target are the same (already in correct location)
+        if source == target:
+            if source.exists():
+                log_step(f"βœ… Dataset already in correct location: {target}")
+                copied_count += 1
+            continue
+
+        if source.exists():
+            try:
+                # Ensure target directory exists
+                target.parent.mkdir(parents=True, exist_ok=True)
+
+                # Copy file
+                shutil.copy2(source, target)
+                log_step(f"βœ… Copied {source} β†’ {target}")
+                copied_count += 1
+            except Exception as e:
+                log_step(f"⚠️ Failed to copy {source}: {e}")
         else:
             log_step(f"⚠️ Source file not found: {source}")
 
 
 def create_minimal_dataset():
+    """Create a minimal dataset if no existing dataset is found"""
     log_step("Creating minimal dataset...")
 
+    combined_path = path_manager.get_combined_dataset_path()
 
     if combined_path.exists():
+        log_step(f"βœ… Combined dataset already exists: {combined_path}")
         return True
 
+    try:
+        # Create minimal training data with diverse examples
+        minimal_data = pd.DataFrame({
+            'text': [
+                # Real news examples
+                'Scientists at MIT have developed a new renewable energy technology that could revolutionize solar power generation.',
+                'The Federal Reserve announced interest rate decisions following their latest economic review meeting.',
+                'Local authorities report significant improvements in air quality following new environmental regulations.',
+                'Research published in Nature journal reveals new insights about climate change adaptation strategies.',
+                'Economic indicators show steady growth in the manufacturing sector across multiple regions.',
+                'Healthcare officials recommend updated vaccination schedules based on latest medical research findings.',
+                'Transportation department announces infrastructure improvements for major highway systems nationwide.',
+                'Educational institutions implement new digital learning platforms to enhance student engagement.',
+                'Agricultural experts develop drought-resistant crop varieties to improve food security globally.',
+                'Technology companies invest heavily in cybersecurity measures to protect user data privacy.',
+
+                # Fake news examples
+                'SHOCKING: Government officials secretly planning to control population through mind control technology.',
+                'EXCLUSIVE: Celebrities caught in massive alien communication scandal that mainstream media won\'t report.',
+                'BREAKING: Scientists discover time travel but government hiding the truth from public knowledge.',
+                'EXPOSED: Pharmaceutical companies deliberately spreading diseases to increase their massive profits.',
+                'URGENT: Social media platforms using secret algorithms to brainwash users into political compliance.',
+                'LEAKED: Banking system about to collapse completely, insiders reveal financial catastrophe coming soon.',
+                'CONFIRMED: Weather modification technology being used to create artificial natural disasters worldwide.',
+                'REVEALED: Food companies adding dangerous chemicals that cause instant health problems and addiction.',
+                'CONSPIRACY: Educational system designed to suppress critical thinking and create obedient citizens.',
+                'TRUTH: Technology giants working with foreign powers to undermine national sovereignty completely.'
+            ],
+            'label': [
+                # Real news labels (0)
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                # Fake news labels (1)
+                1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+            ]
+        })
+
+        # Save the dataset
+        minimal_data.to_csv(combined_path, index=False)
+        log_step(f"βœ… Created minimal dataset with {len(minimal_data)} samples at {combined_path}")
+
+        # Verify the file was created correctly
+        if combined_path.exists():
+            df_check = pd.read_csv(combined_path)
+            log_step(f"βœ… Verified dataset: {len(df_check)} rows loaded successfully")
+            return True
+        else:
+            log_step("❌ Failed to verify created dataset")
+            return False
 
+    except Exception as e:
+        log_step(f"❌ Failed to create minimal dataset: {str(e)}")
+        return False
 
 
 def run_initial_training():
+    """Run basic model training"""
+    log_step("Starting initial model training...")
 
     try:
+        # Check if model already exists
+        model_path = path_manager.get_model_file_path()
+        vectorizer_path = path_manager.get_vectorizer_path()
+        pipeline_path = path_manager.get_pipeline_path()
+
+        if pipeline_path.exists() or (model_path.exists() and vectorizer_path.exists()):
+            log_step("βœ… Model files already exist")
+            return True
+
+        # Import required libraries
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
+        from sklearn.model_selection import train_test_split
+        from sklearn.metrics import accuracy_score, f1_score
         from sklearn.pipeline import Pipeline
         import joblib
 
         # Load dataset
+        dataset_path = path_manager.get_combined_dataset_path()
         if not dataset_path.exists():
             log_step("❌ No dataset available for training")
             return False
 
         df = pd.read_csv(dataset_path)
+        log_step(f"Loaded dataset with {len(df)} samples")
 
+        # Validate dataset
+        if len(df) < 10:
+            log_step("❌ Dataset too small for training")
+            return False
 
         # Prepare data
         X = df['text'].values
         y = df['label'].values
 
+        # Check class distribution
+        class_counts = pd.Series(y).value_counts()
+        log_step(f"Class distribution: {class_counts.to_dict()}")
+
         # Train-test split
         X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.2, random_state=42, stratify=y if len(class_counts) > 1 else None
         )
 
+        # Create pipeline with preprocessing
         pipeline = Pipeline([
+            ('vectorizer', TfidfVectorizer(
+                max_features=5000,
                 stop_words='english',
+                ngram_range=(1, 2),
+                min_df=1,
+                max_df=0.95
             )),
             ('model', LogisticRegression(
+                max_iter=1000,
+                random_state=42,
+                class_weight='balanced'
             ))
         ])
+
+        # Train model
+        log_step("Training model...")
         pipeline.fit(X_train, y_train)
+
         # Evaluate
         y_pred = pipeline.predict(X_test)
         accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')
 
+        # Save complete pipeline
+        joblib.dump(pipeline, pipeline_path)
+        log_step(f"βœ… Saved pipeline to {pipeline_path}")
 
+        # Save individual components for backward compatibility
+        joblib.dump(pipeline.named_steps['model'], model_path)
+        joblib.dump(pipeline.named_steps['vectorizer'], vectorizer_path)
+        log_step(f"βœ… Saved individual components")
 
+        # Save metadata
         metadata = {
+            "model_version": "v1.0_init",
+            "model_type": "logistic_regression_pipeline",
             "test_accuracy": float(accuracy),
             "test_f1": float(f1),
+            "train_size": len(X_train),
+            "test_size": len(X_test),
             "timestamp": datetime.now().isoformat(),
+            "training_method": "initialization",
+            "environment": path_manager.environment,
+            "data_path": str(dataset_path),
+            "class_distribution": class_counts.to_dict()
         }
 
+        metadata_path = path_manager.get_metadata_path()
+        with open(metadata_path, 'w') as f:
             json.dump(metadata, f, indent=2)
 
+        log_step(f"βœ… Training completed successfully")
+        log_step(f" Accuracy: {accuracy:.4f}")
+        log_step(f" F1 Score: {f1:.4f}")
+        log_step(f" Model saved to: {model_path}")
+        log_step(f" Vectorizer saved to: {vectorizer_path}")
+        log_step(f" Pipeline saved to: {pipeline_path}")
+
         return True
 
     except Exception as e:
         log_step(f"❌ Training failed: {str(e)}")
         import traceback
+        log_step(f"❌ Traceback: {traceback.format_exc()}")
         return False
 
 
         # Activity log
         activity_log = [{
             "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
+            "event": "System initialized successfully",
+            "level": "INFO",
+            "environment": path_manager.environment
         }]
 
+        activity_log_path = path_manager.get_activity_log_path()
+        with open(activity_log_path, 'w') as f:
             json.dump(activity_log, f, indent=2)
+        log_step(f"βœ… Created activity log: {activity_log_path}")
 
         # Create empty monitoring logs
+        monitoring_log_path = path_manager.get_logs_path("monitoring_log.json")
+        with open(monitoring_log_path, 'w') as f:
             json.dump([], f)
+        log_step(f"βœ… Created monitoring log: {monitoring_log_path}")
+
+        # Create other necessary log files
+        log_files = [
+            "drift_history.json",
+            "drift_alerts.json",
+            "scheduler_execution.json",
+            "scheduler_errors.json"
+        ]
 
+        for log_file in log_files:
+            log_path = path_manager.get_logs_path(log_file)
+            if not log_path.exists():
+                with open(log_path, 'w') as f:
+                    json.dump([], f)
+                log_step(f"βœ… Created {log_file}")
 
         return True
 
     except Exception as e:
         return False
 
 
+def verify_system():
+    """Verify that the system is properly initialized"""
+    log_step("Verifying system initialization...")
+
+    # Check critical files
+    critical_files = [
+        (path_manager.get_combined_dataset_path(), "Combined dataset"),
+        (path_manager.get_model_file_path(), "Model file"),
+        (path_manager.get_vectorizer_path(), "Vectorizer file"),
+        (path_manager.get_metadata_path(), "Metadata file"),
+        (path_manager.get_activity_log_path(), "Activity log")
+    ]
+
+    all_good = True
+    for file_path, description in critical_files:
+        if file_path.exists():
+            log_step(f"βœ… {description}: {file_path}")
+        else:
+            log_step(f"❌ Missing {description}: {file_path}")
+            all_good = False
+
     # Test model loading
     try:
         import joblib
+        pipeline_path = path_manager.get_pipeline_path()
+        if pipeline_path.exists():
+            pipeline = joblib.load(pipeline_path)
+            test_pred = pipeline.predict(["This is a test text"])
+            log_step(f"βœ… Model test prediction successful: {test_pred}")
+        else:
+            model_path = path_manager.get_model_file_path()
+            vectorizer_path = path_manager.get_vectorizer_path()
+            model = joblib.load(model_path)
+            vectorizer = joblib.load(vectorizer_path)
+            test_text_vec = vectorizer.transform(["This is a test text"])
+            test_pred = model.predict(test_text_vec)
+            log_step(f"βœ… Model component test prediction successful: {test_pred}")
     except Exception as e:
+        log_step(f"❌ Model test failed: {e}")
+        all_good = False
+
+    return all_good
 
 
 def main():
+    """Main initialization function"""
+    log_step("πŸš€ Starting system initialization...")
+    log_step(f"🌍 Environment: {path_manager.environment}")
+    log_step(f"πŸ“ Base directory: {path_manager.base_paths['base']}")
+    log_step(f"πŸ“Š Data directory: {path_manager.base_paths['data']}")
+    log_step(f"πŸ€– Model directory: {path_manager.base_paths['model']}")
 
     steps = [
         ("Directory Creation", create_directories),
+        ("Existing Dataset Copy", copy_existing_datasets),
+        ("Minimal Dataset Creation", create_minimal_dataset),
+        ("Model Training", run_initial_training),
+        ("Log File Creation", create_initial_logs),
+        ("System Verification", verify_system)
     ]
 
     failed_steps = []
+    completed_steps = []
 
     for step_name, step_function in steps:
         try:
+            log_step(f"πŸ”„ Starting: {step_name}")
             if step_function():
                 log_step(f"βœ… {step_name} completed")
+                completed_steps.append(step_name)
             else:
                 log_step(f"❌ {step_name} failed")
                 failed_steps.append(step_name)
             log_step(f"❌ {step_name} failed: {str(e)}")
             failed_steps.append(step_name)
 
     # Summary
+    log_step(f"\nπŸ“Š Initialization Summary:")
+    log_step(f" βœ… Completed: {len(completed_steps)}/{len(steps)} steps")
+    log_step(f" ❌ Failed: {len(failed_steps)}/{len(steps)} steps")
+
+    if completed_steps:
+        log_step(f" Completed steps: {', '.join(completed_steps)}")
+
     if failed_steps:
+        log_step(f" Failed steps: {', '.join(failed_steps)}")
+        log_step(f"⚠️ Initialization completed with {len(failed_steps)} failed steps")
     else:
         log_step("πŸŽ‰ System initialization completed successfully!")
 
+    # Environment info
+    log_step(f"\nπŸ” Environment Information:")
+    env_info = path_manager.get_environment_info()
+    log_step(f" Environment: {env_info['environment']}")
+    log_step(f" Available datasets: {sum(env_info['available_datasets'].values())}")
+    log_step(f" Available models: {sum(env_info['available_models'].values())}")
 
+    log_step("\n🎯 System ready for use!")
+
+    return len(failed_steps) == 0
 
 
 if __name__ == "__main__":
+    success = main()
+    if not success:
+        sys.exit(1)
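
Note: the updated script imports path_manager from a path_config module that is not part of this commit. The sketch below is a hypothetical reconstruction of the interface the script appears to assume; every method name, the directory layout, and the SPACE_ID-based environment check are inferred from the calls above rather than taken from the repository's actual path_config.py.

# path_config.py (hypothetical sketch, not the committed implementation)
import os
import tempfile
from pathlib import Path


class PathManager:
    """Resolve writable base directories once and hand out sub-paths on request."""

    def __init__(self):
        # Assumption: hosted Spaces expose SPACE_ID and only /tmp is writable;
        # locally, the project directory itself is used as the base.
        if os.environ.get("SPACE_ID"):
            self.environment = "huggingface_space"
            root = Path(tempfile.gettempdir())
        else:
            self.environment = "local"
            root = Path(__file__).resolve().parent

        self.base_paths = {
            "base": root,
            "data": root / "data",
            "model": root / "model",
            "logs": root / "logs",
            "cache": root / "cache",
            "temp": root / "tmp",
        }
        # Create the directories up front, matching the comment in create_directories()
        for path in self.base_paths.values():
            path.mkdir(parents=True, exist_ok=True)

    def get_data_path(self, *parts) -> Path:
        return self.base_paths["data"].joinpath(*parts)

    def get_model_path(self, *parts) -> Path:
        return self.base_paths["model"].joinpath(*parts)

    def get_logs_path(self, *parts) -> Path:
        return self.base_paths["logs"].joinpath(*parts)

    def get_cache_path(self, *parts) -> Path:
        return self.base_paths["cache"].joinpath(*parts)

    def get_temp_path(self, *parts) -> Path:
        return self.base_paths["temp"].joinpath(*parts)

    # File-level helpers used by initialize_system.py (file names are assumptions)
    def get_combined_dataset_path(self) -> Path:
        return self.get_data_path("combined_dataset.csv")

    def get_model_file_path(self) -> Path:
        return self.get_model_path("model.pkl")

    def get_vectorizer_path(self) -> Path:
        return self.get_model_path("vectorizer.pkl")

    def get_pipeline_path(self) -> Path:
        return self.get_model_path("pipeline.pkl")

    def get_metadata_path(self) -> Path:
        return self.get_model_path("metadata.json")

    def get_activity_log_path(self) -> Path:
        return self.get_logs_path("activity_log.json")

    def get_environment_info(self) -> dict:
        # Booleans sum cleanly, which is how main() reports the counts.
        datasets = {"combined_dataset.csv": self.get_combined_dataset_path().exists()}
        models = {
            "pipeline.pkl": self.get_pipeline_path().exists(),
            "model.pkl": self.get_model_file_path().exists(),
        }
        return {
            "environment": self.environment,
            "available_datasets": datasets,
            "available_models": models,
        }


# Module-level singleton, matching `from path_config import path_manager`
path_manager = PathManager()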