Ahmedik95316 committed on
Commit 2d38242 · verified · 1 Parent(s): 6041335

Update initialize_system.py


Modified to run the ensemble training at the start of system initialization
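
In outline, run_initial_training() now tries the enhanced ensemble trainer first and falls back to the original TF-IDF + LogisticRegression training only if the enhanced path fails. The sketch below is a minimal, self-contained illustration of that fallback pattern, not the repository's actual code: the two stub trainers stand in for EnhancedModelTrainer(...).train_model() and run_basic_training_fallback() from the diff.

    # Minimal sketch of the try-enhanced-then-fall-back pattern this commit adopts.
    # Both trainers are illustrative stubs so the sketch runs standalone.
    def enhanced_training() -> bool:
        # Stand-in for EnhancedModelTrainer(...).train_model()
        raise ImportError("lightgbm not installed")  # simulate a missing dependency

    def basic_training_fallback() -> bool:
        # Stand-in for the TF-IDF + LogisticRegression fallback
        print("Running basic training fallback...")
        return True

    def run_initial_training() -> bool:
        try:
            return enhanced_training()
        except Exception as e:
            print(f"Enhanced training unavailable ({e}); falling back...")
            return basic_training_fallback()

    print(run_initial_training())  # -> True via the fallback path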

Files changed (1)
  1. initialize_system.py +99 -108
initialize_system.py CHANGED
@@ -1,11 +1,19 @@
 import os
 import sys
 import json
+import joblib
 import shutil
 import pandas as pd
 from pathlib import Path
 from datetime import datetime
+from sklearn.pipeline import Pipeline
+from model.train import EnhancedModelTrainer
 from sklearn.model_selection import cross_validate
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score, f1_score
+from sklearn.feature_extraction.text import TfidfVectorizer
+
 
 # Import the new path manager
 try:
@@ -181,7 +189,7 @@ def create_minimal_dataset():
 
 
 def run_initial_training():
-    """Run basic model training"""
+    """Run enhanced ensemble model training with LightGBM"""
     log_step("Starting initial model training...")
 
     try:
@@ -196,41 +204,92 @@ def run_initial_training():
 
         # Check if model already exists
         if pipeline_path.exists() or (model_path.exists() and vectorizer_path.exists()):
-            log_step("✅ Model files already exist, checking if pipeline needs to be created...")
+            log_step("✅ Model files already exist, skipping training")
+            return True
+
+        # Import enhanced training components
+        import sys
+        sys.path.append('/app')
+        from model.train import EnhancedModelTrainer
+
+        log_step("Using Enhanced Model Trainer with ensemble voting...")
+
+        # Create enhanced trainer with full ensemble configuration
+        trainer = EnhancedModelTrainer(
+            use_enhanced_features=True,  # Enable sentiment, readability, entities, linguistic features
+            enable_ensemble=True  # Enable LightGBM + Random Forest + Logistic Regression ensemble
+        )
+
+        # Override paths to use the initialization system paths
+        trainer.data_path = path_manager.get_combined_dataset_path()
+        trainer.pipeline_path = pipeline_path
+        trainer.model_path = model_path
+        trainer.vectorizer_path = vectorizer_path
+        trainer.metadata_path = path_manager.get_metadata_path()
+
+        log_step("Starting enhanced ensemble training (this may take several minutes)...")
+
+        # Run the full enhanced training
+        success, message = trainer.train_model()
+
+        if success:
+            log_step(f"✅ Enhanced ensemble training completed: {message}")
 
-            # If individual components exist but pipeline doesn't, create pipeline
-            if model_path.exists() and vectorizer_path.exists() and not pipeline_path.exists():
-                log_step("Creating pipeline from existing components...")
+            # Verify pipeline was created
+            if pipeline_path.exists():
+                log_step(f"✅ Enhanced pipeline saved successfully to {pipeline_path}")
+
+                # Test loading the pipeline
                 try:
                     import joblib
-                    from sklearn.pipeline import Pipeline
-
-                    # Load existing components
-                    model = joblib.load(model_path)
-                    vectorizer = joblib.load(vectorizer_path)
-
-                    # Create pipeline
-                    pipeline = Pipeline([
-                        ('vectorizer', vectorizer),
-                        ('model', model)
-                    ])
-
-                    # Save pipeline
-                    joblib.dump(pipeline, pipeline_path)
-                    log_step(f"✅ Created pipeline from existing components: {pipeline_path}")
-
+                    test_pipeline = joblib.load(pipeline_path)
+                    test_pred = test_pipeline.predict(["This is a test article"])
+                    log_step(f"✅ Enhanced pipeline verification successful: {test_pred}")
                 except Exception as e:
-                    log_step(f"⚠️ Failed to create pipeline from existing components: {e}")
-
+                    log_step(f"⚠️ Enhanced pipeline verification failed: {e}")
+            else:
+                log_step(f"❌ Enhanced pipeline was not saved to {pipeline_path}")
+                return False
+
             return True
+        else:
+            log_step(f"❌ Enhanced ensemble training failed: {message}")
+            # Fall back to basic training if enhanced training fails
+            log_step("Falling back to basic training...")
+            return run_basic_training_fallback()
+
+    except ImportError as e:
+        log_step(f"⚠️ Enhanced training components not available: {e}")
+        log_step("Falling back to basic training...")
+        return run_basic_training_fallback()
+    except Exception as e:
+        log_step(f"❌ Enhanced training failed: {str(e)}")
+        import traceback
+        log_step(f"❌ Traceback: {traceback.format_exc()}")
+        log_step("Falling back to basic training...")
+        return run_basic_training_fallback()
+
 
-        # Import required libraries
+def run_basic_training_fallback():
+    """Fallback to basic training if enhanced training fails"""
+    log_step("Running basic training fallback...")
+
+    try:
+        # Import required libraries for basic training
+        import pandas as pd
+        from sklearn.model_selection import train_test_split, cross_validate
         from sklearn.feature_extraction.text import TfidfVectorizer
         from sklearn.linear_model import LogisticRegression
-        from sklearn.model_selection import train_test_split
-        from sklearn.metrics import accuracy_score, f1_score
         from sklearn.pipeline import Pipeline
+        from sklearn.metrics import accuracy_score, f1_score
         import joblib
+        import json
+        from datetime import datetime
+
+        # Get paths
+        model_path = path_manager.get_model_file_path()
+        vectorizer_path = path_manager.get_vectorizer_path()
+        pipeline_path = path_manager.get_pipeline_path()
 
         # Load dataset
         dataset_path = path_manager.get_combined_dataset_path()
@@ -259,7 +318,7 @@ def run_initial_training():
             X, y, test_size=0.2, random_state=42, stratify=y if len(class_counts) > 1 else None
         )
 
-        # Create pipeline with preprocessing
+        # Create basic pipeline
         pipeline = Pipeline([
             ('vectorizer', TfidfVectorizer(
                 max_features=5000,
@@ -276,9 +335,9 @@ def run_initial_training():
         ])
 
         # Train model with cross-validation
-        log_step("Training model with cross-validation...")
+        log_step("Training basic model with cross-validation...")
 
-        # Perform cross-validation before final training
+        # Perform cross-validation
        cv_results = cross_validate(
            pipeline, X_train, y_train,
            cv=3,
@@ -294,63 +353,11 @@ def run_initial_training():
         accuracy = accuracy_score(y_test, y_pred)
         f1 = f1_score(y_test, y_pred, average='weighted')
 
-        # Save CV results for API access
-        cv_data = {
-            "n_splits": 3,
-            "test_scores": {
-                "accuracy": {
-                    "mean": float(cv_results['test_accuracy'].mean()),
-                    "std": float(cv_results['test_accuracy'].std()),
-                    "scores": cv_results['test_accuracy'].tolist()
-                },
-                "f1": {
-                    "mean": float(cv_results['test_f1_weighted'].mean()),
-                    "std": float(cv_results['test_f1_weighted'].std()),
-                    "scores": cv_results['test_f1_weighted'].tolist()
-                }
-            },
-            "train_scores": {
-                "accuracy": {
-                    "mean": float(cv_results['train_accuracy'].mean()),
-                    "std": float(cv_results['train_accuracy'].std()),
-                    "scores": cv_results['train_accuracy'].tolist()
-                },
-                "f1": {
-                    "mean": float(cv_results['train_f1_weighted'].mean()),
-                    "std": float(cv_results['train_f1_weighted'].std()),
-                    "scores": cv_results['train_f1_weighted'].tolist()
-                }
-            }
-        }
-
-        # Save CV results to file
-        cv_results_path = path_manager.get_logs_path("cv_results.json")
-        with open(cv_results_path, 'w') as f:
-            json.dump(cv_data, f, indent=2)
-        log_step(f"Saved CV results to: {cv_results_path}")
-
-        # Ensure model directory exists
-        model_path.parent.mkdir(parents=True, exist_ok=True)
-
-        # Save complete pipeline FIRST (this is the priority)
-        log_step(f"Saving pipeline to: {pipeline_path}")
+        # Save pipeline
+        log_step(f"Saving basic pipeline to: {pipeline_path}")
         joblib.dump(pipeline, pipeline_path)
-
-        # Verify pipeline was saved
-        if pipeline_path.exists():
-            log_step(f"✅ Pipeline saved successfully to {pipeline_path}")
-
-            # Test loading the pipeline
-            try:
-                test_pipeline = joblib.load(pipeline_path)
-                test_pred = test_pipeline.predict(["This is a test"])
-                log_step(f"✅ Pipeline verification successful: {test_pred}")
-            except Exception as e:
-                log_step(f"⚠️ Pipeline verification failed: {e}")
-        else:
-            log_step(f"❌ Pipeline was not saved to {pipeline_path}")
-
-        # Save individual components for backward compatibility
+
+        # Save individual components for compatibility
         try:
             joblib.dump(pipeline.named_steps['model'], model_path)
             joblib.dump(pipeline.named_steps['vectorizer'], vectorizer_path)
@@ -358,45 +365,29 @@ def run_initial_training():
         except Exception as e:
             log_step(f"⚠️ Failed to save individual components: {e}")
 
-        # Save metadata
+        # Save basic metadata
         metadata = {
-            "model_version": "v1.0_init",
+            "model_version": "v1.0_basic_fallback",
             "model_type": "logistic_regression_pipeline",
             "test_accuracy": float(accuracy),
             "test_f1": float(f1),
-            "train_size": len(X_train),
-            "test_size": len(X_test),
             "timestamp": datetime.now().isoformat(),
-            "training_method": "initialization",
-            "environment": path_manager.environment,
-            "data_path": str(dataset_path),
-            "class_distribution": class_counts.to_dict(),
-            "pipeline_created": pipeline_path.exists(),
-            "individual_components_created": model_path.exists() and vectorizer_path.exists(),
-            # Add CV results to metadata
-            "cv_f1_mean": float(cv_results['test_f1_weighted'].mean()),
-            "cv_f1_std": float(cv_results['test_f1_weighted'].std()),
-            "cv_accuracy_mean": float(cv_results['test_accuracy'].mean()),
-            "cv_accuracy_std": float(cv_results['test_accuracy'].std())
+            "training_method": "basic_fallback",
+            "environment": path_manager.environment
         }
 
         metadata_path = path_manager.get_metadata_path()
         with open(metadata_path, 'w') as f:
             json.dump(metadata, f, indent=2)
 
-        log_step(f"✅ Training completed successfully")
+        log_step(f"✅ Basic training completed successfully")
         log_step(f"   Accuracy: {accuracy:.4f}")
         log_step(f"   F1 Score: {f1:.4f}")
-        log_step(f"   Pipeline saved: {pipeline_path.exists()}")
-        log_step(f"   Model saved to: {model_path}")
-        log_step(f"   Vectorizer saved to: {vectorizer_path}")
 
         return True
-
+
     except Exception as e:
-        log_step(f"❌ Training failed: {str(e)}")
-        import traceback
-        log_step(f"❌ Traceback: {traceback.format_exc()}")
+        log_step(f"❌ Basic training fallback also failed: {str(e)}")
         return False
 
 
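Note: once initialization has completed, the saved artifact can be smoke-tested the same way the script does internally. A minimal sketch, assuming the pipeline was written to model/pipeline.joblib (the real location is resolved via path_manager.get_pipeline_path() and may differ per environment):

    # Load the saved pipeline and run a single prediction, mirroring the
    # verification step in initialize_system.py. The path is an assumption.
    import joblib

    pipeline = joblib.load("model/pipeline.joblib")  # hypothetical path
    pred = pipeline.predict(["This is a test article"])
    print(pred)  # predicted class label(s) from the trained model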