AlvaroMros committed
Commit a90f2dd · 1 Parent(s): 371767b

Startup model check: 2025-09-01 09:31:10

logs/startup_update.log CHANGED
Binary files a/logs/startup_update.log and b/logs/startup_update.log differ
 
output/model_results.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bf8df1ba9e26fa98e34bfb1c773e66576cbf89152087c55b70921269c84f39d5
-size 27286
+oid sha256:aa6c9cdfa89c172663708c5987d5bd43c108003ba1310cd008090c614bb18ee1
+size 27298
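
Note: model_results.json is tracked with Git LFS, so the hunk above only swaps the pointer (new oid and size); the JSON itself lives in LFS storage and is fetched with "git lfs pull". As a minimal sketch that is not part of this commit, a pulled copy can be checked against the pointer's hash and size — the pointer text is copied from the diff, the path is the repo's own:

import hashlib

# Pointer content as shown in the diff above (LFS spec v1: version, oid, size).
POINTER = """\
version https://git-lfs.github.com/spec/v1
oid sha256:aa6c9cdfa89c172663708c5987d5bd43c108003ba1310cd008090c614bb18ee1
size 27298
"""

def parse_pointer(text: str) -> dict:
    """Split the pointer's 'key value' lines into oid and size fields."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {"oid": fields["oid"].removeprefix("sha256:"), "size": int(fields["size"])}

def verify(path: str, pointer: dict) -> bool:
    """True if the local file matches the pointer's sha256 digest and byte size."""
    with open(path, "rb") as f:
        data = f.read()
    return (hashlib.sha256(data).hexdigest() == pointer["oid"]
            and len(data) == pointer["size"])

if __name__ == "__main__":
    print(verify("output/model_results.json", parse_pointer(POINTER)))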
src/predict/pipeline.py CHANGED
@@ -190,6 +190,9 @@ class PredictionPipeline:
 
         should_retrain = self._should_retrain_models()
 
+        # Track best model across all evaluations
+        best_model_info = {'accuracy': 0, 'model_name': '', 'model': None}
+
         for i, model in enumerate(self.models):
             model_name = model.__class__.__name__
             print(f"\n--- Evaluating Model: {model_name} ---")
@@ -241,6 +244,21 @@
                 'total_fights': len(eval_fights),
                 'model_status': model_status
             }
+
+            # Track best model
+            if accuracy > best_model_info['accuracy']:
+                best_model_info['accuracy'] = accuracy
+                best_model_info['model_name'] = model_name
+                best_model_info['model'] = model
+
+        # Log best model to MLflow
+        if best_model_info['model'] is not None:
+            mlflow.set_experiment("UFC_Best_Models")
+            with mlflow.start_run(run_name="best_model_evaluation"):
+                mlflow.log_metric("best_accuracy", best_model_info['accuracy'])
+                mlflow.log_param("model_type", best_model_info['model_name'])
+                mlflow.sklearn.log_model(best_model_info['model'], "best_model")
+                print(f"Best model logged to MLflow: {best_model_info['model_name']} with {best_model_info['accuracy']:.2f}% accuracy")
 
         if detailed_report:
             self._report_detailed_results()
@@ -249,7 +267,7 @@
 
         # Only train and save models if retraining was performed
         if should_retrain:
-            self._train_and_save_models()
+            self._train_and_save_best_model(best_model_info)
 
     def run_kfold_cv(self, k: int = 3, holdout_events: int = 1):
         """Performs k-fold cross-validation where each fold is a set of events.
@@ -262,6 +280,9 @@
         # Initialize KFold splitter on events
         kf = KFold(n_splits=k, shuffle=True, random_state=42)
 
+        # Track best model across all folds
+        best_model_info = {'accuracy': 0, 'model_name': '', 'model': None}
+
         all_fold_metrics = []
         for fold_idx, (train_event_idx, test_event_idx) in enumerate(kf.split(event_list), start=1):
             train_events = [event_list[i] for i in train_event_idx]
@@ -298,19 +319,32 @@
 
                 acc = correct / len(test_set) if test_set else 0.0
                 fold_results[model_name] = acc
-
-                # Log metrics and model artifact
                 mlflow.log_metric(f"accuracy_{model_name}", acc)
-                mlflow.sklearn.log_model(model, f"model_{model_name}")
+
+                # Update best model tracking
+                if acc > best_model_info['accuracy']:
+                    best_model_info['accuracy'] = acc
+                    best_model_info['model_name'] = model_name
+                    best_model_info['model'] = model
 
             all_fold_metrics.append(fold_results)
 
-        return all_fold_metrics
+        # Log the overall best model across all folds
+        if best_model_info['model'] is not None:
+            mlflow.set_experiment("UFC_Best_Models")
+            with mlflow.start_run(run_name="kfold_best_model"):
+                mlflow.log_metric("best_accuracy", best_model_info['accuracy'])
+                mlflow.log_param("model_type", best_model_info['model_name'])
+                mlflow.log_param("k_folds", k)
+                mlflow.sklearn.log_model(best_model_info['model'], "best_model")
+                print(f"Overall best model from k-fold CV: {best_model_info['model_name']} with {best_model_info['accuracy']:.2%} accuracy")
+
+        return all_fold_metrics, best_model_info
 
     def update_models_if_new_data(self):
         """
-        Checks for new data and retrains/saves all models on the full dataset if needed.
-        This does not run any evaluation.
+        Checks for new data and retrains/saves the best model on the full dataset if needed.
+        This runs a quick evaluation to determine the best model.
         """
         print("\n--- Checking for Model Updates ---")
 
@@ -318,28 +352,50 @@
         missing_models = [m for m in self.models if not self._model_exists(m)]
         has_new_data = self._has_new_data_since_last_training()
 
-        if missing_models:
-            missing_names = [m.__class__.__name__ for m in missing_models]
-            print(f"Missing or invalid model files found for: {missing_names}.")
-            self._train_and_save_models()
-        elif has_new_data:
-            print("New data detected, retraining all models...")
-            self._train_and_save_models()
+        if missing_models or has_new_data:
+            print("Running quick evaluation to find best model...")
+
+            # Quick evaluation to find best model
+            self._load_and_split_data()
+            eval_fights = [f for f in self.test_fights if f['winner'] not in ["Draw", "NC", ""]]
+
+            best_model_info = {'accuracy': 0, 'model_name': '', 'model': None}
+
+            for model in self.models:
+                model_name = model.__class__.__name__
+                print(f"Evaluating {model_name}...")
+
+                model.train(self.train_fights)
+                correct = 0
+                for fight in eval_fights:
+                    prediction = model.predict(fight)
+                    if prediction.get('winner') == fight['winner']:
+                        correct += 1
+
+                accuracy = (correct / len(eval_fights)) * 100 if eval_fights else 0
+
+                if accuracy > best_model_info['accuracy']:
+                    best_model_info['accuracy'] = accuracy
+                    best_model_info['model_name'] = model_name
+                    best_model_info['model'] = model
+
+            print(f"Best model: {best_model_info['model_name']} with {best_model_info['accuracy']:.2f}% accuracy")
+            self._train_and_save_best_model(best_model_info)
         else:
             print("No new data detected. Models are already up-to-date.")
 
-    def _train_and_save_models(self):
-        """Trains all models on the full dataset and saves them."""
-        print("\n\n--- Training and Saving All Models on Full Dataset ---")
+    def _train_and_save_best_model(self, best_model_info):
+        """Trains only the best performing model on the full dataset and saves it."""
+        print("\n\n--- Training and Saving Best Model on Full Dataset ---")
 
         if not os.path.exists(FIGHTS_CSV_PATH):
-            print(f"Error: Fights data not found at '{FIGHTS_CSV_PATH}'. Cannot save models.")
+            print(f"Error: Fights data not found at '{FIGHTS_CSV_PATH}'. Cannot save model.")
             return
 
         with open(FIGHTS_CSV_PATH, 'r', encoding='utf-8') as f:
             all_fights = list(csv.DictReader(f))
 
-        print(f"Training models on all {len(all_fights)} available fights...")
+        print(f"Training best model on all {len(all_fights)} available fights...")
 
         if not os.path.exists(MODELS_DIR):
             os.makedirs(MODELS_DIR)
@@ -352,21 +408,25 @@
         latest_event_name = latest_fight['event_name']
         latest_event_date = latest_fight['event_date']
 
-        for model in self.models:
-            model_name = model.__class__.__name__
-            print(f"\n--- Training: {model_name} ---")
+        if best_model_info['model'] is not None:
+            model = best_model_info['model']
+            model_name = best_model_info['model_name']
+
+            print(f"\n--- Training Best Model: {model_name} ---")
             model.train(all_fights)
 
-            # Sanitize and save the model
-            file_name = f"{model_name}.joblib"
+            # Sanitize and save the best model
+            file_name = f"best_{model_name}_{best_model_info['accuracy']:.2f}%.joblib"
             save_path = os.path.join(MODELS_DIR, file_name)
             joblib.dump(model, save_path)
-            print(f"Model saved successfully to {save_path}")
+            print(f"Best model saved successfully to {save_path} with {best_model_info['accuracy']:.2f}% accuracy")
 
-        # Save the last trained event info
-        if all_fights:
-            self._save_last_trained_event(latest_event_name, latest_event_date)
-            print(f"Updated last trained event: {latest_event_name} ({latest_event_date})")
+            # Save the last trained event info
+            if all_fights:
+                self._save_last_trained_event(latest_event_name, latest_event_date)
+                print(f"Updated last trained event: {latest_event_name} ({latest_event_date})")
+        else:
+            print("No best model found to train and save.")
 
     def _report_summary(self):
         """Prints a concise summary of model performance."""
wa.py ADDED
File without changes