Commit a90f2dd
Parent(s): 371767b
Startup model check: 2025-09-01 09:31:10
Changed files:
- logs/startup_update.log +0 -0
- output/model_results.json +2 -2
- src/predict/pipeline.py +89 -29
- wa.py +0 -0
logs/startup_update.log
CHANGED
Binary files a/logs/startup_update.log and b/logs/startup_update.log differ
output/model_results.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:aa6c9cdfa89c172663708c5987d5bd43c108003ba1310cd008090c614bb18ee1
+size 27298
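output/model_results.json is tracked with Git LFS, so the hunk above diffs two small pointer files rather than the JSON results themselves: only the content hash and byte size change when the results are regenerated. For reference, a Git LFS v1 pointer file has exactly this three-line shape:

version https://git-lfs.github.com/spec/v1
oid sha256:<64-hex-character SHA-256 digest of the actual file contents>
size <file size in bytes>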
src/predict/pipeline.py
CHANGED
@@ -190,6 +190,9 @@ class PredictionPipeline:
 
         should_retrain = self._should_retrain_models()
 
+        # Track best model across all evaluations
+        best_model_info = {'accuracy': 0, 'model_name': '', 'model': None}
+
         for i, model in enumerate(self.models):
             model_name = model.__class__.__name__
             print(f"\n--- Evaluating Model: {model_name} ---")
@@ -241,6 +244,21 @@
                 'total_fights': len(eval_fights),
                 'model_status': model_status
             }
+
+            # Track best model
+            if accuracy > best_model_info['accuracy']:
+                best_model_info['accuracy'] = accuracy
+                best_model_info['model_name'] = model_name
+                best_model_info['model'] = model
+
+        # Log best model to MLflow
+        if best_model_info['model'] is not None:
+            mlflow.set_experiment("UFC_Best_Models")
+            with mlflow.start_run(run_name="best_model_evaluation"):
+                mlflow.log_metric("best_accuracy", best_model_info['accuracy'])
+                mlflow.log_param("model_type", best_model_info['model_name'])
+                mlflow.sklearn.log_model(best_model_info['model'], "best_model")
+                print(f"Best model logged to MLflow: {best_model_info['model_name']} with {best_model_info['accuracy']:.2f}% accuracy")
 
         if detailed_report:
             self._report_detailed_results()
@@ -249,7 +267,7 @@
 
         # Only train and save models if retraining was performed
         if should_retrain:
-            self.
+            self._train_and_save_best_model(best_model_info)
 
     def run_kfold_cv(self, k: int = 3, holdout_events: int = 1):
         """Performs k-fold cross-validation where each fold is a set of events.
@@ -262,6 +280,9 @@
         # Initialize KFold splitter on events
         kf = KFold(n_splits=k, shuffle=True, random_state=42)
 
+        # Track best model across all folds
+        best_model_info = {'accuracy': 0, 'model_name': '', 'model': None}
+
         all_fold_metrics = []
         for fold_idx, (train_event_idx, test_event_idx) in enumerate(kf.split(event_list), start=1):
             train_events = [event_list[i] for i in train_event_idx]
@@ -298,19 +319,32 @@
 
                 acc = correct / len(test_set) if test_set else 0.0
                 fold_results[model_name] = acc
-
-                # Log metrics and model artifact
                 mlflow.log_metric(f"accuracy_{model_name}", acc)
-
+
+                # Update best model tracking
+                if acc > best_model_info['accuracy']:
+                    best_model_info['accuracy'] = acc
+                    best_model_info['model_name'] = model_name
+                    best_model_info['model'] = model
 
             all_fold_metrics.append(fold_results)
 
-
+        # Log the overall best model across all folds
+        if best_model_info['model'] is not None:
+            mlflow.set_experiment("UFC_Best_Models")
+            with mlflow.start_run(run_name="kfold_best_model"):
+                mlflow.log_metric("best_accuracy", best_model_info['accuracy'])
+                mlflow.log_param("model_type", best_model_info['model_name'])
+                mlflow.log_param("k_folds", k)
+                mlflow.sklearn.log_model(best_model_info['model'], "best_model")
+                print(f"Overall best model from k-fold CV: {best_model_info['model_name']} with {best_model_info['accuracy']:.2%} accuracy")
+
+        return all_fold_metrics, best_model_info
 
     def update_models_if_new_data(self):
         """
-        Checks for new data and retrains/saves
-        This
+        Checks for new data and retrains/saves the best model on the full dataset if needed.
+        This runs a quick evaluation to determine the best model.
         """
         print("\n--- Checking for Model Updates ---")
 
@@ -318,28 +352,50 @@
         missing_models = [m for m in self.models if not self._model_exists(m)]
         has_new_data = self._has_new_data_since_last_training()
 
-        if missing_models:
-
-
-
-
-
-
+        if missing_models or has_new_data:
+            print("Running quick evaluation to find best model...")
+
+            # Quick evaluation to find best model
+            self._load_and_split_data()
+            eval_fights = [f for f in self.test_fights if f['winner'] not in ["Draw", "NC", ""]]
+
+            best_model_info = {'accuracy': 0, 'model_name': '', 'model': None}
+
+            for model in self.models:
+                model_name = model.__class__.__name__
+                print(f"Evaluating {model_name}...")
+
+                model.train(self.train_fights)
+                correct = 0
+                for fight in eval_fights:
+                    prediction = model.predict(fight)
+                    if prediction.get('winner') == fight['winner']:
+                        correct += 1
+
+                accuracy = (correct / len(eval_fights)) * 100 if eval_fights else 0
+
+                if accuracy > best_model_info['accuracy']:
+                    best_model_info['accuracy'] = accuracy
+                    best_model_info['model_name'] = model_name
+                    best_model_info['model'] = model
+
+            print(f"Best model: {best_model_info['model_name']} with {best_model_info['accuracy']:.2f}% accuracy")
+            self._train_and_save_best_model(best_model_info)
         else:
             print("No new data detected. Models are already up-to-date.")
 
-    def
-        """Trains
-        print("\n\n--- Training and Saving
+    def _train_and_save_best_model(self, best_model_info):
+        """Trains only the best performing model on the full dataset and saves it."""
+        print("\n\n--- Training and Saving Best Model on Full Dataset ---")
 
         if not os.path.exists(FIGHTS_CSV_PATH):
-            print(f"Error: Fights data not found at '{FIGHTS_CSV_PATH}'. Cannot save
+            print(f"Error: Fights data not found at '{FIGHTS_CSV_PATH}'. Cannot save model.")
             return
 
         with open(FIGHTS_CSV_PATH, 'r', encoding='utf-8') as f:
             all_fights = list(csv.DictReader(f))
 
-        print(f"Training
+        print(f"Training best model on all {len(all_fights)} available fights...")
 
         if not os.path.exists(MODELS_DIR):
            os.makedirs(MODELS_DIR)
@@ -352,21 +408,25 @@
         latest_event_name = latest_fight['event_name']
         latest_event_date = latest_fight['event_date']
 
-
-
-
+        if best_model_info['model'] is not None:
+            model = best_model_info['model']
+            model_name = best_model_info['model_name']
+
+            print(f"\n--- Training Best Model: {model_name} ---")
             model.train(all_fights)
 
-            # Sanitize and save the model
-            file_name = f"{model_name}
+            # Sanitize and save the best model
+            file_name = f"best_{model_name}_{best_model_info['accuracy']:.2f}%.joblib"
             save_path = os.path.join(MODELS_DIR, file_name)
             joblib.dump(model, save_path)
-            print(f"
+            print(f"Best model saved successfully to {save_path} with {best_model_info['accuracy']:.2f}% accuracy")
 
-
-
-
-
+            # Save the last trained event info
+            if all_fights:
+                self._save_last_trained_event(latest_event_name, latest_event_date)
+                print(f"Updated last trained event: {latest_event_name} ({latest_event_date})")
+        else:
+            print("No best model found to train and save.")
 
     def _report_summary(self):
         """Prints a concise summary of model performance."""
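Taken together, the pipeline changes implement a select-then-promote pattern: evaluate each candidate model on held-out fights, keep the most accurate one, retrain it on the full dataset, and persist only that one. Below is a minimal, self-contained sketch of the pattern; MajorityBaseline, accuracy_pct, and select_best are illustrative stand-ins, not names from this repository, and the toy fight dicts carry only a 'winner' key.

import joblib

# Toy stand-in with the train()/predict() interface the pipeline's models
# appear to share; the real model classes are not shown in this diff.
class MajorityBaseline:
    """Always predicts the most common winner seen during training."""
    def train(self, fights):
        winners = [f["winner"] for f in fights]
        self.most_common = max(set(winners), key=winners.count)

    def predict(self, fight):
        return {"winner": self.most_common}

def accuracy_pct(model, fights):
    """Percentage of fights whose recorded winner the model predicts."""
    if not fights:
        return 0.0
    hits = sum(1 for f in fights if model.predict(f).get("winner") == f["winner"])
    return hits / len(fights) * 100

def select_best(models, train_fights, eval_fights):
    """Train and score each candidate; keep the one with the highest accuracy."""
    best = {"accuracy": 0.0, "model_name": "", "model": None}
    for model in models:
        model.train(train_fights)
        acc = accuracy_pct(model, eval_fights)
        if acc > best["accuracy"]:
            best = {"accuracy": acc,
                    "model_name": model.__class__.__name__,
                    "model": model}
    return best

if __name__ == "__main__":
    train = [{"winner": "Red"}, {"winner": "Red"}, {"winner": "Blue"}]
    test = [{"winner": "Red"}, {"winner": "Blue"}]
    best = select_best([MajorityBaseline()], train, test)
    # Mirroring the diff: retrain the winner on everything, then persist it.
    path = f"best_{best['model_name']}_{best['accuracy']:.2f}%.joblib"
    joblib.dump(best["model"], path)
    print(f"Saved {best['model_name']} ({best['accuracy']:.2f}%) to {path}")

One design note on the MLflow calls in the diff: mlflow.sklearn.log_model expects a scikit-learn-compatible estimator, so if the pipeline's model classes are custom wrappers rather than sklearn estimators, mlflow.pyfunc.log_model may be the safer fit.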
wa.py
ADDED
Empty file (added with no content)
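Since both evaluation paths log the winning model under the UFC_Best_Models experiment, a downstream process could fetch the top-scoring run back out of MLflow. A sketch using the standard MLflow client API (the experiment name and the "best_model" artifact path come from the diff; the rest is generic MLflow usage and assumes a configured tracking URI):

import mlflow.sklearn
from mlflow.tracking import MlflowClient

client = MlflowClient()
experiment = client.get_experiment_by_name("UFC_Best_Models")
if experiment is not None:
    # Highest best_accuracy first; take only the top run.
    runs = client.search_runs(
        [experiment.experiment_id],
        order_by=["metrics.best_accuracy DESC"],
        max_results=1,
    )
    if runs:
        run_id = runs[0].info.run_id
        model = mlflow.sklearn.load_model(f"runs:/{run_id}/best_model")
        print(f"Loaded best model from run {run_id}")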