FusOn-pLM / fuson_plm /benchmarking /xgboost_predictor.py

dependencies and embedding_exploration benchmark

c43fbc6 about 2 months ago

2.88 kB

	from sklearn.model_selection import train_test_split, StratifiedKFold
	from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score, average_precision_score
	from fuson_plm.utils.logging import log_update
	import time
	import xgboost as xgb
	import numpy as np
	import pandas as pd

	def train_final_predictor(X_train, y_train, n_estimators=50,tree_method="hist"):
	clf = xgb.XGBClassifier(n_estimators=n_estimators, tree_method=tree_method)
	clf.fit(X_train, y_train)
	return clf

	def evaluate_predictor(clf,X_test,y_test,class1_thresh=None):
	# Predicting the labels on test set
	y_pred_test = clf.predict(X_test) # labels with automatic thresholds
	y_pred_prob_test = clf.predict_proba(X_test)[:, 1]
	if class1_thresh is not None: y_pred_customthresh_test = np.where(np.array(y_pred_prob_test) >= class1_thresh, 1, 0)

	# Calculating metrics - automatic
	accuracy = accuracy_score(y_test, y_pred_test)
	precision = precision_score(y_test, y_pred_test)
	recall = recall_score(y_test, y_pred_test)
	f1 = f1_score(y_test, y_pred_test)
	auroc_prob = roc_auc_score(y_test, y_pred_prob_test)
	auprc_prob = average_precision_score(y_test, y_pred_prob_test)
	auroc_label = roc_auc_score(y_test, y_pred_test)
	auprc_label = average_precision_score(y_test, y_pred_test)

	automatic_stats_df = pd.DataFrame(data={
	'Accuracy': [accuracy],
	'Precision': [precision],
	'Recall': [recall],
	'F1 Score': [f1],
	'AUROC': [auroc_prob],
	'AUROC Label': [auroc_label],
	'AUPRC': [auprc_prob],
	'AUPRC Label': [auprc_label]
	})

	# Calculating metrics - custom threshold (note that probability ones won't change)
	if class1_thresh is not None:
	accuracy_custom = accuracy_score(y_test, y_pred_customthresh_test)
	precision_custom = precision_score(y_test, y_pred_customthresh_test)
	recall_custom = recall_score(y_test, y_pred_customthresh_test)
	f1_custom = f1_score(y_test, y_pred_customthresh_test)
	auroc_prob_custom = roc_auc_score(y_test, y_pred_prob_test)
	auprc_prob_custom = average_precision_score(y_test, y_pred_prob_test)
	auroc_label_custom = roc_auc_score(y_test, y_pred_customthresh_test)
	auprc_label_custom = average_precision_score(y_test, y_pred_customthresh_test)

	custom_stats_df = pd.DataFrame(data={
	'Accuracy': [accuracy_custom],
	'Precision': [precision_custom],
	'Recall': [recall_custom],
	'F1 Score': [f1_custom],
	'AUROC': [auroc_prob_custom],
	'AUROC Label': [auroc_label_custom],
	'AUPRC': [auprc_prob_custom],
	'AUPRC Label': [auprc_label_custom]
	})
	else:
	custom_stats_df = None

	return automatic_stats_df, custom_stats_df