import os
import math
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from rdkit.ML.Scoring.Scoring import CalcBEDROC
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, average_precision_score, \
matthews_corrcoef, precision_score, recall_score, f1_score, confusion_matrix
def specificity_score(true_labels, predicted_labels):
    """Return the specificity (true negative rate) for binary labels."""
    # confusion_matrix(...).ravel() yields (tn, fp, fn, tp) for binary labels
    tn, fp, _, _ = confusion_matrix(true_labels, predicted_labels).ravel()
    return tn / (tn + fp)
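# Illustrative sketch, not called by the pipeline: sklearn's confusion_matrix
# ravels to (tn, fp, fn, tp) for binary labels, so the helper above returns
# tn / (tn + fp). The toy labels below are made up for this check.
def _demo_specificity_score():
    y_true = [0, 0, 0, 0, 1, 1]
    y_pred = [0, 0, 0, 1, 1, 0]   # tn=3, fp=1, fn=1, tp=1
    print(specificity_score(y_true, y_pred))   # 0.75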
MAIN_DIR = '' # todo add project dir
def balanced_mcc_score(sensitivity, specificity, prevalence):
    """Returns the Matthews' correlation coefficient at the given
    sensitivity, specificity and prevalence.

    Parameters
    ----------
    sensitivity : float
        The sensitivity of the model
    specificity : float
        The specificity of the model
    prevalence : float
        The prevalence of the test set

    Returns
    -------
    float
        Matthews' correlation coefficient as a float
    """
    numerator = sensitivity + specificity - 1
    denominator_first_term = sensitivity + (1 - specificity) * (1 - prevalence) / prevalence
    denominator_second_term = specificity + (1 - sensitivity) * prevalence / (1 - prevalence)
    denominator = math.sqrt(denominator_first_term * denominator_second_term)
    # Degenerate corners where both numerator and denominator vanish: return 0
    if (sensitivity == 1 and specificity == 0) or (sensitivity == 0 and specificity == 1):
        denominator = 1.0
    return numerator / denominator
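# Illustrative sketch, not called by the pipeline: when sensitivity, specificity
# and prevalence come from the same confusion matrix, the analytic formula above
# should agree with sklearn's matthews_corrcoef. The toy labels are made up.
def _demo_balanced_mcc_score():
    y_true = [1, 1, 0, 0, 0, 0]   # prevalence = 2/6
    y_pred = [1, 0, 1, 0, 0, 0]   # TP=1, FN=1, FP=1, TN=3 -> sens=0.5, spec=0.75
    print(matthews_corrcoef(y_true, y_pred))      # 0.25
    print(balanced_mcc_score(0.5, 0.75, 2 / 6))   # 0.25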
def ef_top_per(predictions, prevalence, top_frac=0.01):
    """Return the enrichment factor of the top `top_frac` ranked predictions."""
    # Number of compounds in the top fraction of the ranked list
    n = int(len(predictions) * top_frac)
    # Fraction of the top-ranked predictions that round to the positive class
    top_predictions = sorted(predictions, reverse=True)[:n]
    f = np.sum(np.round(top_predictions)) / n
    return f / prevalence
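# Illustrative sketch, not called by the pipeline: with 200 ranked scores the top
# 1% are the 2 highest; if both round to the positive class and the prevalence is
# 0.1, the enrichment factor is 1.0 / 0.1 = 10. The scores below are made up.
def _demo_ef_top_per():
    rng = np.random.default_rng(0)
    preds = np.concatenate([[0.99, 0.95], rng.uniform(0.0, 0.4, size=198)])
    print(ef_top_per(preds, 0.1, top_frac=0.01))   # 10.0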
def compute_metrics(df):
    """
    Compute a set of classification metrics for a single set of predictions.
    Args:
        df : dataframe with true labels in 'Label' column and probabilistic predictions in 'Prediction' column
    Returns:
        df_metrics: dataframe with metrics in 'Metric' column and values in 'Value' column
    """
    true_labels = df['Label']
    prevalence = sum(true_labels) / len(true_labels)
    predictions = df['Prediction']
    binary_predictions = predictions.round()
    # Threshold-dependent metrics (predictions rounded at 0.5)
    acc = accuracy_score(true_labels, binary_predictions)
    bacc = balanced_accuracy_score(true_labels, binary_predictions)
    precision = precision_score(true_labels, binary_predictions, zero_division=0.0)
    recall = recall_score(true_labels, binary_predictions)
    specificity = specificity_score(true_labels, binary_predictions)
    mcc = matthews_corrcoef(true_labels, binary_predictions)
    bmcc = balanced_mcc_score(recall, specificity, prevalence)
    f1 = f1_score(true_labels, binary_predictions)
    # Ranking-based metrics (raw probabilistic predictions)
    auc = roc_auc_score(true_labels, predictions)
    ap = average_precision_score(true_labels, predictions)
    dap = ap - prevalence  # average precision above the random baseline
    scores = df.sort_values(by='Prediction', ascending=False)[['Label', 'Prediction']].values
    bedroc = CalcBEDROC(scores, 0, 20)
    ef = ef_top_per(predictions, prevalence, 0.01)
    metrics_dict = {'ACC': acc, 'BACC': bacc, 'MCC': mcc, 'BMCC': bmcc, 'Precision': precision, 'Recall': recall,
                    'F1-score': f1, 'AUC': auc, 'dAP': dap, 'BEDROC': bedroc, 'EF-1%': ef}
    df_metrics = pd.DataFrame(metrics_dict.items(), columns=['Metric', 'Value'])
    return df_metrics
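# Illustrative sketch, not called by the pipeline: compute_metrics expects binary
# labels in a 'Label' column and probabilistic scores in a 'Prediction' column.
# The toy data below are randomly generated for illustration only.
def _demo_compute_metrics():
    rng = np.random.default_rng(0)
    labels = rng.integers(0, 2, size=200)
    # noisy scores that still separate the classes, so most metrics are close to 1
    preds = np.clip(0.6 * labels + rng.uniform(0.0, 0.5, size=200), 0.0, 1.0)
    print(compute_metrics(pd.DataFrame({'Label': labels, 'Prediction': preds})))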
def get_metrics(
        tasks: list[str] = ['AID', 'UID'],
        models: list[str] = ['MHNfs', 'RF'],
        settings: list[str] = ['1+1x3', '1+3x3', '1+7x3', '2+2x3', '2+6x3', '2+14x3', '4+4x3', '4+12x3', '4+28x3', '8+8x3', '8+24x3', '8+56x3'],
        overwrite: bool = False):
    """
    Compute classification metrics for every combination of task, model and setting
    and write them to the aggregated results file.
    """
    file = f'{MAIN_DIR}/results_used.csv.gz'
    if overwrite:
        df = pd.DataFrame()
    else:
        df = pd.read_csv(file)
    path_preprocessed = ""  # todo
    df_pubchem = pd.read_csv(path_preprocessed)
    for task in tasks:
        for model in models:
            for setting in settings:
                pred_dir = f'{MAIN_DIR}/predictions/{model}/{task}/{setting}'
                try:
                    targets = [x[:-4] for x in os.listdir(pred_dir)]
                    pubchem_targets = df_pubchem[task].astype(str).unique().tolist()
                    for target in tqdm(targets, desc=f'{task} - {model} - {setting}'):
                        if target not in pubchem_targets:
                            continue
                        # Skip already computed targets
                        if not overwrite and any((df['Model'] == model) & (df['Setting'] == setting) & (df['Task'] == task) & (df['TID'] == target)):
                            continue
                        # Load predictions
                        df_task = pd.read_csv(f'{pred_dir}/{target}.csv')
                        # Retrieve organism and L1 protein classification
                        try:
                            org = df_pubchem.loc[df_pubchem[task] == target, 'Organism'].values[0]
                            l1 = df_pubchem.loc[df_pubchem[task] == target, 'L1'].values[0]
                        except IndexError:
                            # Fall back to an integer lookup if the task column is not stored as strings
                            org = df_pubchem.loc[df_pubchem[task] == int(target), 'Organism'].values[0]
                            l1 = df_pubchem.loc[df_pubchem[task] == int(target), 'L1'].values[0]
                        if pd.isna(l1):
                            print(target, l1)
                        # Compute metrics for each fold
                        for fold in df_task.Fold.unique():
                            metrics = compute_metrics(df_task[df_task.Fold == fold]).assign(
                                Model=model, Task=task, TID=target, Organism=org, L1=l1, Setting=setting, Fold=fold,
                            )
                            df = pd.concat([df, metrics], ignore_index=True)
                except Exception as e:
                    print(e)
                    raise
    df.to_csv(file, index=False)
if __name__ == '__main__':
    # get_metrics()
    get_metrics(settings=['1+7x3', '2+6x3', '4+4x3', '2+14x3', '4+12x3', '8+8x3'], overwrite=True)