# first few imports, just to set CUDA_VISIBLE_DEVICES before importing any torch libraries
import fuson_plm.benchmarking.caid.config as config
import os
os.environ['CUDA_VISIBLE_DEVICES'] = config.CUDA_VISIBLE_DEVICES
# remaining imports
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
import pandas as pd
import numpy as np
import sys
from datetime import datetime
import logging
from fuson_plm.benchmarking.embed import embed_dataset_for_benchmark
from fuson_plm.benchmarking.caid.model import DisorderPredictor
from fuson_plm.benchmarking.caid.utils import DisorderDataset, get_dataloader, check_dataloaders
from fuson_plm.benchmarking.caid.plot import make_auroc_curve, make_benchmark_auroc_curve
from fuson_plm.utils.logging import get_local_time, open_logfile, log_update, print_configpy
# configure Transformers logger to only show messages that are ERROR or more severe
logging.getLogger("transformers").setLevel(logging.ERROR)
def check_env_variables():
log_update("\nChecking on environment variables...")
log_update(f"\tCUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}")
log_update(f"\ttorch.cuda.device_count(): {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
log_update(f"\t\tDevice {i}: {torch.cuda.get_device_name(i)}")
def check_splits(df):
# make sure everything has a split
if len(df.loc[df['split'].isna()])>0:
raise Exception("Error: not every benchmarking sequence has been allocated to a split (train or test)")
# make sure the only things are train and test
if len({'train','test'} - set(df['split'].unique()))!=0:
raise Exception("Error: splits column should only have \'train\' and \'test\'.")
# make sure there are no duplicate sequences
if len(df.loc[df['Sequence'].duplicated()])>0:
raise Exception("Error: duplicate sequences provided")
# Training function
def train(model, train_loader, optimizer, n_epochs, criterion, device):
"""
Trains the model for a single epoch.
Args:
model (nn.Module): model that will be trained
dataloader (DataLoader): PyTorch DataLoader with training data
optimizer (torch.optim): optimizer
criterion (nn.Module): loss function
device (torch.device): device (GPU or CPU to train the model
Returns:
total_loss (float): model loss
"""
# Training loop
model.train()
# Avg loss across epochs
avg_train_losses = []
# Loop through epochs
for epoch in range(1, 1+n_epochs):
log_update(f"EPOCH {epoch}/{n_epochs}")
# Initialize loss for the epoch to 0
total_train_loss = 0
# Make update settings
total_steps = len(train_loader)
update_interval = total_steps // min(20,total_steps) # update semi-frequently
prog_bar = tqdm(total=total_steps, leave=True, file=sys.stdout)
# Iterate through batches
#with tqdm(enumerate(train_loader,start=1), total=len(train_loader), desc='Training Batch', leave=True, position=0) as pbar:
#for batch_idx, (embeddings, labels) in pbar:
for batch_idx, (_, embeddings, labels) in enumerate(train_loader, start=1):
# Move tensors to device
embeddings, labels = embeddings.to(device), labels.to(device)
# Forward pass
optimizer.zero_grad()
outputs = model(embeddings)
loss = criterion(outputs, labels)
loss.backward()
# Parameter updates
optimizer.step()
# Update loss
total_train_loss += loss.item()
if batch_idx % update_interval == 0 or batch_idx == total_steps:
prog_bar.update(update_interval)
sys.stdout.flush()
prog_bar.close()
# Calculate avg loss for the epoch
avg_train_loss = total_train_loss / total_steps
avg_train_losses.append(avg_train_loss)
return avg_train_losses
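# Illustrative (hypothetical) call, assuming `train_loader` and `device` already exist:
#   model = get_fresh_model({"num_layers": 2, "num_heads": 8, "dropout": 0.2}, device)
#   losses = train(model, train_loader, optim.Adam(model.parameters(), lr=5e-5), n_epochs=2, criterion=nn.BCELoss(), device=device)
#   # losses holds one average loss per epoch, so len(losses) == 2 here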
# Evaluation function
def evaluate(model, test_loader, device):
"""
Performs inference on a trained model
Args:
model (nn.Module): the trained model
test_loader (DataLoader): PyTorch DataLoader with testing data
device (torch.device): device (GPU or CPU) to be used for inference
Returns:
preds (list): predicted per-residue disorder labels
true_labels (list): ground truth per-residue disorder labels
"""
model.eval()
test_sequences, test_preds, true_labels = [], [], []
# Make update settings
total_steps = len(test_loader)
update_interval = total_steps // min(20,total_steps) # update semi-frequently
prog_bar = tqdm(total=total_steps, leave=True, file=sys.stdout)
with torch.no_grad():
for batch_idx, (sequences, embeddings, labels) in enumerate(test_loader,start=1):
embeddings, labels = embeddings.to(device), labels.to(device)
# forward pass
outputs = model(embeddings)
assert len(sequences)==1 # the batch size should be 1; make sure
test_sequences.append(sequences[0])
test_preds.append(outputs.cpu().numpy())
true_labels.append(labels.cpu().numpy())
if batch_idx % update_interval == 0 or batch_idx == total_steps:
prog_bar.update(update_interval)
sys.stdout.flush()
prog_bar.close()
return test_sequences, test_preds, true_labels
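# Note: because the dataloaders use batch_size=1, each element of test_preds is a numpy array of
# per-residue disorder probabilities for a single sequence, aligned element-wise with true_labels.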
# Benchmarking function
def benchmark(model, bench_loader, device):
"""
Performs inference on a trained model using the fusion benchmark set
Args:
model (nn.Module): the trained model
bench_loader (DataLoader): PyTorch DataLoader with benchmarking data
device (torch.device): device (GPU or CPU) to be used for inference
Returns:
bench_sequences (list): benchmark sequences in the order they were evaluated
bench_preds (list): predicted per-residue disorder probabilities
true_labels (list): ground truth per-residue disorder labels
"""
model.eval()
bench_sequences, bench_preds, true_labels = [], [], []
# Make update settings
total_steps = len(bench_loader)
update_interval = total_steps // min(20,total_steps) # update semi-frequently
prog_bar = tqdm(total=total_steps, leave=True, file=sys.stdout)
with torch.no_grad():
for batch_idx, (sequences, embeddings, labels) in enumerate(bench_loader,start=1):
embeddings, labels = embeddings.to(device), labels.to(device)
# forward pass
outputs = model(embeddings)
assert len(sequences)==1 # the batch size should be 1; make sure
bench_sequences.append(sequences[0])
bench_preds.append(outputs.cpu().numpy())
true_labels.append(labels.cpu().numpy())
if batch_idx % update_interval == 0 or batch_idx == total_steps:
prog_bar.update(update_interval)
sys.stdout.flush()
prog_bar.close()
return bench_sequences, bench_preds, true_labels
def grid_search_caid_predictor(embedding_path, details, output_dir, param_grid, overwrite_saved_model=True):
# prepare the grid search
grid = ParameterGrid(param_grid)
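# sklearn's ParameterGrid expands a dict of value lists into every combination. Illustrative example
# (made-up values):
#   list(ParameterGrid({'learning_rate': [5e-5], 'dropout': [0.2, 0.5]}))
#   -> [{'dropout': 0.2, 'learning_rate': 5e-5}, {'dropout': 0.5, 'learning_rate': 5e-5}]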
# initialize dict
training_hyperparams = {
"learning_rate": None,
"num_epochs": None,
"num_layers": None,
"num_heads": None,
"dropout": None
}
for params in grid:
# Update hyperparameters
training_hyperparams.update(params)
log_update(f"\nHyperparams:{training_hyperparams}")
train_and_evaluate_caid_predictor(embedding_path, details, output_dir, training_hyperparams, overwrite_saved_model=overwrite_saved_model)
def find_best_hyperparams(output_dir, param_grid):
# Isolate the columns that define the hyperparameters
param_cols = [f"caid_model_{k}" for k in param_grid.keys()]
# Read in the files with all the stats
test_metrics = pd.read_csv(f'{output_dir}/caid_hyperparam_screen_test_metrics.csv')
train_losses = pd.read_csv(f'{output_dir}/caid_hyperparam_screen_train_losses.csv')
bench_metrics = pd.read_csv(f'{output_dir}/caid_hyperparam_screen_fusion_benchmark_metrics.csv')
# Replace nan with empty string for epoch
test_metrics['Model Epoch'] = test_metrics['Model Epoch'].fillna('')
train_losses['Model Epoch'] = train_losses['Model Epoch'].fillna('')
bench_metrics['Model Epoch'] = bench_metrics['Model Epoch'].fillna('')
# Find the hyperparams that produced the best test metrics for each model; then save all relevant numbers in one file
benchmarked_model_key = ['Model Type','Model Name','Model Epoch'] # uniquely defines the model being benchmarked
ordered_priority_stats = ['AUROC','F1 Score','Accuracy','Precision','Recall']
sort_order = benchmarked_model_key + ordered_priority_stats
sort_bools = [True]*len(benchmarked_model_key) + [False]*len(ordered_priority_stats)
test_metrics = test_metrics.sort_values(
sort_order,
ascending=sort_bools
).groupby(benchmarked_model_key).head(1).reset_index(drop=True)
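# After the descending sort, head(1) within each (Model Type, Model Name, Model Epoch) group keeps
# the hyperparameter setting with the highest AUROC (ties broken by F1, accuracy, precision, recall).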
# Find the last-epoch losses for each model and hyperparameters
group_order = benchmarked_model_key+param_cols
sort_order = group_order+["caid_model_epoch"]
sort_bools = [True]*(len(group_order))+[False]*1
train_losses = train_losses.sort_values(
by=sort_order,
ascending=sort_bools,
).groupby(group_order).head(1).reset_index(drop=True)
# Combine test and train results
merge_cols = benchmarked_model_key+param_cols+['path_to_model']
combined_results = pd.merge(
test_metrics,train_losses,
on=merge_cols,
how='left'
)
# Combine with benchmark results
bench_metrics = bench_metrics.rename(columns = {'AUROC': 'Fusion AUROC',
'F1 Score': 'Fusion F1 Score',
'Accuracy': 'Fusion Accuracy',
'Precision': 'Fusion Precision',
'Recall': 'Fusion Recall'})
combined_results = pd.merge(
combined_results,bench_metrics,
on=merge_cols,
how='left'
)
# reorder columns
combined_results = combined_results[[
'Model Type','Model Name','Model Epoch',
'Accuracy','Precision','Recall','F1 Score','AUROC',
'Fusion Accuracy','Fusion Precision','Fusion Recall','Fusion F1 Score','Fusion AUROC',
'caid_model_learning_rate','caid_model_num_epochs','caid_model_num_layers','caid_model_num_heads','caid_model_dropout','caid_model_epoch','caid_model_loss','path_to_model'
]]
combined_results.to_csv(f"{output_dir}/best_caid_model_results.csv",index=False)
def get_fresh_model(training_hyperparams, device):
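# NOTE: 1280 is assumed to match the per-residue embedding width of the ESM-2-650M-style encoders
# being benchmarked; the hidden dimension simply mirrors the input dimension.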
input_dim, hidden_dim = 1280, 1280
model = DisorderPredictor(
input_dim=input_dim,
hidden_dim=hidden_dim,
num_layers=training_hyperparams["num_layers"],
num_heads=training_hyperparams["num_heads"],
dropout=training_hyperparams['dropout']
)
model.to(device) # Push model to device (should be GPU)
return model
def predict_from_best_thresh(prob_and_label_df, seq_label_dict=None):
"""
Finds the best prediction threshold for disorder by maximizing F1 Score. Makes predictions
Args:
prob_and_label_df: DataFrame with columns: sequence,prob_1
seq_label_dict: dictionary of sequences to true labels. e.g. 'MKLP': '1100'
Returns:
prob_and_label_df: new version of original dataframe with added columns: threshold,pred_labels
"""
# Use seq_label_dict to insert labels
prob_and_label_df['labels'] = prob_and_label_df['sequence'].map(seq_label_dict)
# EVERYTHING should have a label!!
assert prob_and_label_df['labels'].notna().all()
probs = ','.join(prob_and_label_df['prob_1'].tolist())
probs = [float(x) for x in probs.split(",")]
true_labels = ''.join(prob_and_label_df['labels'].tolist())
true_labels = [int(x) for x in list(true_labels)]
total_aas = sum(prob_and_label_df['sequence'].str.len())
log_update(f"\tLength of dataframe (number of seqs in dataset): {len(prob_and_label_df)}")
log_update(f"\tTotal AAs in dataset: {total_aas}\ttotal probabilities: {len(probs)}\ttotal labels: {len(true_labels)}")
y_true = np.array(true_labels) # True labels
y_probs = np.array(probs) # Predicted probabilities
# Compute precision, recall, and thresholds
precision, recall, thresholds = precision_recall_curve(y_true, y_probs)
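# precision_recall_curve returns len(thresholds)+1 precision/recall points; the final point
# (precision=1, recall=0) has no associated threshold, so both arrays are trimmed below to stay
# aligned with `thresholds`.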
precision = precision[:-1]
recall = recall[:-1]
# Calculate F1 scores for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall)
# Find the threshold that maximizes the F1 score
best_threshold_index = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_index]
# Compute AUPRC
auprc = average_precision_score(y_true, y_probs)
log_update(f"\tBest Threshold: {best_threshold}")
log_update(f"\tBest F1 Score: {f1_scores[best_threshold_index]:.2f}")
log_update(f"\tAUPRC: {auprc:.2f}")
# Edit the original DataFrame
# Add threshold
prob_and_label_df['threshold'] = [best_threshold]*len(prob_and_label_df)
# Make predictions using this new threshold
prob_and_label_df['pred_labels'] = prob_and_label_df['prob_1'].apply(lambda x: ['1' if float(y)>best_threshold else '0' for y in x.split(",")])
prob_and_label_df['pred_labels'] = prob_and_label_df['pred_labels'].apply(lambda x: ''.join(x))
log_update("\tUsed calculated threshold to construct predicted labels for dataset")
return prob_and_label_df
def train_and_evaluate_caid_predictor(embedding_path, details, output_dir, training_hyperparams, overwrite_saved_model=True):
# unpack the details dictionary
benchmark_model_type = details['model_type']
benchmark_model_name = details['model']
benchmark_model_epoch = details['epoch']
# define model save directories and make if they don't exist
model_outer_folder = f"trained_models/{benchmark_model_type}"
if not(np.isnan(benchmark_model_epoch)): model_outer_folder+=f"/{benchmark_model_name}/epoch{benchmark_model_epoch}"
model_full_folder=f"{model_outer_folder}/lr{training_hyperparams['learning_rate']}_bs{1}_hd{1280}_epochs{training_hyperparams['num_epochs']}_layers{training_hyperparams['num_layers']}_heads{training_hyperparams['num_heads']}_drpt{training_hyperparams['dropout']}"
os.makedirs(model_full_folder, exist_ok=True) # also creates any missing parent directories
# see if we've trained the model before
model_full_path = f"{model_full_folder}/model.pth"
train_new_model=True #initially, we believe we're training a new model. Let's make sure we want to.
if os.path.exists(model_full_path):
# If the model exists and we ARE allowed to overwrite, still train
if overwrite_saved_model:
log_update(f"\nOverwriting previously trained model with same hyperparams at {model_full_path}")
# If the model exists and we are NOT allowed to overwrite, don't train
else:
log_update(f"\nWARNING: this model may already be trained at {model_full_path}. Skipping")
train_new_model=False
# If training new model, get new model stats.
if train_new_model:
max_length=4500+2
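# assumption: the +2 leaves room for the start/end special tokens added during embedding,
# on top of a 4500-residue sequence cap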
# make Dataloaders
train_dataloader = get_dataloader('splits/train_df.csv', embedding_path, max_length=max_length, batch_size=1, shuffle=True)
test_dataloader = get_dataloader('splits/test_df.csv', embedding_path, max_length=max_length, batch_size=1, shuffle=False)
benchmark_dataloader = get_dataloader('splits/fusion_bench_df.csv', embedding_path, max_length=max_length, batch_size=1, shuffle=False)
# Set device to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initialize the model and move it to the device
model = get_fresh_model(training_hyperparams, device)
# Initialize optimizer
optimizer = optim.Adam(model.parameters(), lr=training_hyperparams["learning_rate"])
criterion = nn.BCELoss()
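# nn.BCELoss expects probabilities in [0, 1], so DisorderPredictor is assumed to end in a sigmoid
# over its per-residue outputs (nn.BCEWithLogitsLoss would be the alternative for raw logits)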
num_epochs = training_hyperparams['num_epochs']
################# Train
# Train loop
avg_train_losses = train(model, train_dataloader, optimizer, num_epochs, criterion, device)
# Save the train curve results
formatted_hyperparams = {f"caid_model_{k}":v for k, v in training_hyperparams.items()}
train_loss_df = pd.DataFrame.from_dict(formatted_hyperparams,orient='index').T
train_loss_df['caid_model_epoch'] = [list(range(1,1+num_epochs))]
train_loss_df['caid_model_loss'] = [avg_train_losses]
train_loss_df[['Model Type','Model Name','Model Epoch']] = [[benchmark_model_type,benchmark_model_name,benchmark_model_epoch]]
train_loss_df = train_loss_df.explode(['caid_model_epoch', 'caid_model_loss'])
# Save loss results - both to the model folder (including hyperparams), AND to the current results folder
train_loss_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_train_losses.csv'
train_loss_individual_results_csv_path = f'{model_full_folder}/caid_train_losses.csv'
train_loss_df.to_csv(train_loss_individual_results_csv_path,mode='w',index=False)
train_loss_df['path_to_model'] = model_full_path
if not(os.path.exists(train_loss_combined_results_csv_path)):
train_loss_df.to_csv(train_loss_combined_results_csv_path,index=False)
else:
train_loss_df.to_csv(train_loss_combined_results_csv_path,mode='a',index=False,header=False)
log_update(f"Final train loss: {avg_train_losses[-1]:.4f}")
################# Test
# Evaluate model on test sequences
test_sequences, test_preds, test_labels = evaluate(model, test_dataloader, device)
test_metrics = calculate_metrics(test_preds, test_labels)
# Make dataframe of test metric results
test_results_df = pd.DataFrame.from_dict(test_metrics,orient='index').T
test_results_df[['Model Type','Model Name','Model Epoch']] = [[benchmark_model_type,benchmark_model_name,benchmark_model_epoch]]
# add the hyperparameters to this
hyperparams_df = pd.DataFrame.from_dict(formatted_hyperparams,orient='index').T
test_results_df = pd.concat([test_results_df,hyperparams_df],axis=1)
# Make dataframe of test probabilities (for AUROC curve)
# Create a pandas DataFrame
prob_and_label_df = pd.DataFrame(data = {
'sequence': test_sequences,
'prob_1': [arr.flatten() for arr in test_preds]
})
prob_and_label_df['prob_1'] = prob_and_label_df['prob_1'].apply(
lambda prob_list: ",".join([f"{round(x, 3):.3f}" for x in prob_list])
)
prob_and_label_df['Model Type'] = [benchmark_model_type]*len(prob_and_label_df)
prob_and_label_df['Model Name'] = [benchmark_model_name]*len(prob_and_label_df)
prob_and_label_df['Model Epoch'] = [benchmark_model_epoch]*len(prob_and_label_df)
# Save test results - both to the model folder (including hyperparams), AND to the current results folder
test_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_test_metrics.csv'
test_results_csv_path = f'{model_full_folder}/caid_hyperparam_screen_test_metrics.csv'
test_results_df.to_csv(test_results_csv_path,mode='w',index=False)
test_results_df['path_to_model'] = model_full_path
if not(os.path.exists(test_combined_results_csv_path)):
test_results_df.to_csv(test_combined_results_csv_path,index=False)
else:
test_results_df.to_csv(test_combined_results_csv_path,mode='a',index=False,header=False)
# Save test probs - only to model folder
test_probs_csv_path = f'{model_full_folder}/caid_hyperparam_screen_test_probs.csv'
seq_label_dict = pd.read_csv('splits/test_df.csv')
seq_label_dict = dict(zip(seq_label_dict['Sequence'],seq_label_dict['Label']))
log_update("Finding best threshold for CAID test set predictions based on maximizing F1 Score...")
prob_and_label_df = predict_from_best_thresh(prob_and_label_df, seq_label_dict=seq_label_dict)
prob_and_label_df[['sequence','prob_1','threshold','pred_labels']].to_csv(test_probs_csv_path,mode='w',index=False)
log_update(f"Test performance: {test_metrics}")
################# Benchmark
# Evaluate model on benchmark sequences
benchmark_sequences, benchmark_preds, benchmark_labels = evaluate(model, benchmark_dataloader, device)
benchmark_metrics = calculate_metrics(benchmark_preds, benchmark_labels)
# Make dataframe of benchmark metric results
benchmark_results_df = pd.DataFrame.from_dict(benchmark_metrics,orient='index').T
benchmark_results_df[['Model Type','Model Name','Model Epoch']] = [[benchmark_model_type,benchmark_model_name,benchmark_model_epoch]]
# add the hyperparameters to this
hyperparams_df = pd.DataFrame.from_dict(formatted_hyperparams,orient='index').T
benchmark_results_df = pd.concat([benchmark_results_df,hyperparams_df],axis=1)
# Make dataframe of benchmark probabilities (for AUROC curve)
# Create a pandas DataFrame
prob_and_label_df = pd.DataFrame(data = {
'sequence': benchmark_sequences,
'prob_1': [arr.flatten() for arr in benchmark_preds]
})
prob_and_label_df['prob_1'] = prob_and_label_df['prob_1'].apply(
lambda prob_list: ",".join([f"{round(x, 3):.3f}" for x in prob_list])
)
prob_and_label_df['Model Type'] = [benchmark_model_type]*len(prob_and_label_df)
prob_and_label_df['Model Name'] = [benchmark_model_name]*len(prob_and_label_df)
prob_and_label_df['Model Epoch'] = [benchmark_model_epoch]*len(prob_and_label_df)
# Save benchmark results - both to the model folder (including hyperparams), AND to the current results folder
benchmark_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_fusion_benchmark_metrics.csv'
benchmark_results_csv_path = f'{model_full_folder}/caid_hyperparam_screen_fusion_benchmark_metrics.csv'
benchmark_results_df.to_csv(benchmark_results_csv_path,mode='w',index=False)
benchmark_results_df['path_to_model'] = model_full_path
if not(os.path.exists(benchmark_combined_results_csv_path)):
benchmark_results_df.to_csv(benchmark_combined_results_csv_path,index=False)
else:
benchmark_results_df.to_csv(benchmark_combined_results_csv_path,mode='a',index=False,header=False)
# Save benchmark probs - only to model folder
benchmark_probs_csv_path = f'{model_full_folder}/caid_hyperparam_screen_fusion_benchmark_probs.csv'
seq_label_dict = pd.read_csv('splits/fusion_bench_df.csv')
seq_label_dict = dict(zip(seq_label_dict['Sequence'],seq_label_dict['Label']))
log_update("Finding best threshold for fusion benchmark set predictions based on maximizing F1 Score...")
prob_and_label_df = predict_from_best_thresh(prob_and_label_df, seq_label_dict=seq_label_dict)
prob_and_label_df[['sequence','prob_1','threshold','pred_labels']].to_csv(benchmark_probs_csv_path,mode='w',index=False)
log_update(f"benchmark performance: {benchmark_metrics}")
################# Save model
# Save model and metrics for this hyperparameter combination in the trained models folder
torch.save(model.state_dict(), model_full_path)
# if we didn't train again, still add those results to this benchmarking run so that they all get compared together
else:
# Load the appropriate train loses
train_loss_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_train_losses.csv'
train_loss_individual_results_csv_path = f'{model_full_folder}/caid_train_losses.csv'
train_loss_individual_results = pd.read_csv(train_loss_individual_results_csv_path)
train_loss_individual_results['path_to_model'] = [model_full_path]*len(train_loss_individual_results)
# Add these results to the combined results file for this run if it exists; otherwise create new combined results file
if not(os.path.exists(train_loss_combined_results_csv_path)):
train_loss_individual_results.to_csv(train_loss_combined_results_csv_path,index=False)
else:
train_loss_individual_results.to_csv(train_loss_combined_results_csv_path,mode='a',index=False,header=False)
# Load the appropriate test stats
test_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_test_metrics.csv'
test_results_csv_path = f'{model_full_folder}/caid_hyperparam_screen_test_metrics.csv'
test_individual_results = pd.read_csv(test_results_csv_path)
test_individual_results['path_to_model'] = [model_full_path]*len(test_individual_results)
# Add these results to the combined results file for this run if it exists; otherwise create new combined results file
if not(os.path.exists(test_combined_results_csv_path)):
test_individual_results.to_csv(test_combined_results_csv_path,index=False)
else:
test_individual_results.to_csv(test_combined_results_csv_path,mode='a',index=False,header=False)
# Load the appropriate benchmark stats
benchmark_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_fusion_benchmark_metrics.csv'
benchmark_results_csv_path = f'{model_full_folder}/caid_hyperparam_screen_fusion_benchmark_metrics.csv'
benchmark_individual_results = pd.read_csv(benchmark_results_csv_path)
benchmark_individual_results['path_to_model'] = [model_full_path]*len(benchmark_individual_results)
# Add these results to the combined results file for this run if it exists; otherwise create new combined results file
if not(os.path.exists(benchmark_combined_results_csv_path)):
benchmark_individual_results.to_csv(benchmark_combined_results_csv_path,index=False)
else:
benchmark_individual_results.to_csv(benchmark_combined_results_csv_path,mode='a',index=False,header=False)
# Metrics calculation
def calculate_metrics(preds, labels, threshold=0.5):
"""
Calculates metrics to assess model performance
Args:
preds (list): model's predictions (probabilities)
labels (list): ground truth labels
threshold (float): minimum threshold a prediction must be met to be considered disordered
Returns:
accuracy (float): accuracy
precision (float): precision
recall (float): recall
f1 (float): F1 score
roc_auc (float): AUROC score
"""
flat_binary_preds, flat_prob_preds, flat_labels = [], [], []
for pred, label in zip(preds, labels):
flat_binary_preds.extend((pred > threshold).astype(int).flatten()) # binary preds are 1 or 0; 1 if the prob > threshold
flat_prob_preds.extend(pred.flatten())
flat_labels.extend(label.flatten())
flat_binary_preds = np.array(flat_binary_preds)
flat_prob_preds = np.array(flat_prob_preds)
flat_labels = np.array(flat_labels)
accuracy = accuracy_score(flat_labels, flat_binary_preds)
precision = precision_score(flat_labels, flat_binary_preds)
recall = recall_score(flat_labels, flat_binary_preds)
f1 = f1_score(flat_labels, flat_binary_preds)
roc_auc = roc_auc_score(flat_labels, flat_prob_preds)
# make dictionary of the results and return it
metrics_dict = {
'Accuracy': accuracy,
'Precision': precision,
'Recall': recall,
'F1 Score': f1,
'AUROC': roc_auc
}
return metrics_dict
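# Illustrative example with made-up values (two sequences of lengths 3 and 2):
#   preds  = [np.array([0.9, 0.2, 0.7]), np.array([0.1, 0.8])]
#   labels = [np.array([1, 0, 1]), np.array([0, 1])]
#   calculate_metrics(preds, labels)
#   # -> {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0, 'AUROC': 1.0}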
def main():
# make output directory for this run
os.makedirs('results',exist_ok=True)
output_dir = f'results/{get_local_time()}'
os.makedirs(output_dir,exist_ok=True)
with open_logfile(f'{output_dir}/caid_benchmark_log.txt'):
# print configurations
print_configpy(config)
# Verify that the environment variables are set correctly
check_env_variables()
# make embeddings if needed
all_embedding_paths = embed_dataset_for_benchmark(
fuson_ckpts=config.FUSONPLM_CKPTS,
input_data_path='splits/splits.csv',
input_fname='CAID2_competition_sequences',
average=False, seq_col='Sequence',
benchmark_fusonplm=config.BENCHMARK_FUSONPLM,
benchmark_esm=config.BENCHMARK_ESM,
benchmark_fo_puncta_ml=False,
overwrite=config.PERMISSION_TO_OVERWRITE_EMBEDDINGS)
# load the splits with labels
splits_df = pd.read_csv('splits/splits.csv')
log_update(f"\nSplit breakdown...\n\t{len(splits_df.loc[splits_df['Split']=='Train'])} train seqs\n\t{len(splits_df.loc[splits_df['Split']=='Test'])} test seqs")
log_update("\nTraining and evaluating models")
# Set hyperparameters for disorder predictor
param_grid = {
'learning_rate': [5e-5],
'num_heads': [5, 8, 10],
'num_layers': [2, 4, 6],
'dropout': [0.2, 0.5],
'num_epochs': [2]
}
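# With a single learning rate and epoch count, this grid yields 3 (heads) x 3 (layers) x 2 (dropout)
# = 18 hyperparameter combinations per embedding source.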
# loop through the embedding paths and train each one
for embedding_path, details in all_embedding_paths.items():
log_update(f"\nBenchmarking embeddings at: {embedding_path}")
grid_search_caid_predictor(embedding_path, details, output_dir, param_grid, overwrite_saved_model=config.PERMISSION_TO_OVERWRITE_MODELS)
# find the best grid search performer
find_best_hyperparams(output_dir, param_grid)
# make plots
#### caid test set
best_caid_model_results = pd.read_csv(f"{output_dir}/best_caid_model_results.csv")
#### fusion benchmark set
best_caid_model_results_benchmark = best_caid_model_results.drop(columns=
['AUROC','F1 Score','Accuracy','Precision','Recall']
).rename(columns={
'Fusion AUROC': 'AUROC',
'Fusion F1 Score': 'F1 Score',
'Fusion Accuracy': 'Accuracy',
'Fusion Precision': 'Precision',
'Fusion Recall': 'Recall'
})
if __name__ == "__main__":
main()