# first few imports, just to set CUDA_VISIBLE_DEVICES before importing any torch libraries
import fuson_plm.benchmarking.caid.config as config
import os
os.environ['CUDA_VISIBLE_DEVICES'] = config.CUDA_VISIBLE_DEVICES
# remaining imports
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
import pandas as pd
import numpy as np
import sys
from datetime import datetime
import logging
from fuson_plm.benchmarking.embed import embed_dataset_for_benchmark
from fuson_plm.benchmarking.caid.model import DisorderPredictor
from fuson_plm.benchmarking.caid.utils import DisorderDataset, get_dataloader, check_dataloaders
from fuson_plm.benchmarking.caid.plot import make_auroc_curve, make_benchmark_auroc_curve
from fuson_plm.utils.logging import get_local_time, open_logfile, log_update, print_configpy
# configure Transformers logger to only show messages that are ERROR or more severe
logging.getLogger("transformers").setLevel(logging.ERROR)
def check_env_variables():
log_update("\nChecking on environment variables...")
log_update(f"\tCUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}")
log_update(f"\ttorch.cuda.device_count(): {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
log_update(f"\t\tDevice {i}: {torch.cuda.get_device_name(i)}")
def check_splits(df):
# make sure everything has a split
if len(df.loc[df['split'].isna()])>0:
raise Exception("Error: not every benchmarking sequence has been allocated to a split (train or test)")
# make sure the only things are train and test
if len({'train','test'} - set(df['split'].unique()))!=0:
raise Exception("Error: splits column should only have \'train\' and \'test\'.")
# make sure there are no duplicate sequences
if len(df.loc[df['Sequence'].duplicated()])>0:
raise Exception("Error: duplicate sequences provided")
# Training function
def train(model, train_loader, optimizer, n_epochs, criterion, device):
"""
Trains the model for a single epoch.
Args:
model (nn.Module): model that will be trained
dataloader (DataLoader): PyTorch DataLoader with training data
optimizer (torch.optim): optimizer
criterion (nn.Module): loss function
device (torch.device): device (GPU or CPU to train the model
Returns:
total_loss (float): model loss
"""
# Training loop
model.train()
# Avg loss across epochs
avg_train_losses = []
# Loop through epochs
for epoch in range(1, 1+n_epochs):
log_update(f"EPOCH {epoch}/{n_epochs}")
# Initialize loss for the epoch to 0
total_train_loss = 0
# Make update settings
total_steps = len(train_loader)
update_interval = total_steps // min(20,total_steps) # update semi-frequently
prog_bar = tqdm(total=total_steps, leave=True, file=sys.stdout)
# Iterate through batches
#with tqdm(enumerate(train_loader,start=1), total=len(train_loader), desc='Training Batch', leave=True, position=0) as pbar:
#for batch_idx, (embeddings, labels) in pbar:
for batch_idx, (_, embeddings, labels) in enumerate(train_loader, start=1):
# Move tensors to device
embeddings, labels = embeddings.to(device), labels.to(device)
# Forward pass
optimizer.zero_grad()
outputs = model(embeddings)
loss = criterion(outputs, labels)
loss.backward()
# Parameter updates
optimizer.step()
# Update loss
total_train_loss += loss.item()
if batch_idx % update_interval == 0 or batch_idx == total_steps:
prog_bar.update(update_interval)
sys.stdout.flush()
prog_bar.close()
# Calculate avg loss for the epoch
avg_train_loss = total_train_loss / total_steps
avg_train_losses.append(avg_train_loss)
return avg_train_losses
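# Illustrative (hypothetical) call, assuming `train_loader` and `device` already exist:
#   model = get_fresh_model({"num_layers": 2, "num_heads": 8, "dropout": 0.2}, device)
#   losses = train(model, train_loader, optim.Adam(model.parameters(), lr=5e-5), n_epochs=2, criterion=nn.BCELoss(), device=device)
#   # losses holds one average loss per epoch, so len(losses) == 2 here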
# Evaluation function
def evaluate(model, test_loader, device):
"""
Performs inference on a trained model
Args:
model (nn.Module): the trained model
test_loader (DataLoader): PyTorch DataLoader with testing data
device (torch.device): device (GPU or CPU) to be used for inference
Returns:
preds (list): predicted per-residue disorder labels
true_labels (list): ground truth per-residue disorder labels
"""
model.eval()
test_sequences, test_preds, true_labels = [], [], []
# Make update settings
total_steps = len(test_loader)
update_interval = total_steps // min(20,total_steps) # update semi-frequently
prog_bar = tqdm(total=total_steps, leave=True, file=sys.stdout)
with torch.no_grad():
for batch_idx, (sequences, embeddings, labels) in enumerate(test_loader,start=1):
embeddings, labels = embeddings.to(device), labels.to(device)
# forward pass
outputs = model(embeddings)
assert len(sequences)==1 # the batch size should be 1; make sure
test_sequences.append(sequences[0])
test_preds.append(outputs.cpu().numpy())
true_labels.append(labels.cpu().numpy())
if batch_idx % update_interval == 0 or batch_idx == total_steps:
prog_bar.update(update_interval)
sys.stdout.flush()
prog_bar.close()
return test_sequences, test_preds, true_labels
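# Note: because the dataloaders use batch_size=1, each element of test_preds is a numpy array of
# per-residue disorder probabilities for a single sequence, aligned element-wise with true_labels.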
# Benchmarking function
def benchmark(model, bench_loader, device):
"""
Performs inference on a trained model using the fusion benchmark set
Args:
model (nn.Module): the trained model
bench_loader (DataLoader): PyTorch DataLoader with benchmarking data
device (torch.device): device (GPU or CPU) to be used for inference
Returns:
bench_sequences (list): benchmark sequences in the order they were evaluated
bench_preds (list): predicted per-residue disorder probabilities
true_labels (list): ground truth per-residue disorder labels
"""
model.eval()
bench_sequences, bench_preds, true_labels = [], [], []
# Make update settings
total_steps = len(bench_loader)
update_interval = total_steps // min(20,total_steps) # update semi-frequently
prog_bar = tqdm(total=total_steps, leave=True, file=sys.stdout)
with torch.no_grad():
for batch_idx, (sequences, embeddings, labels) in enumerate(bench_loader,start=1):
embeddings, labels = embeddings.to(device), labels.to(device)
# forward pass
outputs = model(embeddings)
assert len(sequences)==1 # the batch size should be 1; make sure
bench_sequences.append(sequences[0])
bench_preds.append(outputs.cpu().numpy())
true_labels.append(labels.cpu().numpy())
if batch_idx % update_interval == 0 or batch_idx == total_steps:
prog_bar.update(update_interval)
sys.stdout.flush()
prog_bar.close()
return bench_sequences, bench_preds, true_labels
def grid_search_caid_predictor(embedding_path, details, output_dir, param_grid, overwrite_saved_model=True):
# prepare the grid search
grid = ParameterGrid(param_grid)
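# sklearn's ParameterGrid expands a dict of value lists into every combination. Illustrative example
# (made-up values):
#   list(ParameterGrid({'learning_rate': [5e-5], 'dropout': [0.2, 0.5]}))
#   -> [{'dropout': 0.2, 'learning_rate': 5e-5}, {'dropout': 0.5, 'learning_rate': 5e-5}]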
# initialize dict
training_hyperparams = {
"learning_rate": None,
"num_epochs": None,
"num_layers": None,
"num_heads": None,
"dropout": None
}
for params in grid:
# Update hyperparameters
training_hyperparams.update(params)
log_update(f"\nHyperparams:{training_hyperparams}")
train_and_evaluate_caid_predictor(embedding_path, details, output_dir, training_hyperparams, overwrite_saved_model=overwrite_saved_model)
def find_best_hyperparams(output_dir, param_grid):
# Isolate the columns that define the hyperparameters
param_cols = [f"caid_model_{k}" for k in param_grid.keys()]
# Read in the files with all the stats
test_metrics = pd.read_csv(f'{output_dir}/caid_hyperparam_screen_test_metrics.csv')
train_losses = pd.read_csv(f'{output_dir}/caid_hyperparam_screen_train_losses.csv')
bench_metrics = pd.read_csv(f'{output_dir}/caid_hyperparam_screen_fusion_benchmark_metrics.csv')
# Replace nan with empty string for epoch
test_metrics['Model Epoch'] = test_metrics['Model Epoch'].fillna('')
train_losses['Model Epoch'] = train_losses['Model Epoch'].fillna('')
bench_metrics['Model Epoch'] = bench_metrics['Model Epoch'].fillna('')
# Find the hyperparams that produced the best test metrics for each model; then save all relevant numbers in one file
benchmarked_model_key = ['Model Type','Model Name','Model Epoch'] # uniquely defines the model being benchmarked
ordered_priority_stats = ['AUROC','F1 Score','Accuracy','Precision','Recall']
sort_order = benchmarked_model_key + ordered_priority_stats
sort_bools = [True]*len(benchmarked_model_key) + [False]*len(ordered_priority_stats)
test_metrics = test_metrics.sort_values(
sort_order,
ascending=sort_bools
).groupby(benchmarked_model_key).head(1).reset_index(drop=True)
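# After the descending sort, head(1) within each (Model Type, Model Name, Model Epoch) group keeps
# the hyperparameter setting with the highest AUROC (ties broken by F1, accuracy, precision, recall).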
# Find the last-epoch losses for each model and hyperparameters
group_order = benchmarked_model_key+param_cols
sort_order = group_order+["caid_model_epoch"]
sort_bools = [True]*(len(group_order))+[False]*1
train_losses = train_losses.sort_values(
by=sort_order,
ascending=sort_bools,
).groupby(group_order).head(1).reset_index(drop=True)
# Combine test and train results
merge_cols = benchmarked_model_key+param_cols+['path_to_model']
combined_results = pd.merge(
test_metrics,train_losses,
on=merge_cols,
how='left'
)
# Combine with benchmark results
bench_metrics = bench_metrics.rename(columns = {'AUROC': 'Fusion AUROC',
'F1 Score': 'Fusion F1 Score',
'Accuracy': 'Fusion Accuracy',
'Precision': 'Fusion Precision',
'Recall': 'Fusion Recall'})
combined_results = pd.merge(
combined_results,bench_metrics,
on=merge_cols,
how='left'
)
# reorder columns
combined_results = combined_results[[
'Model Type','Model Name','Model Epoch',
'Accuracy','Precision','Recall','F1 Score','AUROC',
'Fusion Accuracy','Fusion Precision','Fusion Recall','Fusion F1 Score','Fusion AUROC',
'caid_model_learning_rate','caid_model_num_epochs','caid_model_num_layers','caid_model_num_heads','caid_model_dropout','caid_model_epoch','caid_model_loss','path_to_model'
]]
combined_results.to_csv(f"{output_dir}/best_caid_model_results.csv",index=False)
def get_fresh_model(training_hyperparams, device):
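# NOTE: 1280 is assumed to match the per-residue embedding width of the ESM-2-650M-style encoders
# being benchmarked; the hidden dimension simply mirrors the input dimension.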
input_dim, hidden_dim = 1280, 1280
model = DisorderPredictor(
input_dim=input_dim,
hidden_dim=hidden_dim,
num_layers=training_hyperparams["num_layers"],
num_heads=training_hyperparams["num_heads"],
dropout=training_hyperparams['dropout']
)
model.to(device) # Push model to device (should be GPU)
return model
def predict_from_best_thresh(prob_and_label_df, seq_label_dict=None):
"""
Finds the best prediction threshold for disorder by maximizing F1 Score. Makes predictions
Args:
prob_and_label_df: DataFrame with columns: sequence,prob_1
seq_label_dict: dictionary of sequences to true labels. e.g. 'MKLP': '1100'
Returns:
prob_and_label_df: new version of original dataframe with added columns: threshold,pred_labels
"""
# Use seq_label_dict to insert labels
prob_and_label_df['labels'] = prob_and_label_df['sequence'].map(seq_label_dict)
# EVERYTHING should have a label!!
assert prob_and_label_df['labels'].notna().all()
probs = ','.join(prob_and_label_df['prob_1'].tolist())
probs = [float(x) for x in probs.split(",")]
true_labels = ''.join(prob_and_label_df['labels'].tolist())
true_labels = [int(x) for x in list(true_labels)]
total_aas = sum(prob_and_label_df['sequence'].str.len())
log_update(f"\tLength of dataframe (number of seqs in dataset): {len(prob_and_label_df)}")
log_update(f"\tTotal AAs in dataset: {total_aas}\ttotal probabilities: {len(probs)}\ttotal labels: {len(true_labels)}")
y_true = np.array(true_labels) # True labels
y_probs = np.array(probs) # Predicted probabilities
# Compute precision, recall, and thresholds
precision, recall, thresholds = precision_recall_curve(y_true, y_probs)
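# precision_recall_curve returns len(thresholds)+1 precision/recall points; the final point
# (precision=1, recall=0) has no associated threshold, so both arrays are trimmed below to stay
# aligned with `thresholds`.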
precision = precision[:-1]
recall = recall[:-1]
# Calculate F1 scores for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall)
# Find the threshold that maximizes the F1 score
best_threshold_index = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_index]
# Compute AUPRC
auprc = average_precision_score(y_true, y_probs)
log_update(f"\tBest Threshold: {best_threshold}")
log_update(f"\tBest F1 Score: {f1_scores[best_threshold_index]:.2f}")
log_update(f"\tAUPRC: {auprc:.2f}")
# Edit the original DataFrame
# Add threshold
prob_and_label_df['threshold'] = [best_threshold]*len(prob_and_label_df)
# Make predictions using this new threshold
prob_and_label_df['pred_labels'] = prob_and_label_df['prob_1'].apply(lambda x: ['1' if float(y)>best_threshold else '0' for y in x.split(",")])
prob_and_label_df['pred_labels'] = prob_and_label_df['pred_labels'].apply(lambda x: ''.join(x))
log_update("\tUsed calculated threshold to construct predicted labels for dataset")
return prob_and_label_df
def train_and_evaluate_caid_predictor(embedding_path, details, output_dir, training_hyperparams, overwrite_saved_model=True):
# unpack the details dictionary
benchmark_model_type = details['model_type']
benchmark_model_name = details['model']
benchmark_model_epoch = details['epoch']
# define model save directories and make if they don't exist
model_outer_folder = f"trained_models/{benchmark_model_type}"
if not(np.isnan(benchmark_model_epoch)): model_outer_folder+=f"/{benchmark_model_name}/epoch{benchmark_model_epoch}"
model_full_folder=f"{model_outer_folder}/lr{training_hyperparams['learning_rate']}_bs{1}_hd{1280}_epochs{training_hyperparams['num_epochs']}_layers{training_hyperparams['num_layers']}_heads{training_hyperparams['num_heads']}_drpt{training_hyperparams['dropout']}"
os.makedirs(model_full_folder, exist_ok=True) # also creates any missing parent directories
# see if we've trained the model before
model_full_path = f"{model_full_folder}/model.pth"
train_new_model=True #initially, we believe we're training a new model. Let's make sure we want to.
if os.path.exists(model_full_path):
# If the model exists and we ARE allowed to overwrite, still train
if overwrite_saved_model:
log_update(f"\nOverwriting previously trained model with same hyperparams at {model_full_path}")
# If the model exists and we are NOT allowed to overwrite, don't train
else:
log_update(f"\nWARNING: this model may already be trained at {model_full_path}. Skipping")
train_new_model=False
# If training new model, get new model stats.
if train_new_model:
max_length=4500+2
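# assumption: the +2 leaves room for the start/end special tokens added during embedding,
# on top of a 4500-residue sequence cap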
# make Dataloaders
train_dataloader = get_dataloader('splits/train_df.csv', embedding_path, max_length=max_length, batch_size=1, shuffle=True)
test_dataloader = get_dataloader('splits/test_df.csv', embedding_path, max_length=max_length, batch_size=1, shuffle=False)
benchmark_dataloader = get_dataloader('splits/fusion_bench_df.csv', embedding_path, max_length=max_length, batch_size=1, shuffle=False)
# Set device to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initialize the model and move it to the device
model = get_fresh_model(training_hyperparams, device)
# Initialize optimizer
optimizer = optim.Adam(model.parameters(), lr=training_hyperparams["learning_rate"])
criterion = nn.BCELoss()
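# nn.BCELoss expects probabilities in [0, 1], so DisorderPredictor is assumed to end in a sigmoid
# over its per-residue outputs (nn.BCEWithLogitsLoss would be the alternative for raw logits)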
num_epochs = training_hyperparams['num_epochs']
################# Train
# Train loop
avg_train_losses = train(model, train_dataloader, optimizer, num_epochs, criterion, device)
# Save the train curve results
formatted_hyperparams = {f"caid_model_{k}":v for k, v in training_hyperparams.items()}
train_loss_df = pd.DataFrame.from_dict(formatted_hyperparams,orient='index').T
train_loss_df['caid_model_epoch'] = [list(range(1,1+num_epochs))]
train_loss_df['caid_model_loss'] = [avg_train_losses]
train_loss_df[['Model Type','Model Name','Model Epoch']] = [[benchmark_model_type,benchmark_model_name,benchmark_model_epoch]]
train_loss_df = train_loss_df.explode(['caid_model_epoch', 'caid_model_loss'])
# Save loss results - both to the model folder (including hyperparams), AND to the current results folder
train_loss_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_train_losses.csv'
train_loss_individual_results_csv_path = f'{model_full_folder}/caid_train_losses.csv'
train_loss_df.to_csv(train_loss_individual_results_csv_path,mode='w',index=False)
train_loss_df['path_to_model'] = model_full_path
if not(os.path.exists(train_loss_combined_results_csv_path)):
train_loss_df.to_csv(train_loss_combined_results_csv_path,index=False)
else:
train_loss_df.to_csv(train_loss_combined_results_csv_path,mode='a',index=False,header=False)
log_update(f"Final train loss: {avg_train_losses[-1]:.4f}")
################# Test
# Evaluate model on test sequences
test_sequences, test_preds, test_labels = evaluate(model, test_dataloader, device)
test_metrics = calculate_metrics(test_preds, test_labels)
# Make dataframe of test metric results
test_results_df = pd.DataFrame.from_dict(test_metrics,orient='index').T
test_results_df[['Model Type','Model Name','Model Epoch']] = [[benchmark_model_type,benchmark_model_name,benchmark_model_epoch]]
# add the hyperparameters to this
hyperparams_df = pd.DataFrame.from_dict(formatted_hyperparams,orient='index').T
test_results_df = pd.concat([test_results_df,hyperparams_df],axis=1)
# Make dataframe of test probabilities (for AUROC curve)
# Create a pandas DataFrame
prob_and_label_df = pd.DataFrame(data = {
'sequence': test_sequences,
'prob_1': [arr.flatten() for arr in test_preds]
})
prob_and_label_df['prob_1'] = prob_and_label_df['prob_1'].apply(
lambda prob_list: ",".join([f"{round(x, 3):.3f}" for x in prob_list])
)
prob_and_label_df['Model Type'] = [benchmark_model_type]*len(prob_and_label_df)
prob_and_label_df['Model Name'] = [benchmark_model_name]*len(prob_and_label_df)
prob_and_label_df['Model Epoch'] = [benchmark_model_epoch]*len(prob_and_label_df)
# Save test results - both to the model folder (including hyperparams), AND to the current results folder
test_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_test_metrics.csv'
test_results_csv_path = f'{model_full_folder}/caid_hyperparam_screen_test_metrics.csv'
test_results_df.to_csv(test_results_csv_path,mode='w',index=False)
test_results_df['path_to_model'] = model_full_path
if not(os.path.exists(test_combined_results_csv_path)):
test_results_df.to_csv(test_combined_results_csv_path,index=False)
else:
test_results_df.to_csv(test_combined_results_csv_path,mode='a',index=False,header=False)
# Save test probs - only to model folder
test_probs_csv_path = f'{model_full_folder}/caid_hyperparam_screen_test_probs.csv'
seq_label_dict = pd.read_csv('splits/test_df.csv')
seq_label_dict = dict(zip(seq_label_dict['Sequence'],seq_label_dict['Label']))
log_update("Finding best threshold for CAID test set predictions based on maximizing F1 Score...")
prob_and_label_df = predict_from_best_thresh(prob_and_label_df, seq_label_dict=seq_label_dict)
prob_and_label_df[['sequence','prob_1','threshold','pred_labels']].to_csv(test_probs_csv_path,mode='w',index=False)
log_update(f"Test performance: {test_metrics}")
################# Benchmark
# Evaluate model on benchmark sequences
benchmark_sequences, benchmark_preds, benchmark_labels = evaluate(model, benchmark_dataloader, device)
benchmark_metrics = calculate_metrics(benchmark_preds, benchmark_labels)
# Make dataframe of benchmark metric results
benchmark_results_df = pd.DataFrame.from_dict(benchmark_metrics,orient='index').T
benchmark_results_df[['Model Type','Model Name','Model Epoch']] = [[benchmark_model_type,benchmark_model_name,benchmark_model_epoch]]
# add the hyperparameters to this
hyperparams_df = pd.DataFrame.from_dict(formatted_hyperparams,orient='index').T
benchmark_results_df = pd.concat([benchmark_results_df,hyperparams_df],axis=1)
# Make dataframe of benchmark probabilities (for AUROC curve)
# Create a pandas DataFrame
prob_and_label_df = pd.DataFrame(data = {
'sequence': benchmark_sequences,
'prob_1': [arr.flatten() for arr in benchmark_preds]
})
prob_and_label_df['prob_1'] = prob_and_label_df['prob_1'].apply(
lambda prob_list: ",".join([f"{round(x, 3):.3f}" for x in prob_list])
)
prob_and_label_df['Model Type'] = [benchmark_model_type]*len(prob_and_label_df)
prob_and_label_df['Model Name'] = [benchmark_model_name]*len(prob_and_label_df)
prob_and_label_df['Model Epoch'] = [benchmark_model_epoch]*len(prob_and_label_df)
# Save benchmark results - both to the model folder (including hyperparams), AND to the current results folder
benchmark_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_fusion_benchmark_metrics.csv'
benchmark_results_csv_path = f'{model_full_folder}/caid_hyperparam_screen_fusion_benchmark_metrics.csv'
benchmark_results_df.to_csv(benchmark_results_csv_path,mode='w',index=False)
benchmark_results_df['path_to_model'] = model_full_path
if not(os.path.exists(benchmark_combined_results_csv_path)):
benchmark_results_df.to_csv(benchmark_combined_results_csv_path,index=False)
else:
benchmark_results_df.to_csv(benchmark_combined_results_csv_path,mode='a',index=False,header=False)
# Save benchmark probs - only to model folder
benchmark_probs_csv_path = f'{model_full_folder}/caid_hyperparam_screen_fusion_benchmark_probs.csv'
seq_label_dict = pd.read_csv('splits/fusion_bench_df.csv')
seq_label_dict = dict(zip(seq_label_dict['Sequence'],seq_label_dict['Label']))
log_update("Finding best threshold for fusion benchmark set predictions based on maximizing F1 Score...")
prob_and_label_df = predict_from_best_thresh(prob_and_label_df, seq_label_dict=seq_label_dict)
prob_and_label_df[['sequence','prob_1','threshold','pred_labels']].to_csv(benchmark_probs_csv_path,mode='w',index=False)
log_update(f"benchmark performance: {benchmark_metrics}")
################# Save model
# Save model and metrics for this hyperparameter combination in the trained models folder
torch.save(model.state_dict(), model_full_path)
# if we didn't train again, still add those results to this benchmarking run so that they all get compared together
else:
# Load the appropriate train loses
train_loss_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_train_losses.csv'
train_loss_individual_results_csv_path = f'{model_full_folder}/caid_train_losses.csv'
train_loss_individual_results = pd.read_csv(train_loss_individual_results_csv_path)
train_loss_individual_results['path_to_model'] = [model_full_path]*len(train_loss_individual_results)
# Add these results to the combined results file for this run if it exists; otherwise create new combined results file
if not(os.path.exists(train_loss_combined_results_csv_path)):
train_loss_individual_results.to_csv(train_loss_combined_results_csv_path,index=False)
else:
train_loss_individual_results.to_csv(train_loss_combined_results_csv_path,mode='a',index=False,header=False)
# Load the appropriate test stats
test_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_test_metrics.csv'
test_results_csv_path = f'{model_full_folder}/caid_hyperparam_screen_test_metrics.csv'
test_individual_results = pd.read_csv(test_results_csv_path)
test_individual_results['path_to_model'] = [model_full_path]*len(test_individual_results)
# Add these results to the combined results file for this run if it exists; otherwise create new combined results file
if not(os.path.exists(test_combined_results_csv_path)):
test_individual_results.to_csv(test_combined_results_csv_path,index=False)
else:
test_individual_results.to_csv(test_combined_results_csv_path,mode='a',index=False,header=False)
# Load the appropriate benchmark stats
benchmark_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_fusion_benchmark_metrics.csv'
benchmark_results_csv_path = f'{model_full_folder}/caid_hyperparam_screen_fusion_benchmark_metrics.csv'
benchmark_individual_results = pd.read_csv(benchmark_results_csv_path)
benchmark_individual_results['path_to_model'] = [model_full_path]*len(benchmark_individual_results)
# Add these results to the combined results file for this run if it exists; otherwise create new combined results file
if not(os.path.exists(benchmark_combined_results_csv_path)):
benchmark_individual_results.to_csv(benchmark_combined_results_csv_path,index=False)
else:
benchmark_individual_results.to_csv(benchmark_combined_results_csv_path,mode='a',index=False,header=False)
# Metrics calculation
def calculate_metrics(preds, labels, threshold=0.5):
"""
Calculates metrics to assess model performance
Args:
preds (list): model's predictions (probabilities)
labels (list): ground truth labels
threshold (float): minimum threshold a prediction must be met to be considered disordered
Returns:
accuracy (float): accuracy
precision (float): precision
recall (float): recall
f1 (float): F1 score
roc_auc (float): AUROC score
"""
flat_binary_preds, flat_prob_preds, flat_labels = [], [], []
for pred, label in zip(preds, labels):
flat_binary_preds.extend((pred > threshold).astype(int).flatten()) # binary preds are 1 or 0; 1 if the prob > threshold
flat_prob_preds.extend(pred.flatten())
flat_labels.extend(label.flatten())
flat_binary_preds = np.array(flat_binary_preds)
flat_prob_preds = np.array(flat_prob_preds)
flat_labels = np.array(flat_labels)
accuracy = accuracy_score(flat_labels, flat_binary_preds)
precision = precision_score(flat_labels, flat_binary_preds)
recall = recall_score(flat_labels, flat_binary_preds)
f1 = f1_score(flat_labels, flat_binary_preds)
roc_auc = roc_auc_score(flat_labels, flat_prob_preds)
# make dictionary of the results and return it
metrics_dict = {
'Accuracy': accuracy,
'Precision': precision,
'Recall': recall,
'F1 Score': f1,
'AUROC': roc_auc
}
return metrics_dict
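# Illustrative example with made-up values (two sequences of lengths 3 and 2):
#   preds  = [np.array([0.9, 0.2, 0.7]), np.array([0.1, 0.8])]
#   labels = [np.array([1, 0, 1]), np.array([0, 1])]
#   calculate_metrics(preds, labels)
#   # -> {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0, 'AUROC': 1.0}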
def main():
# make output directory for this run
os.makedirs('results',exist_ok=True)
output_dir = f'results/{get_local_time()}'
os.makedirs(output_dir,exist_ok=True)
with open_logfile(f'{output_dir}/caid_benchmark_log.txt'):
# print configurations
print_configpy(config)
# Verify that the environment variables are set correctly
check_env_variables()
# make embeddings if needed
all_embedding_paths = embed_dataset_for_benchmark(
fuson_ckpts=config.FUSONPLM_CKPTS,
input_data_path='splits/splits.csv',
input_fname='CAID2_competition_sequences',
average=False, seq_col='Sequence',
benchmark_fusonplm=config.BENCHMARK_FUSONPLM,
benchmark_esm=config.BENCHMARK_ESM,
benchmark_fo_puncta_ml=False,
overwrite=config.PERMISSION_TO_OVERWRITE_EMBEDDINGS)
# load the splits with labels
splits_df = pd.read_csv('splits/splits.csv')
log_update(f"\nSplit breakdown...\n\t{len(splits_df.loc[splits_df['Split']=='Train'])} train seqs\n\t{len(splits_df.loc[splits_df['Split']=='Test'])} test seqs")
log_update("\nTraining and evaluating models")
# Set hyperparameters for disorder predictor
param_grid = {
'learning_rate': [5e-5],
'num_heads': [5, 8, 10],
'num_layers': [2, 4, 6],
'dropout': [0.2, 0.5],
'num_epochs': [2]
}
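# With a single learning rate and epoch count, this grid yields 3 (heads) x 3 (layers) x 2 (dropout)
# = 18 hyperparameter combinations per embedding source.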
# loop through the embedding paths and train each one
for embedding_path, details in all_embedding_paths.items():
log_update(f"\nBenchmarking embeddings at: {embedding_path}")
grid_search_caid_predictor(embedding_path, details, output_dir, param_grid, overwrite_saved_model=config.PERMISSION_TO_OVERWRITE_MODELS)
# find the best grid search performer
find_best_hyperparams(output_dir, param_grid)
# make plots
#### caid test set
best_caid_model_results = pd.read_csv(f"{output_dir}/best_caid_model_results.csv")
#### fusion benchmark set
best_caid_model_results_benchmark = best_caid_model_results.drop(columns=
['AUROC','F1 Score','Accuracy','Precision','Recall']
).rename(columns={
'Fusion AUROC': 'AUROC',
'Fusion F1 Score': 'F1 Score',
'Fusion Accuracy': 'Accuracy',
'Fusion Precision': 'Precision',
'Fusion Recall': 'Recall'
})
if __name__ == "__main__":
main()