# first few imports, just to set CUDA_VISIBLE_DEVICES before importing any torch libraries
import fuson_plm.benchmarking.caid.config as config
import os
os.environ['CUDA_VISIBLE_DEVICES'] = config.CUDA_VISIBLE_DEVICES

# remaining imports
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
import pandas as pd
import numpy as np
import sys
from datetime import datetime
import logging

from fuson_plm.benchmarking.embed import embed_dataset_for_benchmark
from fuson_plm.benchmarking.caid.model import DisorderPredictor
from fuson_plm.benchmarking.caid.utils import DisorderDataset, get_dataloader, check_dataloaders
from fuson_plm.benchmarking.caid.plot import make_auroc_curve, make_benchmark_auroc_curve
from fuson_plm.utils.logging import get_local_time, open_logfile, log_update, print_configpy

# configure Transformers logger to only show messages that are ERROR or more severe
logging.getLogger("transformers").setLevel(logging.ERROR)


def check_env_variables():
    log_update("\nChecking on environment variables...")
    log_update(f"\tCUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}")
    log_update(f"\ttorch.cuda.device_count(): {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        log_update(f"\t\tDevice {i}: {torch.cuda.get_device_name(i)}")


def check_splits(df):
    # make sure everything has a split
    if len(df.loc[df['split'].isna()]) > 0:
        raise Exception("Error: not every benchmarking sequence has been allocated to a split (train or test)")
    # make sure the only split values are train and test
    if len({'train', 'test'} - set(df['split'].unique())) != 0:
        raise Exception("Error: splits column should only have 'train' and 'test'.")
    # make sure there are no duplicate sequences
    if len(df.loc[df['Sequence'].duplicated()]) > 0:
        raise Exception("Error: duplicate sequences provided")
# Training function
def train(model, train_loader, optimizer, n_epochs, criterion, device):
    """
    Trains the model for n_epochs epochs.

    Args:
        model (nn.Module): model that will be trained
        train_loader (DataLoader): PyTorch DataLoader with training data
        optimizer (torch.optim): optimizer
        n_epochs (int): number of training epochs
        criterion (nn.Module): loss function
        device (torch.device): device (GPU or CPU) to train the model on

    Returns:
        avg_train_losses (list): average training loss for each epoch
    """
    # Training loop
    model.train()

    # Avg loss across epochs
    avg_train_losses = []

    # Loop through epochs
    for epoch in range(1, 1 + n_epochs):
        log_update(f"EPOCH {epoch}/{n_epochs}")
        # Initialize loss for the epoch to 0
        total_train_loss = 0

        # Make update settings
        total_steps = len(train_loader)
        update_interval = total_steps // min(20, total_steps)  # update semi-frequently
        prog_bar = tqdm(total=total_steps, leave=True, file=sys.stdout)

        # Iterate through batches
        #with tqdm(enumerate(train_loader,start=1), total=len(train_loader), desc='Training Batch', leave=True, position=0) as pbar:
            #for batch_idx, (embeddings, labels) in pbar:
        for batch_idx, (_, embeddings, labels) in enumerate(train_loader, start=1):
            # Move tensors to device
            embeddings, labels = embeddings.to(device), labels.to(device)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            loss.backward()

            # Parameter updates
            optimizer.step()

            # Update loss
            total_train_loss += loss.item()

            if batch_idx % update_interval == 0 or batch_idx == total_steps:
                prog_bar.update(update_interval)
                sys.stdout.flush()

        prog_bar.close()

        # Calculate avg loss for the epoch
        avg_train_loss = total_train_loss / total_steps
        avg_train_losses.append(avg_train_loss)

    return avg_train_losses


# Evaluation function
def evaluate(model, test_loader, device):
    """
    Performs inference on a trained model

    Args:
        model (nn.Module): the trained model
        test_loader (DataLoader): PyTorch DataLoader with testing data
        device (torch.device): device (GPU or CPU) to be used for inference

    Returns:
        test_sequences (list): sequences in the order they were evaluated
        test_preds (list): predicted per-residue disorder probabilities
        true_labels (list): ground truth per-residue disorder labels
    """
    model.eval()
    test_sequences, test_preds, true_labels = [], [], []

    # Make update settings
    total_steps = len(test_loader)
    update_interval = total_steps // min(20, total_steps)  # update semi-frequently
    prog_bar = tqdm(total=total_steps, leave=True, file=sys.stdout)

    with torch.no_grad():
        for batch_idx, (sequences, embeddings, labels) in enumerate(test_loader, start=1):
            embeddings, labels = embeddings.to(device), labels.to(device)
            # forward pass
            outputs = model(embeddings)

            assert len(sequences) == 1  # the batch size should be 1; make sure
            test_sequences.append(sequences[0])
            test_preds.append(outputs.cpu().numpy())
            true_labels.append(labels.cpu().numpy())

            if batch_idx % update_interval == 0 or batch_idx == total_steps:
                prog_bar.update(update_interval)
                sys.stdout.flush()

    prog_bar.close()
    return test_sequences, test_preds, true_labels


# Evaluation function
def benchmark(model, bench_loader, device):
    """
    Performs inference on a trained model

    Args:
        model (nn.Module): the trained model
        bench_loader (DataLoader): PyTorch DataLoader with benchmarking data
        device (torch.device): device (GPU or CPU) to be used for inference

    Returns:
        bench_sequences (list): sequences in the order they were evaluated
        bench_preds (list): predicted per-residue disorder probabilities
        true_labels (list): ground truth per-residue disorder labels
    """
    model.eval()
    bench_sequences, bench_preds, true_labels = [], [], []

    # Make update settings
    total_steps = len(bench_loader)
    update_interval = total_steps // min(20, total_steps)  # update semi-frequently
    prog_bar = tqdm(total=total_steps, leave=True, file=sys.stdout)

    with torch.no_grad():
        for batch_idx, (sequences, embeddings, labels) in enumerate(bench_loader, start=1):
            embeddings, labels = embeddings.to(device), labels.to(device)
            # forward pass
            outputs = model(embeddings)

            assert len(sequences) == 1  # the batch size should be 1; make sure
            bench_sequences.append(sequences[0])
            bench_preds.append(outputs.cpu().numpy())
            true_labels.append(labels.cpu().numpy())

            if batch_idx % update_interval == 0 or batch_idx == total_steps:
                prog_bar.update(update_interval)
                sys.stdout.flush()

    prog_bar.close()
    return bench_sequences, bench_preds, true_labels


def grid_search_caid_predictor(embedding_path, details, output_dir, param_grid, overwrite_saved_model=True):
    # prepare the grid search
    grid = ParameterGrid(param_grid)

    # initialize dict
    training_hyperparams = {
        "learning_rate": None,
        "num_epochs": None,
        "num_layers": None,
        "num_heads": None,
        "dropout": None
    }

    for params in grid:
        # Update hyperparameters
        training_hyperparams.update(params)
        log_update(f"\nHyperparams:{training_hyperparams}")
        train_and_evaluate_caid_predictor(embedding_path, details, output_dir, training_hyperparams, overwrite_saved_model=overwrite_saved_model)
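
# Note on the grid search above: sklearn's ParameterGrid simply enumerates every combination
# of the values supplied for each hyperparameter. Rough illustration (toy grid, not the one
# actually used in main(); ordering of the combinations may differ):
#     list(ParameterGrid({'learning_rate': [5e-5], 'dropout': [0.2, 0.5]}))
#     # -> two dicts: {'dropout': 0.2, 'learning_rate': 5e-05} and {'dropout': 0.5, 'learning_rate': 5e-05}
# Each yielded dict is merged into training_hyperparams via dict.update() before a fresh
# predictor is trained and evaluated.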
def find_best_hyperparams(output_dir, param_grid):
    # Isolate the columns that define the hyperparameters
    param_cols = [f"caid_model_{k}" for k in param_grid.keys()]

    # Read in the files with all the stats
    test_metrics = pd.read_csv(f'{output_dir}/caid_hyperparam_screen_test_metrics.csv')
    train_losses = pd.read_csv(f'{output_dir}/caid_hyperparam_screen_train_losses.csv')
    bench_metrics = pd.read_csv(f'{output_dir}/caid_hyperparam_screen_fusion_benchmark_metrics.csv')

    # Replace nan with empty string for epoch
    test_metrics['Model Epoch'] = test_metrics['Model Epoch'].fillna('')
    train_losses['Model Epoch'] = train_losses['Model Epoch'].fillna('')
    bench_metrics['Model Epoch'] = bench_metrics['Model Epoch'].fillna('')

    # Find the hyperparams that produced the best test metrics for each model; then save all relevant numbers in one file
    benchmarked_model_key = ['Model Type', 'Model Name', 'Model Epoch']  # uniquely defines the model being benchmarked
    ordered_priority_stats = ['AUROC', 'F1 Score', 'Accuracy', 'Precision', 'Recall']
    sort_order = benchmarked_model_key + ordered_priority_stats
    sort_bools = [True]*len(benchmarked_model_key) + [False]*len(ordered_priority_stats)
    test_metrics = test_metrics.sort_values(
        sort_order, ascending=sort_bools
    ).groupby(benchmarked_model_key).head(1).reset_index(drop=True)

    # Find the last-epoch losses for each model and hyperparameters
    group_order = benchmarked_model_key + param_cols
    sort_order = group_order + ["caid_model_epoch"]
    sort_bools = [True]*(len(group_order)) + [False]*1
    train_losses = train_losses.sort_values(
        by=sort_order,
        ascending=sort_bools,
    ).groupby(group_order).head(1).reset_index(drop=True)

    # Combine test and train results
    merge_cols = benchmarked_model_key + param_cols + ['path_to_model']
    combined_results = pd.merge(
        test_metrics, train_losses,
        on=merge_cols,
        how='left'
    )

    # Combine with benchmark results
    bench_metrics = bench_metrics.rename(columns={'AUROC': 'Fusion AUROC',
                                                  'F1 Score': 'Fusion F1 Score',
                                                  'Accuracy': 'Fusion Accuracy',
                                                  'Precision': 'Fusion Precision',
                                                  'Recall': 'Fusion Recall'})
    combined_results = pd.merge(
        combined_results, bench_metrics,
        on=merge_cols,
        how='left'
    )

    # reorder columns
    combined_results = combined_results[[
        'Model Type', 'Model Name', 'Model Epoch',
        'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUROC',
        'Fusion Accuracy', 'Fusion Precision', 'Fusion Recall', 'Fusion F1 Score', 'Fusion AUROC',
        'caid_model_learning_rate', 'caid_model_num_epochs', 'caid_model_num_layers', 'caid_model_num_heads', 'caid_model_dropout', 'caid_model_epoch', 'caid_model_loss', 'path_to_model'
    ]]

    combined_results.to_csv(f"{output_dir}/best_caid_model_results.csv", index=False)


def get_fresh_model(training_hyperparams, device):
    input_dim, hidden_dim = 1280, 1280
    model = DisorderPredictor(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_layers=training_hyperparams["num_layers"],
        num_heads=training_hyperparams["num_heads"],
        dropout=training_hyperparams['dropout']
    )
    model.to(device)  # Push model to device (should be GPU)
    return model


def predict_from_best_thresh(prob_and_label_df, seq_label_dict=None):
    """
    Finds the disorder prediction threshold that maximizes F1 score, then uses it to make per-residue predictions.

    Args:
        prob_and_label_df: DataFrame with columns: sequence, prob_1
        seq_label_dict: dictionary of sequences to true labels. e.g. 'MKLP': '1100'

    Returns:
        prob_and_label_df: new version of original dataframe with added columns: threshold, pred_labels
    """
    # Use seq_label_dict to insert labels
    prob_and_label_df['labels'] = prob_and_label_df['sequence'].map(seq_label_dict)
    # EVERYTHING should have a label!!
    assert prob_and_label_df['labels'].notna().all()

    probs = ','.join(prob_and_label_df['prob_1'].tolist())
    probs = [float(x) for x in probs.split(",")]
    true_labels = ''.join(prob_and_label_df['labels'].tolist())
    true_labels = [int(x) for x in list(true_labels)]
    total_aas = sum(prob_and_label_df['sequence'].str.len())
    log_update(f"\tLength of dataframe (number of seqs in dataset): {len(prob_and_label_df)}")
    log_update(f"\tTotal AAs in dataset: {total_aas}\ttotal probabilities: {len(probs)}\ttotal labels: {len(true_labels)}")

    y_true = np.array(true_labels)   # True labels
    y_probs = np.array(probs)        # Predicted probabilities

    # Compute precision, recall, and thresholds
    precision, recall, thresholds = precision_recall_curve(y_true, y_probs)
    precision = precision[:-1]
    recall = recall[:-1]

    # Calculate F1 scores for each threshold
    f1_scores = 2 * (precision * recall) / (precision + recall)

    # Find the threshold that maximizes the F1 score
    best_threshold_index = np.argmax(f1_scores)
    best_threshold = thresholds[best_threshold_index]

    # Compute AUPRC
    auprc = average_precision_score(y_true, y_probs)

    log_update(f"\tBest Threshold: {best_threshold}")
    log_update(f"\tBest F1 Score: {f1_scores[best_threshold_index]:.2f}")
    log_update(f"\tAUPRC: {auprc:.2f}")

    # Edit the original DataFrame
    # Add threshold
    prob_and_label_df['threshold'] = [best_threshold]*len(prob_and_label_df)
    # Make predictions using this new threshold
    prob_and_label_df['pred_labels'] = prob_and_label_df['prob_1'].apply(lambda x: ['1' if float(y) > best_threshold else '0' for y in x.split(",")])
    prob_and_label_df['pred_labels'] = prob_and_label_df['pred_labels'].apply(lambda x: ''.join(x))
    log_update("\tUsed calculated threshold to construct predicted labels for dataset")

    return prob_and_label_df
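
# Rough illustration of the threshold selection above (toy values, not real model output).
# sklearn's precision_recall_curve returns one more precision/recall entry than thresholds
# (the final (1., 0.) point has no corresponding threshold), hence the [:-1] trimming before
# computing F1 at each candidate threshold:
#     y_true  = np.array([0, 0, 1, 1])
#     y_probs = np.array([0.1, 0.4, 0.35, 0.8])
#     p, r, t = precision_recall_curve(y_true, y_probs)   # len(p) == len(r) == len(t) + 1
#     f1 = 2 * (p[:-1] * r[:-1]) / (p[:-1] + r[:-1])
#     best_threshold = t[np.argmax(f1)]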
def train_and_evaluate_caid_predictor(embedding_path, details, output_dir, training_hyperparams, overwrite_saved_model=True):
    # unpack the details dictionary
    benchmark_model_type = details['model_type']
    benchmark_model_name = details['model']
    benchmark_model_epoch = details['epoch']

    # define model save directories and make if they don't exist
    model_outer_folder = f"trained_models/{benchmark_model_type}"
    if not(np.isnan(benchmark_model_epoch)):
        model_outer_folder += f"/{benchmark_model_name}/epoch{benchmark_model_epoch}"
    model_full_folder = f"{model_outer_folder}/lr{training_hyperparams['learning_rate']}_bs{1}_hd{1280}_epochs{training_hyperparams['num_epochs']}_layers{training_hyperparams['num_layers']}_heads{training_hyperparams['num_heads']}_drpt{training_hyperparams['dropout']}"
    l_model_full_folder = model_full_folder.split("/")
    for i in range(0, len(l_model_full_folder)):
        newdir = "/".join(l_model_full_folder[:i+1])
        os.makedirs(newdir, exist_ok=True)

    # see if we've trained the model before
    model_full_path = f"{model_full_folder}/model.pth"
    train_new_model = True  # initially, we believe we're training a new model. Let's make sure we want to.
    if os.path.exists(model_full_path):
        # If the model exists and we ARE allowed to overwrite, still train
        if overwrite_saved_model:
            log_update(f"\nOverwriting previously trained model with same hyperparams at {model_full_path}")
        # If the model exists and we are NOT allowed to overwrite, don't train
        else:
            log_update(f"\nWARNING: this model may already be trained at {model_full_path}. Skipping")
            train_new_model = False

    # If training new model, get new model stats.
    if train_new_model:
        max_length = 4500 + 2

        # make Dataloaders
        train_dataloader = get_dataloader('splits/train_df.csv', embedding_path, max_length=max_length, batch_size=1, shuffle=True)
        test_dataloader = get_dataloader('splits/test_df.csv', embedding_path, max_length=max_length, batch_size=1, shuffle=False)
        benchmark_dataloader = get_dataloader('splits/fusion_bench_df.csv', embedding_path, max_length=max_length, batch_size=1, shuffle=False)

        # Set device to GPU
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Initialize the model and set it to device
        model = get_fresh_model(training_hyperparams, device)

        # Initialize optimizer
        optimizer = optim.Adam(model.parameters(), lr=training_hyperparams["learning_rate"])
        criterion = nn.BCELoss()
        num_epochs = training_hyperparams['num_epochs']

        ################# Train
        # Train loop
        avg_train_losses = train(model, train_dataloader, optimizer, num_epochs, criterion, device)

        # Save the train curve results
        formatted_hyperparams = {f"caid_model_{k}": v for k, v in training_hyperparams.items()}
        train_loss_df = pd.DataFrame.from_dict(formatted_hyperparams, orient='index').T
        train_loss_df['caid_model_epoch'] = [list(range(1, 1+num_epochs))]
        train_loss_df['caid_model_loss'] = [avg_train_losses]
        train_loss_df[['Model Type', 'Model Name', 'Model Epoch']] = [[benchmark_model_type, benchmark_model_name, benchmark_model_epoch]]
        train_loss_df = train_loss_df.explode(['caid_model_epoch', 'caid_model_loss'])

        # Save loss results - both to the model folder (including hyperparams), AND to the current results folder
        train_loss_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_train_losses.csv'
        train_loss_individual_results_csv_path = f'{model_full_folder}/caid_train_losses.csv'
        train_loss_df.to_csv(train_loss_individual_results_csv_path, mode='w', index=False)
        train_loss_df['path_to_model'] = model_full_path
        if not(os.path.exists(train_loss_combined_results_csv_path)):
            train_loss_df.to_csv(train_loss_combined_results_csv_path, index=False)
        else:
            train_loss_df.to_csv(train_loss_combined_results_csv_path, mode='a', index=False, header=False)

        log_update(f"Final train loss: {avg_train_losses[-1]:.4f}")

        ################# Test
        # Evaluate model on test sequences
        test_sequences, test_preds, test_labels = evaluate(model, test_dataloader, device)
        test_metrics = calculate_metrics(test_preds, test_labels)

        # Make dataframe of test metric results
        test_results_df = pd.DataFrame.from_dict(test_metrics, orient='index').T
        test_results_df[['Model Type', 'Model Name', 'Model Epoch']] = [[benchmark_model_type, benchmark_model_name, benchmark_model_epoch]]
        # add the hyperparameters to this
        hyperparams_df = pd.DataFrame.from_dict(formatted_hyperparams, orient='index').T
        test_results_df = pd.concat([test_results_df, hyperparams_df], axis=1)

        # Make dataframe of test probabilities (for AUROC curve)
        prob_and_label_df = pd.DataFrame(data={
            'sequence': test_sequences,
            'prob_1': [arr.flatten() for arr in test_preds]
        })
        prob_and_label_df['prob_1'] = prob_and_label_df['prob_1'].apply(
            lambda prob_list: ",".join([f"{round(x, 3):.3f}" for x in prob_list])
        )
        prob_and_label_df['Model Type'] = [benchmark_model_type]*len(prob_and_label_df)
        prob_and_label_df['Model Name'] = [benchmark_model_name]*len(prob_and_label_df)
        prob_and_label_df['Model Epoch'] = [benchmark_model_epoch]*len(prob_and_label_df)

        # Save test results - both to the model folder (including hyperparams), AND to the current results folder
        test_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_test_metrics.csv'
        test_results_csv_path = f'{model_full_folder}/caid_hyperparam_screen_test_metrics.csv'
        test_results_df.to_csv(test_results_csv_path, mode='w', index=False)
        test_results_df['path_to_model'] = model_full_path
        if not(os.path.exists(test_combined_results_csv_path)):
            test_results_df.to_csv(test_combined_results_csv_path, index=False)
        else:
            test_results_df.to_csv(test_combined_results_csv_path, mode='a', index=False, header=False)

        # Save test probs - only to model folder
        test_probs_csv_path = f'{model_full_folder}/caid_hyperparam_screen_test_probs.csv'
        seq_label_dict = pd.read_csv('splits/test_df.csv')
        seq_label_dict = dict(zip(seq_label_dict['Sequence'], seq_label_dict['Label']))
        log_update("Finding best threshold for CAID test set predictions based on maximizing F1 Score...")
        prob_and_label_df = predict_from_best_thresh(prob_and_label_df, seq_label_dict=seq_label_dict)
        prob_and_label_df[['sequence', 'prob_1', 'threshold', 'pred_labels']].to_csv(test_probs_csv_path, mode='w', index=False)

        log_update(f"Test performance: {test_metrics}")

        ################# Benchmark
        # Evaluate model on benchmark sequences
        benchmark_sequences, benchmark_preds, benchmark_labels = benchmark(model, benchmark_dataloader, device)
        benchmark_metrics = calculate_metrics(benchmark_preds, benchmark_labels)

        # Make dataframe of benchmark metric results
        benchmark_results_df = pd.DataFrame.from_dict(benchmark_metrics, orient='index').T
        benchmark_results_df[['Model Type', 'Model Name', 'Model Epoch']] = [[benchmark_model_type, benchmark_model_name, benchmark_model_epoch]]
        # add the hyperparameters to this
        hyperparams_df = pd.DataFrame.from_dict(formatted_hyperparams, orient='index').T
        benchmark_results_df = pd.concat([benchmark_results_df, hyperparams_df], axis=1)

        # Make dataframe of benchmark probabilities (for AUROC curve)
        prob_and_label_df = pd.DataFrame(data={
            'sequence': benchmark_sequences,
            'prob_1': [arr.flatten() for arr in benchmark_preds]
        })
        prob_and_label_df['prob_1'] = prob_and_label_df['prob_1'].apply(
            lambda prob_list: ",".join([f"{round(x, 3):.3f}" for x in prob_list])
        )
        prob_and_label_df['Model Type'] = [benchmark_model_type]*len(prob_and_label_df)
        prob_and_label_df['Model Name'] = [benchmark_model_name]*len(prob_and_label_df)
        prob_and_label_df['Model Epoch'] = [benchmark_model_epoch]*len(prob_and_label_df)
        # Save benchmark results - both to the model folder (including hyperparams), AND to the current results folder
        benchmark_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_fusion_benchmark_metrics.csv'
        benchmark_results_csv_path = f'{model_full_folder}/caid_hyperparam_screen_fusion_benchmark_metrics.csv'
        benchmark_results_df.to_csv(benchmark_results_csv_path, mode='w', index=False)
        benchmark_results_df['path_to_model'] = model_full_path
        if not(os.path.exists(benchmark_combined_results_csv_path)):
            benchmark_results_df.to_csv(benchmark_combined_results_csv_path, index=False)
        else:
            benchmark_results_df.to_csv(benchmark_combined_results_csv_path, mode='a', index=False, header=False)

        # Save benchmark probs - only to model folder
        benchmark_probs_csv_path = f'{model_full_folder}/caid_hyperparam_screen_fusion_benchmark_probs.csv'
        seq_label_dict = pd.read_csv('splits/fusion_bench_df.csv')
        seq_label_dict = dict(zip(seq_label_dict['Sequence'], seq_label_dict['Label']))
        log_update("Finding best threshold for fusion benchmark set predictions based on maximizing F1 Score...")
        prob_and_label_df = predict_from_best_thresh(prob_and_label_df, seq_label_dict=seq_label_dict)
        prob_and_label_df[['sequence', 'prob_1', 'threshold', 'pred_labels']].to_csv(benchmark_probs_csv_path, mode='w', index=False)

        log_update(f"Benchmark performance: {benchmark_metrics}")

        ################# Save model
        # Save model and metrics for this hyperparameter combination in the trained models folder
        torch.save(model.state_dict(), model_full_path)

    # if we didn't train again, still add those results to this benchmarking run so that they all get compared together
    else:
        # Load the appropriate train losses
        train_loss_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_train_losses.csv'
        train_loss_individual_results_csv_path = f'{model_full_folder}/caid_train_losses.csv'
        train_loss_individual_results = pd.read_csv(train_loss_individual_results_csv_path)
        train_loss_individual_results['path_to_model'] = [model_full_path]*len(train_loss_individual_results)
        # Add these results to the combined results file for this run if it exists; otherwise create new combined results file
        if not(os.path.exists(train_loss_combined_results_csv_path)):
            train_loss_individual_results.to_csv(train_loss_combined_results_csv_path, index=False)
        else:
            train_loss_individual_results.to_csv(train_loss_combined_results_csv_path, mode='a', index=False, header=False)

        # Load the appropriate test stats
        test_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_test_metrics.csv'
        test_results_csv_path = f'{model_full_folder}/caid_hyperparam_screen_test_metrics.csv'
        test_individual_results = pd.read_csv(test_results_csv_path)
        test_individual_results['path_to_model'] = [model_full_path]*len(test_individual_results)
        # Add these results to the combined results file for this run if it exists; otherwise create new combined results file
        if not(os.path.exists(test_combined_results_csv_path)):
            test_individual_results.to_csv(test_combined_results_csv_path, index=False)
        else:
            test_individual_results.to_csv(test_combined_results_csv_path, mode='a', index=False, header=False)

        # Load the appropriate benchmark stats
        benchmark_combined_results_csv_path = f'{output_dir}/caid_hyperparam_screen_fusion_benchmark_metrics.csv'
        benchmark_results_csv_path = f'{model_full_folder}/caid_hyperparam_screen_fusion_benchmark_metrics.csv'
        benchmark_individual_results = pd.read_csv(benchmark_results_csv_path)
        benchmark_individual_results['path_to_model'] = [model_full_path]*len(benchmark_individual_results)
        # Add these results to the combined results file for this run if it exists; otherwise create new combined results file
        if not(os.path.exists(benchmark_combined_results_csv_path)):
            benchmark_individual_results.to_csv(benchmark_combined_results_csv_path, index=False)
        else:
            benchmark_individual_results.to_csv(benchmark_combined_results_csv_path, mode='a', index=False, header=False)


# Metrics calculation
def calculate_metrics(preds, labels, threshold=0.5):
    """
    Calculates metrics to assess model performance

    Args:
        preds (list): model's predictions (probabilities)
        labels (list): ground truth labels
        threshold (float): minimum probability a prediction must reach to be considered disordered

    Returns:
        metrics_dict (dict): dictionary with Accuracy, Precision, Recall, F1 Score, and AUROC
    """
    flat_binary_preds, flat_prob_preds, flat_labels = [], [], []

    for pred, label in zip(preds, labels):
        flat_binary_preds.extend((pred > threshold).astype(int).flatten())  # binary preds are 1 or 0; 1 if the prob > threshold
        flat_prob_preds.extend(pred.flatten())
        flat_labels.extend(label.flatten())

    flat_binary_preds = np.array(flat_binary_preds)
    flat_prob_preds = np.array(flat_prob_preds)
    flat_labels = np.array(flat_labels)

    accuracy = accuracy_score(flat_labels, flat_binary_preds)
    precision = precision_score(flat_labels, flat_binary_preds)
    recall = recall_score(flat_labels, flat_binary_preds)
    f1 = f1_score(flat_labels, flat_binary_preds)
    roc_auc = roc_auc_score(flat_labels, flat_prob_preds)

    # make dictionary of the results and return it
    metrics_dict = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUROC': roc_auc
    }
    return metrics_dict
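
# Rough illustration of how calculate_metrics pools per-residue predictions across sequences
# before scoring (toy arrays, not real model output):
#     toy_preds  = [np.array([[0.9, 0.2, 0.7]]), np.array([[0.1, 0.8]])]
#     toy_labels = [np.array([[1, 0, 1]]), np.array([[0, 1]])]
#     calculate_metrics(toy_preds, toy_labels, threshold=0.5)
#     # all 5 residues are scored as one flat set; here every residue is classified correctly,
#     # so each metric comes out to 1.0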
def main():
    # make output directory for this run
    os.makedirs('results', exist_ok=True)
    output_dir = f'results/{get_local_time()}'
    os.makedirs(output_dir, exist_ok=True)

    with open_logfile(f'{output_dir}/caid_benchmark_log.txt'):
        # print configurations
        print_configpy(config)

        # Verify that the environment variables are set correctly
        check_env_variables()

        # make embeddings if needed
        all_embedding_paths = embed_dataset_for_benchmark(
            fuson_ckpts=config.FUSONPLM_CKPTS,
            input_data_path='splits/splits.csv',
            input_fname='CAID2_competition_sequences',
            average=False,
            seq_col='Sequence',
            benchmark_fusonplm=config.BENCHMARK_FUSONPLM,
            benchmark_esm=config.BENCHMARK_ESM,
            benchmark_fo_puncta_ml=False,
            overwrite=config.PERMISSION_TO_OVERWRITE_EMBEDDINGS)

        # load the splits with labels
        splits_df = pd.read_csv('splits/splits.csv')
        log_update(f"\nSplit breakdown...\n\t{len(splits_df.loc[splits_df['Split']=='Train'])} train seqs\n\t{len(splits_df.loc[splits_df['Split']=='Test'])} test seqs")

        log_update("\nTraining and evaluating models")

        # Set hyperparameters for disorder predictor
        param_grid = {
            'learning_rate': [5e-5],
            'num_heads': [5, 8, 10],
            'num_layers': [2, 4, 6],
            'dropout': [0.2, 0.5],
            'num_epochs': [2]
        }

        # loop through the embedding paths and train each one
        for embedding_path, details in all_embedding_paths.items():
            log_update(f"\nBenchmarking embeddings at: {embedding_path}")
            grid_search_caid_predictor(embedding_path, details, output_dir, param_grid, overwrite_saved_model=config.PERMISSION_TO_OVERWRITE_MODELS)

        # find the best grid search performer
        find_best_hyperparams(output_dir, param_grid)

        # make plots
        #### caid test set
        best_caid_model_results = pd.read_csv(f"{output_dir}/best_caid_model_results.csv")
        #### fusion benchmark set
        best_caid_model_results_benchmark = best_caid_model_results.drop(
            columns=['AUROC', 'F1 Score', 'Accuracy', 'Precision', 'Recall']
        ).rename(columns={
            'Fusion AUROC': 'AUROC',
            'Fusion F1 Score': 'F1 Score',
            'Fusion Accuracy': 'Accuracy',
            'Fusion Precision': 'Precision',
            'Fusion Recall': 'Recall'
        })


if __name__ == "__main__":
    main()
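
# For reference: this script assumes fuson_plm.benchmarking.caid.config defines at least the
# attributes read above. A minimal sketch of such a config module, with placeholder values
# (the real project config may differ):
#
#     CUDA_VISIBLE_DEVICES = "0"                   # GPU(s) to expose before torch is imported
#     FUSONPLM_CKPTS = {}                          # checkpoint(s) to embed with (structure consumed by embed_dataset_for_benchmark)
#     BENCHMARK_FUSONPLM = True
#     BENCHMARK_ESM = True
#     PERMISSION_TO_OVERWRITE_EMBEDDINGS = False
#     PERMISSION_TO_OVERWRITE_MODELS = False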