import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches
from matplotlib import font_manager
import matplotlib.patches as patches
from sklearn.metrics import roc_curve, auc, r2_score

from fuson_plm.utils.visualizing import set_font

# Reference results for CAID-2 competition methods, kept in one table so the
# AUROC values and the rankings cannot drift apart.
# Each record is (model name, reported AUROC, competition rank): the ten
# top-ranked methods followed by four additional methods ranked outside the top 10.
_CAID2_REFERENCE = [
    ('Dispredict3', 0.838, 1),
    ('flDPnn2', 0.836, 2),
    ('flDPnn', 0.833, 3),
    ('flDPlr', 0.827, 4),
    ('flDPlr2', 0.821, 5),
    ('DisoPred', 0.821, 6),
    ('IDP-Fusion', 0.818, 7),
    ('ESpritz-D', 0.802, 8),
    ('DeepIDP-2L', 0.800, 9),
    ('disomine', 0.797, 10),
    ('DISOPRED3-diso', 0.692, 35),
    ('IUPred3', 0.755, 21),
    ('AlphaFold-rsa', 0.747, 24),
    ('AlphaFold-pLDDT', 0.695, 34),
]

# Table with the same columns as the model result frames it gets concatenated with.
# Competition entries have no training epoch, so 'Model Epoch' is NaN for all of them.
caid2_winners = pd.DataFrame(data={
    'Model Name': [record[0] for record in _CAID2_REFERENCE],
    'AUROC': [record[1] for record in _CAID2_REFERENCE],
    'Model Type': ['caid2_competition' for _ in _CAID2_REFERENCE],
    'Model Epoch': [np.nan for _ in _CAID2_REFERENCE],
})

# Model name -> competition rank, used to prefix legend entries.
caid2_model_rankings = {record[0]: record[2] for record in _CAID2_REFERENCE}

# Build the full (unique) model identifier for a results row
def lengthen_model_name(row):
    """
    Return the full model name for one row of a results table.

    The row must provide 'Model Type', 'Model Name' and 'Model Epoch'.
    Names containing 'esm' or 'puncta', and rows whose type is
    'caid2_competition', are returned unchanged; every other name gets
    an '_e<epoch>' suffix.
    """
    kind, base_name, epoch_tag = row['Model Type'], row['Model Name'], row['Model Epoch']

    keeps_bare_name = (
        'esm' in base_name
        or 'puncta' in base_name
        or kind == 'caid2_competition'
    )
    return base_name if keeps_bare_name else f'{base_name}_e{epoch_tag}'

# Method for shortening the model name for display
def shorten_model_name(row):
    """
    Return a compact display name for one row of a results table.

    The row must provide 'Model Type', 'Model Name' and 'Model Epoch'.

    * Names containing 'esm' are displayed as 'ESM-2-650M'.
    * Rows of type 'caid2_competition' keep their name.
    * Any other name is parsed as
      '<snp|uniform>_..._<N>layers_<tag>_...mask<rate>-<date>' and rendered as
      '<snp|uni>_<N>L_<tag>_mask<rate>_<date>_e<epoch>'.

    Raises:
        ValueError: if the name is not of the expected form (no 'snp_'/'uniform_'
            marker, or missing 'layers_' / 'mask<rate>-<date>' parts).
    """
    model_type = row['Model Type']
    name = row['Model Name']
    epoch = row['Model Epoch']

    if 'esm' in name:
        return 'ESM-2-650M'
    if model_type == 'caid2_competition':
        return name

    if 'snp_' in name:
        prob_type = 'snp'
    elif 'uniform_' in name:
        prob_type = 'uni'
    else:
        # Previously this fell through and failed later with an unbound local variable.
        raise ValueError(
            f"Cannot shorten model name {name!r}: expected it to contain 'snp_' or 'uniform_'"
        )

    try:
        layers = name.split('layers')[0].split('_')[-1]
        kqv_tag = name.split('layers_')[1].split('_')[0]
        # Text after 'mask' is '<rate>-<date>'; split only on the first dash
        # because the date itself may contain dashes.
        maskrate, dt = name.split('mask')[1].split('-', 1)
    except (IndexError, ValueError) as err:
        raise ValueError(
            f"Cannot shorten model name {name!r}: expected "
            "'<...>_<N>layers_<tag>_...mask<rate>-<date>'"
        ) from err

    return f'{prob_type}_{layers}L_{kqv_tag}_mask{maskrate}_{dt}_e{epoch}'

def make_heatmap(df, results_dir='.', gold_standard_model_name="esm2_t33_650M_UR50D",split="test",thresh=None,ax=None):
    """
    Plot a single-column AUROC heatmap of every model's difference from a gold standard model.

    Each row is one model. The cell color encodes AUROC minus the gold standard's AUROC,
    while the printed number is the raw AUROC (bold when it beats the gold standard).
    Unless ``split`` is "benchmark", the module-level ``caid2_winners`` table is appended
    so the literature-reported CAID-2 values appear next to the models in ``df``.
    The figure is saved as
    ``{results_dir}/{split file prefix}_heatmap_vs_{gold_standard_model_name}.png``.

    Args:
        df: DataFrame with 'Model Type', 'Model Name', 'Model Epoch' and 'AUROC' columns.
            The caller's frame is not modified.
        results_dir: directory the PNG is written to.
        gold_standard_model_name: full model name (as produced by ``lengthen_model_name``)
            of the reference row.
        split: "testing" (or "test"), "training" (or "train"), or "benchmark"; selects
            the title and the output file prefix.
        thresh: unused; kept so existing callers keep working.
        ax: optional matplotlib Axes to draw on. A new figure is created when omitted.

    Raises:
        ValueError: if ``split`` is not recognized, or if no row matches
            ``gold_standard_model_name``.
    """
    # Resolve the split first so a bad value fails before any plotting work.
    # The default "test" used to be missing from the lookup tables below.
    split_aliases = {"test": "testing", "train": "training"}
    split_key = split_aliases.get(split, split)
    split_fname_dict = {
        "testing": "CAID2_test",
        "training": "CAID2_train",
        "benchmark": "FusionPDB_pLDDT_disorder"
    }
    split_title_dict = {
        "testing": "CAID-2 Disorder Prediction",
        "training": "CAID-2 Disorder Prediction",
        "benchmark": "FusionPDB_pLDDT Disorder Prediction"
    }
    if split_key not in split_fname_dict:
        raise ValueError(
            f"Unknown split {split!r}; expected one of {sorted(split_fname_dict)} (or 'test'/'train')"
        )

    # Set font to Ubuntu
    set_font()

    # Declare columns to compare: metrics
    columns_to_compare = ['AUROC']

    # Add the literature-reported values for CAID competition winners - only IF the split is not "benchmark".
    # Either branch yields a new frame, so the caller's df is never mutated below.
    if split_key != "benchmark":
        df = pd.concat([df, caid2_winners], ignore_index=True)
    else:
        df = df.copy()

    # Create Short Model Name and Full Model Name columns for later use
    df['Model Epoch'] = df['Model Epoch'].apply(lambda x: '' if pd.isna(x) else str(int(x)))
    df['Short Model Name'] = df.apply(shorten_model_name, axis=1)
    df['Full Model Name'] = df.apply(lengthen_model_name, axis=1)

    # Isolate gold standard row for later comparison
    gold_rows = df[df['Full Model Name'] == gold_standard_model_name]
    if gold_rows.empty:
        raise ValueError(f"Gold standard model {gold_standard_model_name!r} not found in df")
    gold_standard = gold_rows.reset_index(drop=True).iloc[0]
    gold_standard_short_model_name = gold_rows['Short Model Name'].item()

    # Create a new dataframe for the heatmap; sort by model type and place gold standard on top
    heatmap_data = df[['Model Type', 'Short Model Name', 'Full Model Name'] + columns_to_compare].copy()
    heatmap_data['is_gold_standard'] = (heatmap_data['Full Model Name'] == gold_standard_model_name).astype(int)
    heatmap_data = (
        heatmap_data
        .sort_values(by=['is_gold_standard', 'Model Type', 'AUROC'], ascending=[False, True, False])
        .reset_index(drop=True)
        .drop(columns=['is_gold_standard'])
    )
    # Save the original values before calculating differences so we can use them for annotation
    original_values = heatmap_data[columns_to_compare].copy()

    # Calculate differences from the gold standard
    for col in columns_to_compare:
        heatmap_data[col] = heatmap_data[col] - gold_standard[col]

    # Diverging color map, centered on 0 in the heatmap call below
    cmap = sns.color_palette("coolwarm", as_cmap=True)  # other option is diverging_palette(220, 20, as_cmap=True)

    ### Make the plot
    # can plot on a bigger plot, or make it an individual plot
    if ax is None:
        tallsize = max(8, 8 + .25 * (len(heatmap_data) - 26))
        fig, ax = plt.subplots(1, 1, figsize=(8, tallsize), dpi=300)
    else:
        fig = ax.figure

    # Draw on the requested axes. yticklabels=True forces one tick per row so the
    # per-row label styling below always lines up with the data.
    sns.heatmap(heatmap_data.set_index('Short Model Name').drop(columns=['Model Type', 'Full Model Name']),
                annot=False, fmt='', cmap=cmap, center=0, ax=ax, yticklabels=True,
                cbar_kws={'label': 'Difference from Gold Standard'})
    mesh = ax.collections[-1]  # the heatmap mesh that was just added; owns the colorbar

    # Explicitly set tick labels to prevent them from being messed up
    ax.set_yticklabels(heatmap_data['Short Model Name'], rotation=0, fontsize=12)
    # Add padding to the y-axis label
    ax.set_ylabel("Short Model Name", labelpad=20)

    # Annotate each cell with the raw value; bold any value that exceeds the gold standard
    for i in range(original_values.shape[0]):
        for j, col in enumerate(columns_to_compare):
            value = original_values.iloc[i, j]
            text_kwargs = {'fontweight': 'bold'} if value > gold_standard[col] else {}
            ax.text(j + 0.5, i + 0.5, f'{value:.3f}', ha='center', va='center', color='black', **text_kwargs)

    # Add a thick white horizontal line wherever the model type changes
    model_type_series = heatmap_data['Model Type'].values
    for i in range(1, len(model_type_series)):
        if model_type_series[i] != model_type_series[i - 1]:
            ax.axhline(i, color='white', linewidth=8)

    # Highlight tick labels: gold box for the gold standard, red box for models that beat it on AUROC
    tick_labels = ax.get_yticklabels()
    for ytick, model_name in enumerate(heatmap_data['Short Model Name']):
        if model_name == gold_standard_short_model_name:
            tick_labels[ytick].set_bbox(dict(facecolor='gold', alpha=0.5, edgecolor='gold'))
        elif original_values.loc[ytick, 'AUROC'] > gold_standard['AUROC']:
            tick_labels[ytick].set_bbox(dict(facecolor='red', alpha=0.3, edgecolor='red'))

    # Make legend
    gold_patch = mpatches.Patch(color='gold', alpha=0.5, label='Gold Standard')
    red_patch = mpatches.Patch(color='red', alpha=0.5, label='Winner')
    ax.legend(handles=[gold_patch, red_patch], loc='best', bbox_to_anchor=(0, 0))

    ax.set_title(split_title_dict[split_key])

    # Rotate the color bar label
    cbar = mesh.colorbar
    cbar.ax.yaxis.set_label_position('right')
    cbar.ax.yaxis.set_ticks_position('right')
    cbar.set_label('Difference from Gold Standard', rotation=270, labelpad=20)

    # Leave extra padding on the right side to fit the rotated colorbar label
    fig.tight_layout(rect=[0, 0, 0.95, 1])

    fig.savefig(f"{results_dir}/{split_fname_dict[split_key]}_heatmap_vs_{gold_standard_model_name}.png")

# ROC curve for a single model of interest on the fusion (FusionPDB pLDDT) benchmark
def make_benchmark_auroc_curve(results_dir='.', seq_label_dict=None, path_to_results_of_interest='', model_alias=None):
    """
    Plot and save the ROC curve of one model's per-residue predictions.

    Args:
        results_dir: directory the PNG is written to.
        seq_label_dict: mapping from sequence to its label string (one digit per residue).
        path_to_results_of_interest: CSV of predictions with 'sequence' and 'prob_1'
            columns; each 'prob_1' entry is a comma-separated string of probabilities.
            The path must contain 'trained_models/', and the components after it are
            used to derive the model type, name and epoch.
        model_alias: legend/file name for the model; derived with
            ``shorten_model_name`` when None.

    The figure is saved to
    ``{results_dir}/FusionPDB_pLDDT_disorder_{model_alias}_AUROC_curve.png``.
    """
    # Path components after 'trained_models/' identify the model
    path_parts = path_to_results_of_interest.split('trained_models/')[1].split('/')
    model_type = path_parts[0]
    if len(path_parts) == 5:
        model_name = path_parts[1]
        model_epoch = path_parts[2].split('epoch')[1]
        _hyperparams = path_parts[3]  # parsed but not used further
    else:
        model_name = path_parts[0]
        model_epoch = np.nan
        _hyperparams = path_parts[1]  # parsed but not used further

    model_row = pd.DataFrame(data={
        'Model Type': [model_type], 'Model Name': [model_name], 'Model Epoch': [model_epoch]
    })
    if model_alias is None:
        model_alias = model_row.apply(shorten_model_name, axis=1).iloc[0]

    line_colors = {model_alias: 'black'}
    result_paths = {
        alias: csv_path
        for alias, csv_path in {model_alias: path_to_results_of_interest}.items()
        if csv_path not in [None, '']
    }

    set_font()
    plt.figure(figsize=(10,6),dpi=300)

    # One (method, fpr, tpr, auroc) tuple per results file
    curves = []
    for method, csv_path in result_paths.items():
        preds = pd.read_csv(csv_path)

        # Flatten per-sequence strings into one residue-level list each
        joined_probs = ",".join(preds['prob_1'].tolist())
        preds['labels'] = preds['sequence'].apply(lambda seq: seq_label_dict[seq])
        joined_labels = "".join(preds['labels'].tolist())
        scores = [float(p) for p in joined_probs.split(",")]
        truth = [int(ch) for ch in list(joined_labels)]
        residues = "".join(preds['sequence'].tolist())
        assert len(scores)==len(truth)==len(residues)

        fpr, tpr, _thresholds = roc_curve(truth, scores)
        curves.append((method, fpr, tpr, auc(fpr, tpr)))

    # Highest AUROC first
    curves.sort(key=lambda entry: entry[3], reverse=True)

    # The model of interest is drawn with a thicker, fully opaque line
    for method, fpr, tpr, roc_auc in curves:
        emphasis = {'lw': 2} if method == model_alias else {'lw': 1, 'alpha': 0.7}
        plt.plot(fpr, tpr, color=line_colors[method], label=f'{method} ({roc_auc:0.3f})', **emphasis)

    # Axes limits, chance diagonal, and labels
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.plot([0, 1], [0, 1], color='darkgrey', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')

    # Legend sits outside the axes on the right
    handles, legend_labels = plt.gca().get_legend_handles_labels()
    legend = plt.legend(handles, legend_labels, loc="center left", bbox_to_anchor=(1, 0.5))

    # Bold every legend entry that mentions the model alias
    for entry in legend.get_texts():
        if model_alias in entry.get_text():
            entry.set_fontweight('bold')

    plt.tight_layout()
    plt.savefig(f'{results_dir}/FusionPDB_pLDDT_disorder_{model_alias}_AUROC_curve.png')
    
# Plot AUROC curve of ONE model of interest with all the CAID models
def make_auroc_curve(results_dir='.', seq_label_dict=None, seq_ids_dict=None, path_to_results_of_interest='', model_alias=None, path_to_esm_results=None, with_rankings=False):
    """
    Plot the ROC curve of one model of interest alongside the CAID-2 competition methods.

    Side effects (all written under ``results_dir``):
      * ``CAID_prediction_source_data.csv``: per-sequence predictions of every method,
        merged on sequence, plus labels and ids (the sequence column itself is dropped).
      * ``CAID_fpr_tpr_source_data.csv``: the fpr/tpr points of every method.
      * ``CAID2_{model_alias}_AUROC_curve.png`` or, when ``path_to_esm_results`` is given,
        ``CAID2_{model_alias}_with_ESM_AUROC_curve.png``.

    Args:
        results_dir: output directory.
        seq_label_dict: mapping from sequence to its label string (one digit per residue).
        seq_ids_dict: mapping from sequence to the value stored in the 'ids' column.
        path_to_results_of_interest: prediction CSV of the model of interest. The path must
            contain 'trained_models/'; a ``caid_hyperparam_screen_test_metrics.csv`` in the
            same directory supplies the AUROC shown in the legend for this model.
        model_alias: legend/file name of the model of interest; derived with
            ``shorten_model_name`` when None.
        path_to_esm_results: optional prediction CSV plotted as 'ESM-2-650M' (dashed line).
        with_rankings: if True, prefix legend entries with the rank from
            ``caid2_model_rankings`` when the method has one.

    Each prediction CSV must have a 'sequence' column and a 'prob_1' column whose entries
    are comma-separated strings of per-residue probabilities. The competition CSVs are read
    from hard-coded paths relative to the current working directory.
    """
    # Isolate the information for the model we'll be plotting
    # (the path components after 'trained_models/' encode type / name / epoch / hyperparams)
    benchmark_model = path_to_results_of_interest.split('trained_models/')[1].split('/')
    benchmark_model_type = benchmark_model[0]
    benchmark_model_epoch = np.nan
    benchmark_model_hyperparams = None
    if len(benchmark_model)==5: 
        benchmark_model_name = benchmark_model[1]
        benchmark_model_epoch = benchmark_model[2].split('epoch')[1]
        benchmark_model_hyperparams = benchmark_model[3]
    else:
        benchmark_model_name = benchmark_model[0]
        benchmark_model_hyperparams = benchmark_model[1]
    benchmark_model_info = pd.DataFrame(data={
        'Model Type': [benchmark_model_type], 'Model Name': [benchmark_model_name], 'Model Epoch': [benchmark_model_epoch]
    })
    if model_alias is None:
        model_alias = benchmark_model_info.apply(lambda row: shorten_model_name(row),axis=1).iloc[0]
        
    # Line color per method; trailing numbers are the CAID-2 ranks of the top 10.
    # NOTE(review): several methods share a color (e.g. 'flDPnn' and 'DisoPred') - confirm this is intended.
    color_map = {
        'Dispredict3': '#d62727',           #1
        'flDPnn2': '#ff7f0f',               #2
        'flDPnn': '#1f77b4',                #3
        'flDPlr': '#bcbd21',                #4
        'flDPlr2': '#16becf',               #5
        'DisoPred': '#1f77b4',              #6
        'IDP-Fusion': '#d62727',            #7
        'ESpritz-D': '#8b564c',             #8
        'DeepIDP-2L': '#e377c2',            #9
        'disomine': '#e377c2',                #10
        'DISOPRED3-diso': '#ff892d',             
        'IUPred3': '#8b564c',
        'AlphaFold-rsa': '#2ba02b',
        'AlphaFold-pLDDT': '#ff892d',
        model_alias: 'black'
    }
    # Prediction file per method (paths are relative to the current working directory)
    method_results = {'Dispredict3': 'processed_data/caid2_competition_results/Dispredict3_CAID-2_Disorder_NOX.csv',
                    'flDPnn2': 'processed_data/caid2_competition_results/flDPnn2_CAID-2_Disorder_NOX.csv',
                    'flDPnn': 'processed_data/caid2_competition_results/flDPnn_CAID-2_Disorder_NOX.csv',
                    'flDPlr': 'processed_data/caid2_competition_results/flDPtr_CAID-2_Disorder_NOX.csv',   # name doesn't match but this is what it is in raw download
                    'flDPlr2': 'processed_data/caid2_competition_results/flDPlr2_CAID-2_Disorder_NOX.csv',
                    'DisoPred': 'processed_data/caid2_competition_results/DisoPred_CAID-2_Disorder_NOX.csv',
                    'IDP-Fusion': 'processed_data/caid2_competition_results/IDP-Fusion_CAID-2_Disorder_NOX.csv',        
                    'ESpritz-D': 'processed_data/caid2_competition_results/ESpritz-D_CAID-2_Disorder_NOX.csv',           
                    'DeepIDP-2L': 'processed_data/caid2_competition_results/DeepIDP-2L_CAID-2_Disorder_NOX.csv',          
                    'disomine': 'processed_data/caid2_competition_results/disomine_CAID-2_Disorder_NOX.csv',              
                    'DISOPRED3-diso': 'processed_data/caid2_competition_results/DISOPRED3-diso_CAID-2_Disorder_NOX.csv',             
                    'AlphaFold-rsa': 'processed_data/caid2_competition_results/AlphaFold-rsa_CAID-2_Disorder_NOX.csv',
                    'AlphaFold-pLDDT': 'processed_data/caid2_competition_results/AlphaFold-disorder_CAID-2_Disorder_NOX.csv',        # name doesn't match but this is what it is in raw download
                    'IUPred3': 'processed_data/caid2_competition_results/IUPred3_CAID-2_Disorder_NOX.csv',
                    model_alias: path_to_results_of_interest
                }
    # Optionally add ESM as an extra comparison curve
    if path_to_esm_results is not None:
        method_results['ESM-2-650M'] = path_to_esm_results
        color_map['ESM-2-650M'] = 'black'
        
    # Drop any method without a results path
    method_results = {k:v for k,v in method_results.items() if v not in [None, '']}
    
    set_font()
    plt.figure(figsize=(12,6),dpi=300)
    
    # Accumulators: merged per-sequence predictions, stacked fpr/tpr points, and
    # (method, fpr, tpr, auroc) tuples used for sorting and plotting
    merged_preds = pd.DataFrame(data={'sequence':[]})
    merged_tpr_fpr = pd.DataFrame(data={'model': [],'fpr':[],'tpr':[]})
    roc_data = []
    # Read each result file and compute its ROC curve
    for method, path in method_results.items():
        df = pd.read_csv(path) # uses the 'sequence' and 'prob_1' columns
        # Outer merge so sequences missing from some methods are still kept
        merged_preds = pd.merge(merged_preds, 
                                df.rename(columns={'prob_1':f"{method}_prob_1"})[['sequence',f"{method}_prob_1",]],
                                on=['sequence'],how='outer')
        
        # Extract probabilities and labels, flattened to one entry per residue
        prob_1 = ",".join(df['prob_1'].tolist())
        df['labels'] = df['sequence'].apply(lambda x: seq_label_dict[x])
        labels = "".join(df['labels'].tolist())
        prob_1 = [float(x) for x in prob_1.split(",")]
        labels = [int(x) for x in list(labels)]
        sequences = "".join(df['sequence'].tolist())
        # Sanity check: one probability and one label per residue
        assert len(prob_1)==len(labels)==len(sequences)

        # Compute ROC curve and ROC area
        fpr, tpr, thresholds = roc_curve(labels, prob_1)
        new_tpr_fpr = pd.DataFrame(data={
            'model': [method]*len(fpr),
            'fpr': fpr, 'tpr': tpr
        })
        merged_tpr_fpr = pd.concat([merged_tpr_fpr,new_tpr_fpr])
        roc_auc = auc(fpr, tpr)
        
        # For the model of interest, report the AUROC stored in the metrics file
        # next to its predictions instead of the value computed above
        if method==model_alias:
            path_to_og_metrics = path_to_results_of_interest.rsplit('/',1)[0]+'/caid_hyperparam_screen_test_metrics.csv'
            og_metrics = pd.read_csv(path_to_og_metrics)
            roc_auc = og_metrics['AUROC'][0]
        
        # Store data for sorting later
        roc_data.append((method, fpr, tpr, roc_auc))
       
    # Save the merged dataframe as source data
    merged_preds['labels'] = merged_preds['sequence'].apply(lambda x: seq_label_dict[x])
    # Store labels comma-separated (one entry per residue)
    merged_preds['labels'] = merged_preds['labels'].apply(lambda x: ",".join([str(y) for y in x]))
    merged_preds['ids'] = merged_preds['sequence'].apply(lambda x: seq_ids_dict[x])
    merged_preds.drop(columns={'sequence'}).to_csv(f"{results_dir}/CAID_prediction_source_data.csv",index=False)
    merged_tpr_fpr.to_csv(f"{results_dir}/CAID_fpr_tpr_source_data.csv",index=False)
    # Sort the methods by AUROC values (highest first)
    roc_data = sorted(roc_data, key=lambda x: x[3], reverse=True)
    
    # figure out the labels
    # NOTE: `labels` is rebound here to a method -> legend text dict (it held residue labels above)
    labels = {method: method for method in method_results}
    if with_rankings:
        for method in labels:
            if method in caid2_model_rankings:
                labels[method] = f"{caid2_model_rankings[method]}. {method}"

    # Plot sorted ROC curves: ESM dashed, model of interest thick, everything else thin and translucent
    for method, fpr, tpr, roc_auc in roc_data:
        if method=='ESM-2-650M' and path_to_esm_results is not None:
            plt.plot(fpr, tpr, color=color_map[method], lw=2, linestyle='--', label=f'{labels[method]} ({roc_auc:0.3f})')
        elif method == model_alias:
            plt.plot(fpr, tpr, color=color_map[method], lw=2, label=f'{labels[method]} ({roc_auc:0.3f})')
        else:
            plt.plot(fpr, tpr, color=color_map[method], lw=1, alpha=0.7, label=f'{labels[method]} ({roc_auc:0.3f})')

    # Set other stylistic elements
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    # Chance-level diagonal
    plt.plot([0, 1], [0, 1], color='darkgrey', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate', fontsize=22)
    plt.ylabel('True Positive Rate', fontsize=22)
    plt.title('CAID2 Disorder NOX Dataset: ROC Curve', fontsize=22)
    
    # After plotting the ROC curves, customize the legend
    # NOTE: `labels` is rebound again, now to the list of legend label strings
    handles, labels = plt.gca().get_legend_handles_labels()

    # Create the legend first (placed outside the axes, to the right)
    legend = plt.legend(handles, labels, loc="center left", bbox_to_anchor=(1.1, 0.5), fontsize=16)

    # Iterate through the legend's text labels
    # (substring match: any entry whose text contains the alias is bolded)
    for text in legend.get_texts():
        if model_alias in text.get_text():
            text.set_fontweight('bold')  # Bold the alias model
        elif (path_to_esm_results is not None) and "ESM-2-650M" in text.get_text():
            text.set_fontweight('bold') # Bold ESM if we're comparing to it
        
    plt.tight_layout()
    figpath = f'{results_dir}/CAID2_{model_alias}_AUROC_curve.png'
    if path_to_esm_results is not None:
        figpath = f'{results_dir}/CAID2_{model_alias}_with_ESM_AUROC_curve.png'
    plt.savefig(figpath)

    
def plot_disorder_content_scatter(train_labels, test_labels, benchmark_labels, savepath='splits/disorder_content_scatter.png'):
    """
    Scatter sequence length against fraction of disordered residues for three datasets.

    Each argument is a collection of TRUE label strings, one per sequence, in
    ['11110000','0001110',...] format. Train, test, and fusion benchmark points are
    drawn on one axes in different colors and the figure is written to ``savepath``.
    """

    def length_and_fraction(label_vectors):
        # Per sequence: number of residues and share of residues labelled 1
        seq_lengths, disorder_fractions = [], []
        for label_string in label_vectors:
            residue_flags = [int(ch) for ch in label_string]
            seq_lengths.append(len(residue_flags))
            disorder_fractions.append(sum(residue_flags)/len(residue_flags))
        return seq_lengths, disorder_fractions

    # Summarize the three datasets (train, then test, then benchmark)
    train_xy = length_and_fraction(train_labels)
    test_xy = length_and_fraction(test_labels)
    benchmark_xy = length_and_fraction(benchmark_labels)

    set_font()
    palette = {
        'train': '#0072B2',
        'test': '#E69F00',
        'fusion': 'purple'
    }

    fig, ax = plt.subplots(figsize=(10, 6))

    # One scatter series per dataset, drawn in a fixed order
    series = (
        (train_xy, palette['train'], 'Train'),
        (test_xy, palette['test'], 'Test'),
        (benchmark_xy, palette['fusion'], 'Fusion'),
    )
    for (x_lengths, y_fractions), dataset_color, dataset_name in series:
        ax.scatter(x_lengths, y_fractions, color=dataset_color, label=dataset_name, alpha=0.7)

    # Axis labels, title, legend
    ax.set_xlabel('Length')
    ax.set_ylabel('Fraction of Disorder')
    ax.set_title('Length vs. Fraction of Disorder for Train, Test, and Benchmark Datasets')
    ax.legend()
    plt.tight_layout()
    plt.savefig(savepath)

def plot_disorder_content_hist(labels, ids, title="data", color="black", savepath='splits/disorder_content_histograms.png'):
    """
    Plot a histogram of percent disordered residues for ONE dataset.

    Also writes the plotted values as source data next to the figure, at the
    ``savepath`` location with its extension replaced by ``_source_data.csv``.

    Args:
        labels: collection of TRUE label strings, one per sequence, in
            ['11110000','0001110',...] format.
        ids: identifiers written to the 'ID' column of the source data; must have
            one entry per element of ``labels``.
        title: plot title.
        color: histogram bar color.
        savepath: output path of the figure.
    """
    set_font()

    # Percent of residues labelled 1 in each sequence
    frac_disorder = []
    for vec in labels:
        veclist = [int(x) for x in vec]
        frac_disorder.append(100*sum(veclist)/len(veclist)) # make it a percent, i like this better

    # save the source data
    source_data = pd.DataFrame(data={
        'ID': ids,
        'Percent_Disordered': frac_disorder
    })
    source_data['Percent_Disordered'] = source_data['Percent_Disordered'].round(3)
    # Derive the CSV path from the extension-less figure path. A plain
    # str.replace(".png", ...) left non-PNG paths unchanged, so the CSV was written
    # to the figure path and then overwritten by savefig below.
    source_root, _ = os.path.splitext(savepath)
    source_data.to_csv(f"{source_root}_source_data.csv", index=False)

    fig, ax = plt.subplots(1, 1, figsize=(20, 12))

    # Plot the histogram (large fonts: the figure itself is 20x12 inches)
    title_fontsize = 70
    axislabel_fontsize = 70
    tick_fontsize = 50
    ax.hist(frac_disorder, bins=20, color=color, alpha=0.7)
    ax.set_title(title, fontsize=title_fontsize)
    ax.set_xlabel('% Disordered', fontsize=axislabel_fontsize)
    ax.set_ylabel('Count', fontsize=axislabel_fontsize)
    ax.grid(True)
    ax.set_axisbelow(True)  # keep grid lines behind the bars
    ax.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    # Calculate the mean and median of the percent disorder
    mean_coverage = np.mean(frac_disorder)
    median_coverage = np.median(frac_disorder)

    # Add vertical line for the mean (dashed)
    ax.axvline(mean_coverage, color='black', linestyle='--', linewidth=2, label=f'Mean: {mean_coverage:.1f}%')

    # Add vertical line for the median (solid)
    ax.axvline(median_coverage, color='black', linestyle='-', linewidth=2, label=f'Median: {median_coverage:.1f}%')

    ax.legend(fontsize=50, title_fontsize=50)

    plt.tight_layout()
    plt.savefig(savepath)

def plot_group_disorder_content_hist(train_labels, test_labels, benchmark_labels, savepath='splits/disorder_content_histograms.png',orient='horizontal'):
    """
    Compare disorder content between the train, test, and fusion benchmark sets based on the TRUE labels.

    Draws three histograms of the fraction of disordered residues per sequence
    (train, test, fusion benchmark) and saves the figure to ``savepath``.

    Args:
        train_labels, test_labels, benchmark_labels: collections of label strings,
            one per sequence, in ['11110000','0001110',...] format.
        savepath: output path of the figure.
        orient: 'horizontal' for a 1x3 grid or 'vertical' for a 3x1 grid.

    Raises:
        ValueError: if ``orient`` is neither 'horizontal' nor 'vertical'.
    """
    # Validate early: any other value previously left `axes` undefined.
    if orient not in ('horizontal', 'vertical'):
        raise ValueError(f"orient must be 'horizontal' or 'vertical', got {orient!r}")

    def fraction_disordered(label_vectors):
        # Share of residues labelled 1 in each sequence
        fractions = []
        for vec in label_vectors:
            veclist = [int(x) for x in vec]
            fractions.append(sum(veclist)/len(veclist))
        return fractions

    train_frac_disorder = fraction_disordered(train_labels)
    test_frac_disorder = fraction_disordered(test_labels)
    benchmark_frac_disorder = fraction_disordered(benchmark_labels)

    # make a plot
    set_font()
    color_map = {
    'train': '#0072B2',
    'test': '#E69F00',
    'fusion': 'mediumpurple'
    }

    # Create a 1x3 subplot (1 row, 3 columns) or 3x1
    if orient == 'horizontal':
        fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=False)
    else:
        fig, axes = plt.subplots(3, 1, figsize=(5, 15), sharey=False)

    title_fontsize = 26
    axislabel_fontsize = 26
    tick_fontsize = 16

    panels = [
        ('CAID2 Train', color_map['train'], train_frac_disorder),
        ('CAID2 Test', color_map['test'], test_frac_disorder),
        ('Fusion Oncoproteins', color_map['fusion'], benchmark_frac_disorder),
    ]
    last_panel = len(panels) - 1
    for i, (panel_title, panel_color, fractions) in enumerate(panels):
        axes[i].hist(fractions, bins=20, color=panel_color, alpha=0.7)
        axes[i].set_title(panel_title, fontsize=title_fontsize)
        # x label: every panel in a row layout, only the bottom panel in a column layout
        if orient == 'horizontal' or i == last_panel:
            axes[i].set_xlabel('Fraction of Disorder', fontsize=axislabel_fontsize)
        # y label: every panel in a column layout, only the left-most panel in a row layout
        if orient == 'vertical' or i == 0:
            axes[i].set_ylabel('Frequency', fontsize=axislabel_fontsize)
        axes[i].grid(True)
        axes[i].set_axisbelow(True)  # keep grid lines behind the bars
        axes[i].tick_params(axis='both', which='major', labelsize=tick_fontsize)

    plt.tight_layout()
    plt.savefig(savepath)
    
def categorize_plddt(values):
    """
    Count pLDDT values per confidence bin.

    Bins are "<= 50", "50-70" (50 < x <= 70), "70-90" (70 < x <= 90) and "> 90".
    ``values`` is traversed exactly once, so one-shot iterables such as generators
    are counted correctly. Values that compare False against every bound (NaN) are
    not counted in any bin.

    Returns:
        dict mapping the four bin labels (in the order above) to integer counts.
    """
    categories = {"<= 50": 0, "50-70": 0, "70-90": 0, "> 90": 0}
    for x in values:
        if x <= 50:
            categories["<= 50"] += 1
        elif x > 90:
            categories["> 90"] += 1
        elif x > 70:
            categories["70-90"] += 1
        elif x > 50:
            categories["50-70"] += 1
    return categories


def plot_fusion_sequence_pLDDT_left_to_right(fusion_structure_data, fusiongene, save_path=''):
    """
    Plot each amino acid in the sequence as a separate colored bar based on pLDDT values.

    One horizontal strip is drawn per structure of ``fusiongene`` (sorted by
    'Fusion_Length'), with a marker to the right colored by the structure's overall
    'Fusion_pLDDT' and a black box around the breakpoint region when breakpoint
    indices are available.

    Args:
        fusion_structure_data: DataFrame with the columns 'FusionGene', 'seq_id',
            'Fusion_Length', 'Fusion_pLDDT', 'Fusion_AA_pLDDTs' (comma-separated
            per-residue values), and the 'top_hg_*' / 'top_tg_*' UniProt columns
            written to the source data file.
        fusiongene: value of 'FusionGene' to plot.
        save_path: directory for the outputs.

    Outputs:
        ``{save_path}/plddt_sequence_{fusiongene}_source_data.csv`` and
        ``{save_path}/plddt_sequence_{fusiongene with '::' -> '-'}.png``.
        The figure is also shown with ``plt.show()``.
    """
    set_font()
    # Filter for specific fusion data and preprocess
    df_of_interest = fusion_structure_data[fusion_structure_data['FusionGene'] == fusiongene].copy()
    df_of_interest['Fusion_AA_pLDDTs'] = df_of_interest['Fusion_AA_pLDDTs'].apply(lambda x: [float(i) for i in x.split(',')])
    df_of_interest['Label'] = df_of_interest['Fusion_Length'].astype(str) + 'AAs'

    # Sort data by Fusion_Length
    df_of_interest = df_of_interest.sort_values(by='Fusion_Length', ascending=True).reset_index(drop=True)
    n_structures = len(df_of_interest)

    # Define colors for each pLDDT range
    category_colors = {"<= 50": "#f27842", "50-70": "#f8d514", "70-90": "#60c1e8", "> 90": "#004ecb"}

    # Helper function to get color based on pLDDT
    def get_color(pLDDT):
        if pLDDT > 90:
            return category_colors["> 90"]
        elif pLDDT > 70:
            return category_colors["70-90"]
        elif pLDDT > 50:
            return category_colors["50-70"]
        else:
            return category_colors["<= 50"]

    # Helper to parse a comma-separated index string; missing (non-string) values give None
    def parse_indices(raw):
        if not isinstance(raw, str):
            return None
        return [int(x) for x in raw.split(',')]

    # Create exactly one figure; use a short one when there are only a few structures
    figsize = (10, 2) if n_structures < 3 else (10, 6)
    fig, ax = plt.subplots(figsize=figsize)

    df_of_interest['Fusion_AA_colors'] = df_of_interest['Fusion_AA_pLDDTs'].apply(lambda x: [get_color(plddt) for plddt in x])
    df_of_interest['Fusion_pLDDT_color'] = df_of_interest['Fusion_pLDDT'].apply(get_color)
    # just save the columns needed for the plot
    # NOTE(review): this file name uses the raw fusion gene (including '::'), unlike the PNG below
    df_of_interest[['FusionGene','seq_id','Fusion_Length','Fusion_pLDDT','Fusion_AA_pLDDTs','Fusion_AA_colors','Fusion_pLDDT_color',
                    'top_hg_UniProtID','top_hg_UniProt_isoform','top_hg_UniProt_fus_indices',
                    'top_tg_UniProtID','top_tg_UniProt_isoform','top_tg_UniProt_fus_indices']].to_csv(f"{save_path}/plddt_sequence_{fusiongene}_source_data.csv",index=False)

    # Draw the per-residue strips first so markers and boxes are added after all bars
    for idx, row in df_of_interest.iterrows():
        pLDDT_values = row['Fusion_AA_pLDDTs']
        ax.bar(range(len(pLDDT_values)),
               [0.7] * len(pLDDT_values), color=row['Fusion_AA_colors'], edgecolor='none',
               bottom=idx - 0.7 / 2)  # Centering each row at idx

    # Marker size depends only on the number of structures
    if n_structures > 10:
        markersize = 10
    elif n_structures > 5:
        markersize = 16
    else:
        markersize = 12
    # Markers sit just right of the longest sequence
    marker_x = 1.02*max(df_of_interest['Fusion_Length']) if n_structures else 0

    for idx, row in df_of_interest.iterrows():
        # Marker colored by this structure's own overall pLDDT (taken from the row,
        # so structures with equal lengths cannot pick up each other's value)
        ax.plot(marker_x,
                idx, marker='o', color="black", markersize=markersize,
                markerfacecolor=row['Fusion_pLDDT_color'], markeredgewidth=2)

        # Add breakpoint box - make sure we actually HAVE one of each
        hg_indices = parse_indices(row['top_hg_UniProt_fus_indices'])
        tg_indices = parse_indices(row['top_tg_UniProt_fus_indices'])
        print(hg_indices, tg_indices)

        if (hg_indices is not None) and (tg_indices is not None):
            box_start = min(hg_indices[-1],tg_indices[0])
            box_end = max(hg_indices[-1],tg_indices[0])
        elif hg_indices is not None:
            box_start, box_end = hg_indices[-1], hg_indices[-1]
        elif tg_indices is not None:
            box_start, box_end = tg_indices[0], tg_indices[0]
        else:
            # No breakpoint information for this structure: draw no box rather than
            # failing or reusing the previous row's coordinates
            print(f"no breakpoint indices for structure {idx}, fusion gene {fusiongene}; skipping box")
            continue

        print(f"box indices for structure {idx}, fusion gene {fusiongene}", box_start, box_end)

        # Outline the breakpoint region on this row's strip
        rect = patches.Rectangle((box_start, idx - 0.7 / 2), box_end-box_start, 0.7, linewidth=2, edgecolor='black', facecolor='none')
        ax.add_patch(rect)

    # Customize plot
    ax.set_yticks([])  # Hide y-axis ticks
    ax.set_yticklabels([])  # Hide y-axis labels
    ax.set_ylim(-0.5, n_structures - 0.5) # reduce white space at top
    ax.set_xlim(left=0)  # Start x-axis at 0 to make bars flush left
    ax.set_xlabel("Amino Acid Sequence (ordered)", fontsize=14)
    ax.tick_params(axis='x', labelsize=30)

    ax.set_title(f"{fusiongene} pLDDT Distribution by Amino Acid Sequence", fontsize=16)
    fig.tight_layout()

    # Save figure ('::' is replaced so the name is a plain file name)
    fusiongene_savename = fusiongene.replace("::","-")
    fig.savefig(f"{save_path}/plddt_sequence_{fusiongene_savename}.png", dpi=300)
    plt.show()
    
def plot_favorite_fusion_pLDDT_distribution(fusion_structure_data, fusiongene, save_path=''):
    """
    Make a horizontal stacked bar chart of the pLDDT distribution for one fusion gene.

    Each bar corresponds to one structure of ``fusiongene`` (sorted by ``Fusion_Length``)
    and is split into the four pLDDT bins ("<= 50", "50-70", "70-90", "> 90") using the
    per-bin values returned by ``categorize_plddt`` (presumably residue counts - confirm
    against that helper). A square marker coloured by the structure's average pLDDT
    (``Fusion_pLDDT``) is drawn just past the right end of each bar.

    Args:
        fusion_structure_data (pd.DataFrame): must contain the columns ``FusionGene``,
            ``Fusion_AA_pLDDTs`` (comma-separated per-residue pLDDT string),
            ``Fusion_Length`` and ``Fusion_pLDDT``.
        fusiongene (str): value of ``FusionGene`` to plot, e.g. "EWSR1::FLI1".
        save_path (str): directory in which the figure is saved. The default ``''``
            saves into the current working directory.

    Saves:
        ``plddt_dist_<fusiongene with "::" replaced by "-">.png`` (300 dpi) in ``save_path``.

    Note:
        Bars are keyed by the label ``"<Fusion_Length>AAs"``, so structures that share
        the same length collapse onto a single bar (the last one wins).
    """
    set_font()

    # Select the structures of this fusion gene and parse the per-residue pLDDT strings
    df_of_interest = fusion_structure_data[fusion_structure_data['FusionGene'] == fusiongene].copy()
    df_of_interest['Fusion_AA_pLDDTs'] = df_of_interest['Fusion_AA_pLDDTs'].apply(
        lambda x: [float(i) for i in x.split(',')]
    )
    df_of_interest['Label'] = df_of_interest['Fusion_Length'].astype(str) + 'AAs'
    # Sort data by Fusion_Length
    df_of_interest = df_of_interest.sort_values(by='Fusion_Length', ascending=True).reset_index(drop=True)

    # Convert to dictionary format (label -> per-residue pLDDTs / average pLDDT)
    data_dict = dict(zip(df_of_interest['Label'], df_of_interest['Fusion_AA_pLDDTs']))
    average_plddt = dict(zip(df_of_interest['Label'], df_of_interest['Fusion_pLDDT']))

    categories = ["<= 50", "50-70", "70-90", "> 90"]
    category_colors = {"<= 50": "#f27842", "50-70": "#f8d514", "70-90": "#60c1e8", "> 90": "#004ecb"}

    # Categorize each structure once and collect the per-category bar segment widths
    categorized_data = {label: categorize_plddt(plddt_values) for label, plddt_values in data_dict.items()}
    labels = list(categorized_data)
    counts = {cat: [categorized_data[label][cat] for label in labels] for cat in categories}

    # Create exactly one figure; a shorter canvas is used when there are fewer than 3 bars
    figsize = (10, 2) if len(data_dict) < 3 else (10, 6)
    fig, ax = plt.subplots(figsize=figsize)

    # Stack each category horizontally; left_edges tracks where the next segment starts
    left_edges = np.zeros(len(labels))
    for cat in categories:
        ax.barh(labels, counts[cat], label=cat, color=category_colors[cat], left=left_edges)
        left_edges += counts[cat]

    # Marker size and horizontal offset do not depend on the individual bar
    n_structures = len(df_of_interest)
    if n_structures > 10:
        markersize = 10
    elif n_structures > 5:
        markersize = 16
    else:
        markersize = 12
    marker_offset = .02 * df_of_interest['Fusion_Length'].max()

    # Mark each bar with a square coloured by the category of its average pLDDT
    for idx, label in enumerate(labels):
        avg_plddt_value = average_plddt[label]
        if avg_plddt_value > 90:
            color = category_colors["> 90"]
        elif avg_plddt_value > 70:
            color = category_colors["70-90"]
        elif avg_plddt_value > 50:
            color = category_colors["50-70"]
        else:
            color = category_colors["<= 50"]
        ax.plot(left_edges[idx] + marker_offset, idx, marker='s', color="black",
                markersize=markersize, markerfacecolor=color, markeredgewidth=2)

    # Large x tick labels; y tick marks and labels are hidden
    ax.tick_params(axis='x', labelsize=30)
    ax.tick_params(axis='y', left=False, labelleft=False)

    plt.tight_layout()
    fusiongene_savename = fusiongene.replace("::", "-")
    # os.path.join keeps the default save_path='' relative instead of pointing at "/"
    plt.savefig(os.path.join(save_path, f"plddt_dist_{fusiongene_savename}.png"), dpi=300)

def make_all_favorite_fusion_pLDDT_plots(favorite_fusions,left_to_right=True):
    """
    Draw one pLDDT figure per fusion gene listed in ``favorite_fusions``.

    The FusionPDB structure table is annotated with a ``seq_id`` (looked up from
    ``fuson_db`` by amino-acid sequence) and left-joined with the SwissProt top
    alignments on ``seq_id`` before plotting.

    Args:
        favorite_fusions (iterable of str): fusion gene names to plot.
        left_to_right (bool): if True, use ``plot_fusion_sequence_pLDDT_left_to_right``;
            otherwise use ``plot_favorite_fusion_pLDDT_distribution``.

    Figures are saved under ``processed_data/figures/fusion_disorder``.
    """
    structure_info = pd.read_csv('processed_data/fusionpdb/FusionPDB_level2-3_cleaned_structure_info.csv')
    top_alignments = pd.read_csv("../../data/blast/blast_outputs/swissprot_top_alignments.csv")
    fuson_db = pd.read_csv("../../data/fuson_db.csv")

    # Attach the FusOn-DB sequence ID to every structure via its amino-acid sequence
    seq_to_id = dict(zip(fuson_db['aa_seq'], fuson_db['seq_id']))
    structure_info['seq_id'] = structure_info['Fusion_Seq'].map(seq_to_id)
    structure_info = structure_info.merge(top_alignments, on="seq_id", how="left")

    # Pick the plotting routine once, then apply it to every requested fusion gene
    if left_to_right:
        plot_one = plot_fusion_sequence_pLDDT_left_to_right
    else:
        plot_one = plot_favorite_fusion_pLDDT_distribution

    for fusiongene in favorite_fusions:
        plot_one(structure_info, fusiongene, save_path='processed_data/figures/fusion_disorder')
    
def _plddt_string_to_label(plddt_string):
    """Convert a comma-separated per-residue pLDDT string into a label string via apply_plddt_thresh."""
    return ''.join(apply_plddt_thresh(float(value)) for value in plddt_string.split(','))


def prep_data_for_ht_disorder_comparison():
    """
    Build a table linking each fusion structure to its head and tail protein structures.

    Reads the head/tail structural data, the cleaned FusionPDB structure info and the
    FusOn head/tail database from their fixed CSV paths, then:
      1. joins the FusOn head/tail database onto the fusion structures by sequence,
      2. expands the comma-separated ``hgUniProt`` / ``tgUniProt`` lists to one row per
         (fusion, head UniProt ID, tail UniProt ID) combination,
      3. keeps only combinations where both IDs appear in the head/tail structure data,
      4. attaches the head (``hg_*``) and tail (``tg_*``) pLDDT columns and sequences,
      5. derives per-residue label strings (``hg_label``, ``tg_label``, ``fusion_label``)
         by applying ``apply_plddt_thresh`` to every per-residue pLDDT value.

    Returns:
        pd.DataFrame: the merged table with a fresh 0..n-1 index. Rows lacking any of the
        per-residue pLDDT strings (head, tail or fusion) are dropped.
    """
    ht_structure_data = pd.read_csv('processed_data/fusionpdb/heads_tails_structural_data.csv')
    fusion_structure_data = pd.read_csv('processed_data/fusionpdb/FusionPDB_level2-3_cleaned_structure_info.csv')

    all_hts_with_structures = ht_structure_data['UniProtID'].unique().tolist()

    fuson_ht_db = pd.read_csv('../../data/blast/fuson_ht_db.csv')[['seq_id','aa_seq','fusiongenes','hgUniProt','tgUniProt']]

    merge = pd.merge(
        fuson_ht_db.rename(columns={'aa_seq':'Fusion_Seq'}),
        fusion_structure_data[['FusionGID', 'Fusion_Seq','Fusion_pLDDT','Fusion_AA_pLDDTs']],
        on='Fusion_Seq',
        how='right'
    )

    # The right join leaves NaN head/tail IDs for structures missing from fuson_ht_db.
    # They can never pass the structure filter below and would break x.split(','),
    # so drop them up front.
    merge = merge.dropna(subset=['hgUniProt', 'tgUniProt']).copy()

    # One row per (fusion, head UniProt ID, tail UniProt ID) combination
    merge['hgUniProt'] = merge['hgUniProt'].apply(lambda x: x.split(','))
    merge['tgUniProt'] = merge['tgUniProt'].apply(lambda x: x.split(','))
    merge = merge.explode('hgUniProt')
    merge = merge.explode('tgUniProt')
    merge = merge.loc[
        merge['hgUniProt'].isin(all_hts_with_structures) &
        merge['tgUniProt'].isin(all_hts_with_structures)
    ].reset_index(drop=True)

    # Attach head (hg) and then tail (tg) structure columns from the same table
    for prefix in ('hg', 'tg'):
        merge = pd.merge(
            merge,
            ht_structure_data.rename(columns={
                'UniProtID': f'{prefix}UniProt',
                'Avg pLDDT': f'{prefix}_pLDDT',
                'All pLDDTs': f'{prefix}_AA_pLDDTs',
                'Seq': f'{prefix}_seq'}),
            on=f'{prefix}UniProt',
            how='inner'
        )

    # Labels can only be computed where the per-residue pLDDT string is present
    merge = merge.dropna(
        subset=['hg_AA_pLDDTs', 'tg_AA_pLDDTs', 'Fusion_AA_pLDDTs']
    ).reset_index(drop=True)

    # finally, calculate the per-residue labels
    label_sources = (
        ('hg_label', 'hg_AA_pLDDTs'),
        ('tg_label', 'tg_AA_pLDDTs'),
        ('fusion_label', 'Fusion_AA_pLDDTs'),
    )
    for label_col, plddt_col in label_sources:
        merge[label_col] = merge[plddt_col].apply(_plddt_string_to_label)

    return merge

def apply_plddt_thresh(y, threshold=68.8):
    """
    Binarize a single pLDDT value against a cutoff.

    Args:
        y (float): pLDDT value to classify.
        threshold (float): cutoff to compare against. Defaults to 68.8, the value
            that was previously hard-coded.

    Returns:
        str: '1' if ``y`` is strictly below ``threshold``, otherwise '0'
        (a NaN input compares False and therefore yields '0').
        NOTE(review): '1' presumably marks a disordered residue - confirm with callers.
    """
    return '1' if y < threshold else '0'

def plot_fusion_stats_boxplots(data, save_path="fusion_disorder_boxplots.png"):
    """
    Draw one pink box plot per column of ``data`` and save the figure.

    Args:
        data (pd.DataFrame): one column per metric; column names become the x tick
            labels. NaN entries are dropped per column before plotting.
        save_path (str): file path the figure is written to (300 dpi).
    """
    set_font()
    # Create box plots
    plt.figure(figsize=(6, 5))
    # for ones that are 100% disordered, AUROC was NaN, so drop these
    box = plt.boxplot([data[col].dropna() for col in data.columns], labels=data.columns, patch_artist=True)

    # Set color of each box plot
    for patch in box['boxes']:
        patch.set_facecolor('#ff68b4')
        patch.set_edgecolor('#ff68b4')

    # Keep the median line visible against the pink box
    for median in box['medians']:
        median.set_color('black')

    plt.title(f"Per-Residue Disorder (n={len(data)})",fontsize=22)
    plt.xticks(rotation=20,fontsize=22)
    plt.yticks(fontsize=22)

    plt.tight_layout()
    # Save BEFORE showing: with a blocking backend the figure is destroyed once the
    # window opened by plt.show() is closed, and a later savefig writes a blank image.
    plt.savefig(save_path,dpi=300)
    plt.show()

def plot_fusion_frac_disorder_r2(actual_values, predicted_values, save_path="fusion_pred_disorder_r2.png"):
    """
    Scatter predicted against actual values with an identity line and the R^2 score.

    Args:
        actual_values: sequence of reference values, drawn on the x axis
            (labelled "AlphaFold-pLDDT").
        predicted_values: sequence of predicted values, drawn on the y axis
            (labelled "FusOn-pLM-Diso"); same length as ``actual_values``.
        save_path (str): file path the figure is written to (300 dpi).

    NOTE(review): the R^2 annotation is placed at data coordinates (0, 92), which
    assumes the values are percentages on a 0-100 scale - confirm with callers.
    """
    set_font()
    plt.figure(figsize=(6, 6))
    r2 = r2_score(actual_values, predicted_values)

    plt.scatter(actual_values, predicted_values, alpha=0.5, label="Predictions", color="#ff68b4")
    # Identity line spanning the range of the actual values
    lowest, highest = min(actual_values), max(actual_values)
    plt.plot([lowest, highest], [lowest, highest], 'k--', label='Ideal Fit')
    plt.text(0, 92, f"$R^2$={r2:.2f}", fontsize=32)

    # Adjusting font sizes and setting font properties
    plt.xlabel('AlphaFold-pLDDT',size=32)
    plt.ylabel('FusOn-pLM-Diso',size=32)
    plt.title(f"% Disordered (n={len(actual_values)})",size=32)
    plt.xticks(fontsize=24)
    plt.yticks(fontsize=24)
    plt.legend(prop={'size': 16})
    plt.tight_layout()
    # Save BEFORE showing: with a blocking backend the figure is destroyed once the
    # window opened by plt.show() is closed, and a later savefig writes a blank image.
    plt.savefig(save_path, dpi=300)
    plt.show()
    
def main():
    """
    Generate the disorder benchmark figures.

    Steps:
      1. Plot the AUROC curve for FusOn-pLM against ESM-2 on the test split
         (``make_auroc_curve``), writing into ``results/final``.
      2. Plot disorder-content histograms for the CAID2 test set and for the head,
         tail and fusion proteins returned by ``prep_data_for_ht_disorder_comparison``.
      3. Plot per-sequence pLDDT figures for a fixed list of fusion genes.

    All input and output paths are hard-coded relative to the working directory.
    """
    set_font()
    # alternative for trial runs: output_dir = "results/test"
    output_dir = "results/final"

    # Sequence -> IDs / Label lookups for the test split
    test_df = pd.read_csv('splits/test_df.csv')
    seq_ids_dict = dict(zip(test_df['Sequence'], test_df['IDs']))
    seq_label_dict = dict(zip(test_df['Sequence'], test_df['Label']))
    make_auroc_curve(results_dir=output_dir,
                    seq_label_dict=seq_label_dict,
                    seq_ids_dict=seq_ids_dict,
                    path_to_results_of_interest="trained_models/fuson_plm/best/caid_hyperparam_screen_test_probs.csv",
                    model_alias="FusOn-pLM",
                    path_to_esm_results="trained_models/esm2_t33_650M_UR50D/best/caid_hyperparam_screen_test_probs.csv",
                    with_rankings=True)

    caid2_test_data = pd.read_csv("splits/splits.csv")
    caid2_test_data = caid2_test_data.loc[caid2_test_data['Split']=='Test']
    caid2_test_labels = caid2_test_data['Label'].tolist()
    caid2_test_ids = caid2_test_data['IDs'].tolist()

    # fusions, heads, and tails
    fusion_ht_data = prep_data_for_ht_disorder_comparison()
    os.makedirs("processed_data/figures",exist_ok=True)

    # De-duplicate by sequence so every protein is counted once per histogram
    head_data = fusion_ht_data.drop_duplicates(['hg_seq']).reset_index(drop=True)
    head_labels = head_data['hg_label'].tolist()
    head_ids = head_data['hgUniProt'].tolist()
    tail_data = fusion_ht_data.drop_duplicates(['tg_seq']).reset_index(drop=True)
    tail_labels = tail_data['tg_label'].tolist()
    tail_ids = tail_data['tgUniProt'].tolist()
    fusion_data = fusion_ht_data.drop_duplicates(['Fusion_Seq']).reset_index(drop=True)
    fusion_labels = fusion_data['fusion_label'].tolist()
    fusion_ids = fusion_data['seq_id'].tolist()

    plt.rc('text', usetex=False)
    math_part = r"$n$"

    os.makedirs("processed_data/figures/histograms",exist_ok=True)
    # (labels, ids, title prefix, colour, output file name) for each histogram
    histogram_specs = [
        (caid2_test_labels, caid2_test_ids, "CAID2 Disorder-NOX", "black", "disorder_nox_histogram.png"),
        (head_labels, head_ids, "Head Proteins", "#df8385", "heads_histogram.png"),
        (tail_labels, tail_ids, "Tail Proteins", "#6ea4da", "tails_histogram.png"),
        (fusion_labels, fusion_ids, "Fusion Oncoproteins", "mediumpurple", "fusions_histogram.png"),
    ]
    for labels, ids, title_prefix, color, filename in histogram_specs:
        plot_disorder_content_hist(
            labels,
            ids,
            title=f"{title_prefix} ({math_part}={len(labels):,})",
            color=color,
            savepath=f'processed_data/figures/histograms/{filename}',
        )

    os.makedirs("processed_data/figures/fusion_disorder",exist_ok=True)
    make_all_favorite_fusion_pLDDT_plots(
        [
            "EWSR1::FLI1",
            "PAX3::FOXO1",
            "EML4::ALK",
            "SS18::SSX1",
        ],
        left_to_right=True,
    )

# Run main() only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()