File size: 33,907 Bytes

bae913a

import os
import numpy as np
import re
import pandas as pd
import requests

from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.utils.constants import DELIMITERS, VALID_AAS
from fuson_plm.utils.data_cleaning import check_columns_for_listlike, find_invalid_chars

from fuson_plm.benchmarking.caid.scrape_fusionpdb import scrape_fusionpdb_level_2_3
from fuson_plm.benchmarking.caid.process_fusion_structures import process_fusions_and_hts
            
def download_fasta(uniprotid, includeIsoform, output_file, timeout=30):
    """
    Download the FASTA text for a UniProt accession and append it to a file.

    Args:
        uniprotid: accession inserted verbatim into the UniProt search query.
        includeIsoform: value inserted verbatim as the ``includeIsoform`` query parameter.
        output_file: path of the file the downloaded FASTA text is appended to.
        timeout: seconds to wait on the server before giving up (default 30).

    Request failures (connection errors, timeouts, HTTP error statuses) are
    logged through ``log_update`` instead of being raised; in that case nothing
    is written to ``output_file``.
    """
    url = f"https://rest.uniprot.org/uniprotkb/search?format=fasta&includeIsoform={includeIsoform}&query=accession%3A{uniprotid}&size=500&sort=accession+asc"
    try:
        # Without an explicit timeout requests waits indefinitely, so one
        # stalled connection would hang the whole pipeline.
        response = requests.get(url, timeout=timeout)

        # Raise an exception if the request was unsuccessful
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        log_update(f"An error occurred: {e}")
        return

    # Append (not overwrite) so that several accessions can share one FASTA file
    with open(output_file, 'a+') as file:
        file.write(response.text)

    log_update(f"FASTA file for {uniprotid} successfully downloaded and added to '{output_file}'")

# Test Sequences (CAID-2 Disorder-NOX)
def parse_caid_txt(fast_file):
    '''
    Parses a fasta-formatted text file where each record looks like:
    Line 1: >ID
    Line 2: Sequence (may span several lines)
    Line 3: Label (a line made up only of the characters 0, 1 and -)

    Blank lines are ignored. Only the first label line of a record is used;
    anything between it and the next ">" header is skipped.

    Returns:
        dict mapping sequence -> (label, ID). Because the dict is keyed by
        sequence, a sequence that appears more than once keeps only its last
        (label, ID) pair. Records without a sequence or a label are left out.
    '''
    seq_to_label = {}

    def store_record(seq_id, sequence, label):
        # Only complete records (sequence and label both present) are kept
        if label is not None and sequence:
            seq_to_label[sequence] = (label, seq_id)

    with open(fast_file, 'r') as file:
        seq_id = None
        label = None
        sequence_parts = []
        reading_sequence = False
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # An empty line trivially passes the all-characters-in-"01-" test
                # below, so it must be skipped or it is mistaken for an empty label.
                continue
            if line.startswith(">"):
                store_record(seq_id, "".join(sequence_parts), label)
                seq_id = line[1:]  # Capture the ID without the '>'
                label = None
                sequence_parts = []
                reading_sequence = True
            elif reading_sequence:
                if all(c in "01-" for c in line):
                    label = line
                    reading_sequence = False
                else:
                    sequence_parts.append(line)
        # The last record has no following header to trigger storage
        store_record(seq_id, "".join(sequence_parts), label)

    return seq_to_label

def check_df_for_mismatched_labels(sd):
    """
    Log every row of ``sd`` whose 'Sequence' and 'Label' differ in length,
    followed by the total number of such rows. Nothing is returned or modified.
    """
    log_update("\tChecking dataframe for mismatched sequences and labels...")
    n_mismatched = 0
    for row_index, record in sd.iterrows():
        seq_len = len(record['Sequence'])
        label_len = len(record['Label'])
        if seq_len == label_len:
            continue
        n_mismatched += 1
        log_update(f"\t\tLength mismatch at index {row_index}: sequence length = {seq_len}, label length = {label_len}")

    log_update(f"\t\tTotal mismatched lengths/labels: {n_mismatched}")


def process_caid2_disorder_nox_test(caid_path):
    """
    Build the test-split dataframe from the CAID-2_Disorder_NOX_Testing_Sequences.fasta file.

    Returns a dataframe with columns ID, Sequence, Label and Split (always 'Test'),
    after logging any rows whose sequence and label lengths differ.
    """
    log_update("Processing CAID-2-Disorder-NOX Testing Dataset")
    # parse_caid_txt returns sequence -> (label, ID); re-key the records by ID
    records_by_sequence = parse_caid_txt(caid_path)
    records_by_id = {
        seq_id: (sequence, label)
        for sequence, (label, seq_id) in records_by_sequence.items()
    }
    log_update(f"\tTotal sequences: {len(records_by_id)}")

    # Unpack the records into parallel columns
    ids, sequences, labels = [], [], []
    for seq_id, (sequence, label) in records_by_id.items():
        ids.append(seq_id)
        sequences.append(sequence)
        labels.append(label)

    caid_df = pd.DataFrame({
        'ID': ids,
        'Sequence': sequences,
        'Label': labels,
        'Split': 'Test'
    })

    check_df_for_mismatched_labels(caid_df)
    return caid_df
    
# Training Sequences (fldpnn and IDP-CRF)
# fldpnn Training Sequences
def parse_fldpnn_fasta(file_path):
    """
    Parse a flDPnn-format annotation file (e.g. flDPnn_Training_Dataset.txt).

    Each record has a header, a sequence, and 5 annotation lines. We only want
    the first annotation line (intrinsic disorder):

    >Disprot ID
    Amino acid sequence
    Experimental annotation for intrinsic disorder
    Experimental annotation for disordered protein binding
    Experimental annotation for disordered DNA binding
    Experimental annotation for disordered RNA binding
    Experimental annotation for disordered flexible linkers

    Lines made up only of capital letters are treated as sequence; every other
    non-blank line is treated as an annotation line. Blank lines are ignored.

    Returns:
        (ids, sequences, labels): three parallel lists, one entry per record
        that has both a sequence and at least one annotation line.

    Raises:
        ValueError: if such a record does not have exactly 5 annotation lines.
            This check now also covers the last record in the file.
    """
    expected_label_lines = 5
    sequence_line = re.compile('^[A-Z]+$')

    ids, sequences, labels = [], [], []

    def store_record(seq_id, sequence, first_label, n_label_lines):
        # Records with no sequence or no annotation at all are skipped
        if not sequence or first_label is None:
            return
        # We should have seen 5 annotation lines, otherwise something is wrong
        if n_label_lines != expected_label_lines:
            raise ValueError(
                f"Record '{seq_id}' in {file_path} has {n_label_lines} annotation lines; "
                f"expected {expected_label_lines}"
            )
        ids.append(seq_id)
        sequences.append(sequence)
        labels.append(first_label)

    seq_id = ""
    sequence = ""
    first_label = None
    n_label_lines = 0

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # A blank line would otherwise be counted as an annotation line
                continue
            if line.startswith('>'):
                store_record(seq_id, sequence, first_label, n_label_lines)
                seq_id = line[1:]  # Capture the ID without the '>'
                sequence = ""
                first_label = None
                n_label_lines = 0
            elif sequence_line.match(line):  # Sequence lines
                sequence += line
            else:  # Annotation lines: keep only the first one
                n_label_lines += 1
                if n_label_lines == 1:
                    first_label = line

    # Add the last sequence and label
    store_record(seq_id, sequence, first_label, n_label_lines)

    return ids, sequences, labels

def parse_idp_crf_fasta(file_path):
    """
    Parse an IDP-CRF-format file into parallel lists of IDs, sequences and labels.

    A line starting with '>' opens a new record (the rest of the line is its ID).
    Lines made up only of capital letters are appended to the record's sequence;
    all other lines are concatenated, in order, into the record's label.
    Records with no sequence or no label lines are left out.

    Returns:
        (ids, sequences, labels)
    """
    ids, sequences, labels = [], [], []
    looks_like_sequence = re.compile('^[A-Z]+$').match

    def emit(record_id, residues, label_chunks):
        # Keep the record only when both a sequence and label lines were seen
        if residues and label_chunks:
            ids.append(record_id)
            sequences.append(residues)
            labels.append(''.join(label_chunks))

    with open(file_path, 'r') as handle:
        record_id, residues, label_chunks = "", "", []
        for raw in handle.readlines():
            text = raw.strip()
            if text.startswith('>'):
                # Close out the previous record before starting the next one
                emit(record_id, residues, label_chunks)
                record_id, residues, label_chunks = text[1:], "", []
            elif looks_like_sequence(text):
                residues += text
            else:
                label_chunks.append(text)

        # The final record has no following header to close it
        emit(record_id, residues, label_chunks)

    return ids, sequences, labels

def process_fldpnn(fldpnn_path, split="training"):
    """
    Turn a flDPnn annotation file into a dataframe.

    Args:
        fldpnn_path: path to the flDPnn-format file.
        split: "training" marks every row as 'Train'; any other value marks it as 'Val'.

    Returns:
        Dataframe with columns Sequence, Label, Split and ID.
    """
    log_update(f"\nProcessing flDPnn {split} dataset")
    fldpnn_ids, fldpnn_seqs, fldpnn_labels = parse_fldpnn_fasta(fldpnn_path)

    # The parser already keeps only the first annotation line per record,
    # so the labels are taken over unchanged, one per sequence.
    cleaned_fldpnn_labels = [label for _, label in zip(fldpnn_seqs, fldpnn_labels)]

    log_update(f"\tTotal labels: {len(cleaned_fldpnn_labels)}, total sequences: {len(fldpnn_seqs)},total IDs: {len(fldpnn_ids)}")

    if split == "training":
        split_name = "Train"
    else:
        split_name = "Val"

    fldpnn_df = pd.DataFrame({
        'Sequence': fldpnn_seqs,
        'Label': cleaned_fldpnn_labels,
        "Split": split_name,
        "ID": fldpnn_ids,
    })
    check_df_for_mismatched_labels(fldpnn_df)

    return fldpnn_df

def combine_fldpnn_train_val(fldpnn_train_df, fldpnn_val_df):
    """
    Stack the flDPnn train and val dataframes and resolve repeated sequences.

    For each repeated sequence, the first 'Train' row and first 'Val' row are
    compared: if their labels differ, every row with that sequence is removed;
    if they agree, only the 'Train' rows are kept.
    """
    log_update("\nJoining flDPnn train and val sets into one training set for CAID predictor")
    combined = pd.concat([fldpnn_train_df, fldpnn_val_df])

    # check for duplicates
    repeated = combined['Sequence'].duplicated()
    duplicates = combined[repeated]['Sequence'].unique().tolist()
    n_rows_with_duplicates = int(combined['Sequence'].isin(duplicates).sum())
    log_update(f"\t{len(duplicates)} sequences in both train and val datasets, corresponding to {n_rows_with_duplicates} rows")

    for dup in duplicates:
        same_seq = combined['Sequence'] == dup
        in_train = combined['Split'] == 'Train'
        train_rows = combined.loc[same_seq & in_train]
        val_rows = combined.loc[same_seq & (combined['Split'] == 'Val')]

        train_id = train_rows['ID'].iloc[0]
        val_id = val_rows['ID'].iloc[0]
        train_label = train_rows['Label'].iloc[0]
        val_label = val_rows['Label'].iloc[0]
        log_update(f"\t\tTrain ID: {train_id}\tVal ID: {val_id}\tSame labels: {train_label==val_label}\tSequence: {dup}")

        if train_label == val_label:
            # Labels agree: drop only the non-train copies of this sequence
            log_update(f"\t\t\tSince labels are equal, removing validation copy")
            combined = combined.loc[~same_seq | in_train]
        else:
            # Labels disagree: the sequence is removed entirely
            log_update(f"\t\t\tSince labels are not equal, removing sequence completely")
            combined = combined[~same_seq].reset_index(drop=True)

    log_update(f"\tLength of joined flDPnn data: {len(combined)}")

    return combined
    
def process_idp_crf_train(idp_crf_train_path):
    """
    Turn the IDP-CRF training file into a dataframe, discarding records whose
    sequence and label lengths differ.

    Args:
        idp_crf_train_path: path to the IDP-CRF-format file.

    Returns:
        Dataframe with columns Sequence, Label, Split (always 'Train') and ID.
    """
    log_update("\nProcessing IDP-CRF training dataset")
    idp_crf_ids, idp_crf_seqs, idp_crf_labels = parse_idp_crf_fasta(idp_crf_train_path)
    log_update(f"\tTotal labels: {len(idp_crf_labels)}, total sequences: {len(idp_crf_seqs)}, total IDs: {len(idp_crf_ids)}")

    kept_ids, kept_seqs, kept_labels = [], [], []
    n_mismatched = 0
    log_update("\tCleaning labels and counting length-mismatched examples...")
    for i, (seq_id, seq, label) in enumerate(zip(idp_crf_ids, idp_crf_seqs, idp_crf_labels)):
        if len(seq) == len(label):
            # Lengths agree: keep the record as-is
            kept_ids.append(seq_id)
            kept_seqs.append(seq)
            kept_labels.append(label)
            continue
        # Lengths disagree: report the record and leave it out
        log_update(f"\t\tLength mismatch at index {i}: sequence length = {len(seq)}, label length = {len(label)}")
        n_mismatched += 1

    log_update(f"\t\tMismatched lengths/labels: {n_mismatched}")

    # Confirm that the final dataframe has no mismatched labels
    idp_crf_df = pd.DataFrame({
        'Sequence': kept_seqs,
        'Label': kept_labels,
        "Split": "Train",
        "ID": kept_ids,
    })
    check_df_for_mismatched_labels(idp_crf_df)

    return idp_crf_df

def find_agreeing_labels(row, lab1="", lab2=""):
    """
    Reconcile two label fields of a row into a single label.

    Args:
        row: mapping-like object (e.g. a dataframe row) indexable by lab1 and lab2.
        lab1: key of the first label field.
        lab2: key of the second label field.

    Returns:
        - the other label if one of the two is NaN (no conflict is possible);
        - the shared label if both are present and equal;
        - np.nan if both are present and disagree (or both are NaN).
    """
    def is_missing(value):
        # isinstance rather than type(...)==float, so that numpy float NaNs
        # (np.float64 subclasses float) are also recognised as missing
        return isinstance(value, float) and np.isnan(value)

    val1 = row[lab1]
    val2 = row[lab2]

    # If one of them is missing there is nothing to disagree with: use the other
    if is_missing(val1):
        return val2
    if is_missing(val2):
        return val1

    return val1 if val1 == val2 else np.nan

def get_unique_ids(row):
    """
    Return the row's distinct IDs as one comma-separated string.

    The IDs are taken in the order the sources are listed in row["Source"]
    (itself a comma-separated string); an ID already emitted is not repeated.
    """
    id_for_source = {
        "IDP-CRF": row["IDP-CRF ID"],
        "flDPnn": row["flDPnn ID"],
        "CAID-2_Disorder_NOX": row["CAID-2_Disorder_NOX ID"]
    }

    # dict.fromkeys keeps first-seen order while dropping repeats
    ordered_unique = dict.fromkeys(
        id_for_source[source_name] for source_name in row["Source"].split(",")
    )
    return ",".join(ordered_unique)

def parse_caid2_results(processed_caid2_df,lines):
    """
    Parse the lines of one CAID-2 results (.caid) file and attach the true labels.

    Layout read by this parser: a header line starting with '>' followed by the
    sequence ID, then one tab-separated line per residue with at least three
    fields (index, amino acid, probability). A line with exactly four fields
    carries the predicted label in the fourth. Blank lines and any lines before
    the first header are ignored.

    Args:
        processed_caid2_df: dataframe with 'ID', 'Sequence' and 'Label' columns.
        lines: the raw lines of the results file.

    Returns:
        Dataframe with columns seq_id, sequence, prob_1, pred_labels and labels,
        where prob_1, pred_labels and labels are comma-separated strings with one
        entry per residue. Only records whose ID, or failing that whose sequence,
        is found in processed_caid2_df are kept. The pred_labels column is dropped
        when the file carried no predicted labels.
    """
    # Build the lookups once instead of scanning the dataframe for every record
    label_by_id = dict(zip(processed_caid2_df['ID'], processed_caid2_df['Label']))
    label_by_sequence = dict(zip(processed_caid2_df['Sequence'], processed_caid2_df['Label']))
    n_reference_sequences = len(processed_caid2_df)

    results = {}
    cur_id = None
    for line in lines:
        if not line.strip():
            continue
        # If line starts with >, that means we have a new ID
        if line[0] == ">":
            cur_id = line[1:].strip()
            results[cur_id] = {
                'sequence': '',
                'prob_1': [],
                'pred_labels': []
            }
        elif cur_id is not None:
            # Extract the information - not every .caid file has predicted labels!!
            fields = line.rstrip('\r\n').split('\t')
            aa, prob = fields[1], fields[2]
            label = fields[3] if len(fields) == 4 else ''
            record = results[cur_id]
            record['sequence'] += aa
            record['prob_1'].append(prob)
            record['pred_labels'].append(label)

    # Finish every record exactly once, after all of its residue lines are read
    for seq_id, record in results.items():
        record['prob_1'] = ",".join(record['prob_1'])
        record['pred_labels'] = ",".join(record['pred_labels'])
        # get the true labels from the CAID2 dataset - IF POSSIBLE.
        # Match on ID first and fall back to the sequence, so that a record
        # known only by its sequence no longer fails on an empty ID lookup.
        true_labels = label_by_id.get(seq_id)
        if true_labels is None:
            true_labels = label_by_sequence.get(record['sequence'])
        record['labels'] = np.nan if true_labels is None else ",".join(true_labels)

    # Explicit columns keep the frame well-formed even when no record was parsed
    df = pd.DataFrame(
        [{'seq_id': seq_id, **record} for seq_id, record in results.items()],
        columns=['seq_id', 'sequence', 'prob_1', 'pred_labels', 'labels']
    )
    df = df.loc[df['labels'].notna()].reset_index(drop=True)
    # drop pred_labels if it's empty (i.e. it contains nothing but separators)
    if set(','.join(df['pred_labels'].tolist()))=={','}:
        df = df.drop(columns=['pred_labels'])
        log_update(f"\t\tno predicted labels provided for this dataset; only probabilities")
    log_update(f"\t\t{len(df)}/{n_reference_sequences} total CAID2-Nox sequences")
    return df
    
def parse_all_caid2_results(processed_caid2_df, caid_raw_folder="raw_data/caid2_competition_results",
                            save_dir="processed_data/caid2_competition_results"):
    """
    Convert every CAID-2 competition results file in a folder to CSV.

    Args:
        processed_caid2_df: dataframe passed through to parse_caid2_results.
        caid_raw_folder: folder holding the raw results files.
        save_dir: folder the CSVs are written to (created if missing).

    Each file is written as "<name>_CAID-2_Disorder_NOX.csv", where <name> is the
    part of the file name before ".caid". Sub-directories of caid_raw_folder are
    skipped, and files are processed in sorted order so the log is reproducible.
    """
    os.makedirs(save_dir,exist_ok=True)

    log_update(f"\nExtracting all CAID-2_Disorder_NOX results from CAID2 competition results files...")
    for caid_file in sorted(os.listdir(caid_raw_folder)):
        raw_path = os.path.join(caid_raw_folder, caid_file)
        if not os.path.isfile(raw_path):
            continue
        with open(raw_path, "r") as f:
            lines = f.readlines()
        log_update(f"\t{caid_file}:")
        results_df = parse_caid2_results(processed_caid2_df,lines)
        # save it
        competitor_name = caid_file.split('.caid')[0]
        results_df.to_csv(os.path.join(save_dir, f"{competitor_name}_CAID-2_Disorder_NOX.csv"),index=False)
    
def make_train_df(fldpnn_df, idp_crf_df):
    """
    Make the training dataframe by joining the two processed training sets on sequence.

    Steps:
        1. Outer-merge IDP-CRF and flDPnn on 'Sequence', recording in 'Source'
           which dataset(s) each sequence came from.
        2. Reconcile the two label columns into 'Label' (NaN where they disagree).
        3. Drop one hard-coded sequence with a known annotation conflict, and
           every row whose labels disagree.
        4. Drop rows whose sequence contains characters outside VALID_AAS.

    Args:
        fldpnn_df: dataframe with Sequence, Label, Split and ID columns.
        idp_crf_df: dataframe with Sequence, Label, Split and ID columns.

    Returns:
        Dataframe with columns Sequence, IDP-CRF ID, flDPnn ID, Split, Source,
        Label and No Label Conflicts.
    """
    # Rename per-dataset columns so we can track where each label/ID came from
    idp_crf_df = idp_crf_df.rename(columns={'Label':'IDP-CRF Label', 'ID': 'IDP-CRF ID'}).drop(columns=['Split'])
    fldpnn_df = fldpnn_df.rename(columns={'Label':'flDPnn Label', 'ID': 'flDPnn ID'}).drop(columns=['Split'])

    ########### Combine fldpnn and idp crf
    log_update("\nJoining flDPnn and IDP-CRF data by sequence make unified training set")
    train_df = pd.merge(idp_crf_df,
                        fldpnn_df,
                        on='Sequence',
                        how='outer',
                        indicator=True)
    train_df["Split"] = ["Train"]*len(train_df)
    # Map _merge column to desired labels
    train_df['Source'] = train_df['_merge'].map({
        'left_only': 'IDP-CRF',
        'right_only': 'flDPnn',
        'both': 'IDP-CRF,flDPnn'
    })
    train_df = train_df.drop(columns=["_merge"])
    log_update(f"\tIDP-CRF dataset size: {len(idp_crf_df)}\n\tfLDpnn dataset size: {len(fldpnn_df)}\n\tinitial train dataset size: {len(train_df)}")

    # Check for duplicate sequences
    log_update(f"\tChecking for sequences in both datasets...")
    duplicates = train_df[train_df["Source"].str.contains(",")]['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(train_df[train_df['Sequence'].isin(duplicates)])
    log_update(f"\t\t{len(duplicates)} sequences in both datasets, corresponding to {n_rows_with_duplicates} rows")

    # Check for consistency between IDP-CRF Label and flDPnn label
    train_df["Label"] = train_df.apply(lambda row: find_agreeing_labels(row,lab1="IDP-CRF Label",lab2="flDPnn Label"),axis=1)
    train_df["No Label Conflicts"]= ~train_df["Label"].isna()
    log_update(f"\tChecked for label inconsistencies between IDP-CRF and flDPnn on the same sequence:")
    # rename_axis + reset_index(name=...) yields the same two columns on pandas 1.x and 2.x
    match_str = train_df['No Label Conflicts'].value_counts().rename_axis('No Label Conflicts').reset_index(name='count').to_string(index=False)
    match_str = "\t\t" + match_str.replace("\n","\n\t\t")
    log_update(match_str)

    # Drop the row with a known conflict with disprot, plus rows where labels don't match
    conflict_seq="MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKRMRRRIESEDEEDSSQDALVPRTPSPRPSTSAADLAIAPKKKKKRPSPKPERPPSPEVIVDSEEEREDVALQMVGFSNPPVLIKHGKGGKRTVRRLNEDDPVARGMRTQEEEEEPSEAESEITVMNPLSVPIVSAWEKGMEAARALMDKYHVDNDLKANFKLLPDQVEALAAVCKTWLNEEHRGLQLTFTSKKTFVTMMGRFLQAYLQSFAEVTYKHHEPTGCALWLHRCAEIEGELKCLHGSIMINKEHVIEMDVTSENGQRALKEQSSKAKIVKNRWGRNVVQISNTDARCCVHDAACPANQFSGKSCGMFFSEGAKAQVAFKQIKAFMQALYPNAQTGHGHLLMPLRCECNSKPGHAPFLGRQLPKLTPFALSNAEDLDADLISDKSVLASVHHPALIVFQCCNPVYRNSRAQGGGPNCDFKISAPDLLNALVMVRSLWSENFTELPRMVVPEFKWSTKHQYRNVSLPVAHSDARQNPFDF"
    keep_rows = (train_df['Sequence']!=conflict_seq) & train_df['No Label Conflicts']
    # Count what is actually removed rather than assuming the known sequence is present exactly once
    n_dropped = len(train_df) - int(keep_rows.sum())
    log_update(f"\tDropping rows with label mismatch or known error (total={n_dropped})")
    train_df = train_df.loc[keep_rows].reset_index(drop=True)

    # The per-dataset label columns are now redundant with the reconciled 'Label'
    train_df = train_df.drop(columns=["IDP-CRF Label","flDPnn Label"])
    log_update(f"\t\tNew dataset size: {len(train_df)}")

    ######## Final checks
    # Check for any list-like entries in the columns of interest
    cols_of_interest =  ['Sequence','Split','Label','IDP-CRF ID','flDPnn ID']
    check_columns_for_listlike(train_df, cols_of_interest, DELIMITERS)

    # Check for invalid characters
    train_df['invalid_chars'] = train_df['Sequence'].apply(lambda x: find_invalid_chars(x, VALID_AAS))
    all_invalid_chars = set().union(*train_df['invalid_chars'])
    log_update(f"\tchecking for invalid characters...\n\t\tset of all invalid characters discovered within train_df: {all_invalid_chars}")

    # Dropping rows with invalid characters (should be none)
    has_invalid_chars = train_df['invalid_chars'].str.len()>0
    log_update(f"\tDropping rows with invalid characters (total={len(train_df[has_invalid_chars])})")
    train_df = train_df.loc[train_df['invalid_chars'].str.len()==0].reset_index(drop=True)
    train_df = train_df.drop(columns=['invalid_chars'])
    log_update(f"\t\tNew dataset size: {len(train_df)}")

    source_str = train_df['Source'].value_counts().rename_axis('Source').reset_index(name='count').to_string(index=False)
    source_str = "\t\t" + source_str.replace("\n","\n\t\t")
    log_update(f"\tSources:\n{source_str}")
    return train_df

def make_train_and_test_df(train_df, test_df):
    """
    Combine the training and testing dataframes into one splits dataframe.

    Sequences that occur more than once in the combined frame are removed from
    the train split and kept in the test split. Test rows take their 'Label'
    from the CAID-2_Disorder_NOX label, and the per-source ID columns are
    merged into a single comma-separated 'IDs' column.

    Args:
        train_df: output of make_train_df (must include 'No Label Conflicts').
        test_df: dataframe with ID, Sequence, Label and Split columns. It is not
            modified; the 'Source' column is added to a copy.

    Returns:
        Dataframe with columns Sequence, IDs, Split, Source and Label.
    """
    log_update("\nMaking final dataframe with train and test splits")
    # assign() returns a new frame, so the caller's dataframe is left untouched
    test_df = test_df.assign(Source="CAID-2_Disorder_NOX")
    # Concatenate proposed train and test
    splits_df = pd.concat([train_df.drop(columns=['No Label Conflicts']),
                           test_df.rename(columns={'ID':'CAID-2_Disorder_NOX ID', 'Label': 'CAID-2_Disorder_NOX Label'})])
    log_update(f"\tTrain dataset size: {len(train_df)}\n\tTest dataset size: {len(test_df)}\n\tinitial combined dataset size: {len(splits_df)}")

    # Check for duplicates - if we find any, REMOVE them from train and keep them in test
    duplicates = splits_df[splits_df.duplicated('Sequence')]['Sequence'].unique().tolist()
    is_duplicated = splits_df['Sequence'].isin(duplicates)
    log_update(f"\t\t{len(duplicates)} duplicated sequences, corresponding to {int(is_duplicated.sum())} rows")
    for i, dup in enumerate(duplicates):
        same_seq = splits_df['Sequence']==dup
        train_rows = splits_df.loc[same_seq & (splits_df['Split']=='Train')]
        test_rows = splits_df.loc[same_seq & (splits_df['Split']=='Test')]
        # .item() requires exactly one matching row in each split
        fldpnn_id = train_rows['flDPnn ID'].item()
        idp_crf_id = train_rows['IDP-CRF ID'].item()
        caid2_disorder_nox_id = test_rows['CAID-2_Disorder_NOX ID'].item()
        log_update(f"\t\t\t{i+1}: flDPnn ID: {fldpnn_id}\tIDP-CRF ID: {idp_crf_id}\tCAID-2_Disorder_NOX ID: {caid2_disorder_nox_id}\n\t\t\t\tSequence: {dup}")
    # remove from train and keep in test: a row stays if its sequence is not
    # duplicated, or if it is the test-split copy
    splits_df = splits_df.loc[
        ~is_duplicated | (splits_df['Split']=='Test')
    ].reset_index(drop=True)
    # rename_axis + reset_index(name=...) yields the same two columns on pandas 1.x and 2.x
    split_str = splits_df['Split'].value_counts().rename_axis('Split').reset_index(name='count').to_string(index=False)
    split_str = "\t\t" + split_str.replace("\n","\n\t\t")
    log_update(f"\tRemoved duplicate sequences from training split, kept in test split\n\t\tNew dataset size: {len(splits_df)}\n\n{split_str}")

    # At this point every train row should already have a 'Label', and only the
    # test rows should still be missing one (filled in from the CAID label next)
    splits_missing_label = splits_df.loc[splits_df["Label"].isna(), "Split"].value_counts().index.tolist()
    assert splits_missing_label==['Test']
    is_test = splits_df["Split"]=="Test"
    splits_df.loc[is_test,"Label"] = splits_df.loc[is_test,"CAID-2_Disorder_NOX Label"]
    splits_df = splits_df.drop(columns=["CAID-2_Disorder_NOX Label"])
    # Make sure there are no na's in label
    assert len(splits_df[splits_df["Label"].isna()])==0

    # Print out distribution of sources
    source_counts = splits_df['Source'].value_counts()
    source_str = source_counts.rename_axis('Source').reset_index(name='count').to_string(index=False)
    source_str = "\t\t" + source_str.replace("\n","\n\t\t")
    total_sources = int(source_counts.sum())
    assert total_sources == len(splits_df)
    log_update(f"\n\tSource distribution:\n{source_str}\n\n\t\t\t\t\t\tSum:  {total_sources}")

    # Print largest and smallest seq len in each set
    train_lengths = splits_df.loc[splits_df['Split']=='Train','Sequence'].apply(len).tolist()
    test_lengths = splits_df.loc[is_test,'Sequence'].apply(len).tolist()
    longest_train, shortest_train = max(train_lengths), min(train_lengths)
    longest_test, shortest_test = max(test_lengths), min(test_lengths)
    log_update(f"\n\tLength distributions...\n\t\tTrain: max={longest_train}\tmin={shortest_train}\n\t\tTest: max={longest_test}\tmin={shortest_test}")

    # Consolidate the IDs a bit
    splits_df["IDs"] = splits_df.apply(lambda row: get_unique_ids(row),axis=1)
    assert len(splits_df[splits_df["IDs"].isna()])==0
    n_different_ids = len(splits_df.loc[splits_df["IDs"].str.contains(",")])
    log_update(f"\n\tProvided comma-separated IDs in same listed order as Source\n\t\t- train: IDP-CRF first, flDPnn second ({n_different_ids} seqs have multiple distinct IDs)\n\t\t- test: CAID-2_Disorder_NOX")

    # Keep only desired columns
    splits_df = splits_df[[
        'Sequence','IDs','Split','Source','Label'
    ]]

    return splits_df

def main():
    """
    Run the full data-cleaning pipeline, logging to data_cleaning_log.txt.

    1. Process the CAID-2 Disorder-NOX test set and the flDPnn / IDP-CRF training
       sets, writing each to processed_data/.
    2. Merge them into train/test splits written to splits/.
    3. Convert the CAID-2 competition results files to CSV.
    4. Scrape and process FusionPDB structures, then build a fusion benchmark
       set (splits/fusion_bench_df.csv) from fusion sequences that have a
       structure, are in the FusOn-pLM test set, and are not in the CAID train
       set. Its labels are derived from per-residue pLDDT.
    5. Write all splits together to splits/splits.csv.
    """
    with open_logfile("data_cleaning_log.txt"):
        rawdata_train_test_path = "raw_data/caid2_train_and_test_data"
        # make directory to save processed data
        processeddata_path = "processed_data"
        splits_path = "splits"
        os.makedirs(processeddata_path,exist_ok=True)
        os.makedirs(splits_path,exist_ok=True)

        # Process CAID-2_Disorder_NOX_Testing_Sequences dataset from fasta file
        caid_path = f"{rawdata_train_test_path}/CAID-2_Disorder_NOX_Testing_Sequences.fasta"
        caid_df = process_caid2_disorder_nox_test(caid_path)
        caid_df.to_csv(f"{processeddata_path}/CAID-2_Disorder_NOX_Processed.csv", index=False)

        # Process fldpnn Training and Validation Datasets
        fldpnn_train_path = f"{rawdata_train_test_path}/flDPnn_Training_Dataset.txt"
        fldpnn_val_path = f"{rawdata_train_test_path}/flDPnn_Validation_Annotation.txt"
        fldpnn_train_df = process_fldpnn(fldpnn_train_path, split="training")
        fldpnn_val_df = process_fldpnn(fldpnn_val_path, split="validation")
        fldpnn_train_df.to_csv(f"{processeddata_path}/flDPnn_Training_Dataset.csv", index=False)
        fldpnn_val_df.to_csv(f"{processeddata_path}/flDPnn_Validation_Dataset.csv", index=False)
        # Combine train and val
        fldpnn_df = combine_fldpnn_train_val(fldpnn_train_df, fldpnn_val_df)

        # Process IDP-CRF_Training_Dataset
        idp_crf_train_path = f"{rawdata_train_test_path}/IDP-CRF_Training_Dataset.txt"
        idp_crf_df= process_idp_crf_train(idp_crf_train_path)
        idp_crf_df.to_csv(f"{processeddata_path}/IDP-CRF_Training_Dataset.csv", index=False)

        # Merge
        train_df = make_train_df(fldpnn_df, idp_crf_df)

        # Make a full splits file
        splits_df = make_train_and_test_df(train_df, caid_df)
        final_train_df = splits_df.loc[splits_df['Split']=='Train'].reset_index(drop=True)
        final_test_df = splits_df.loc[splits_df['Split']=='Test'].reset_index(drop=True)

        # Save final files
        final_train_df.to_csv(f"{splits_path}/train_df.csv", index=False)
        final_test_df.to_csv(f"{splits_path}/test_df.csv", index=False)

        # Process the caid competition results and save them in a more accessible format.
        # dtype=str keeps IDs and label strings exactly as written (an all-digit
        # label column would otherwise be parsed as numbers).
        processed_caid2_df = pd.read_csv(f"{processeddata_path}/CAID-2_Disorder_NOX_Processed.csv", dtype=str)
        parse_all_caid2_results(processed_caid2_df)

        # Process data for visualizing fusion oncoproteins
        # Scrape FusionPDB
        scrape_fusionpdb_level_2_3()
        # Process the structures that we downloaded from scraping
        process_fusions_and_hts()

        # Now, figure out which structures are in the test set and isolate those for benchmarking in splits/fusion_bench_df.csv
        fusion_test_set = pd.read_csv("../../data/splits/test_df.csv")
        # columns are sequence, member length, snp_probabilities
        fusion_test_set = set(fusion_test_set['sequence'].tolist())
        log_update(f"\nFinding level 2 and 3 fusion structures that are in the FusOn-pLM test set...\n\tTest set size: {len(fusion_test_set)} seqs")
        level_2_3_info = pd.read_csv('processed_data/fusionpdb/FusionPDB_level2-3_cleaned_structure_info.csv')
        # Make sure we've got a structure. The mask is built from, and applied to,
        # the same frame: a mask taken from the full table must not be applied to a
        # de-duplicated, re-indexed copy, because the rows would no longer line up.
        structured_info = level_2_3_info.loc[level_2_3_info['Fusion_pLDDT'].notna()]
        # there are duplicate sequences in here; the set collapses them
        level_2_3_seqs = set(structured_info['Fusion_Seq'].tolist())
        # if it has a structure, it's in the test set, and it's not in the caid train set, we can benchmark with it
        test_benchmark_seqs = fusion_test_set.intersection(level_2_3_seqs)
        log_update(f"\tTotal fusion proteins in the FusOn-pLM test set: {len(test_benchmark_seqs)}")
        caid_train_set = set(pd.read_csv('splits/train_df.csv')['Sequence'].tolist())
        test_benchmark_seqs = test_benchmark_seqs.difference(caid_train_set)    # subtract off the caid train set to be sure
        log_update(f"\tTotal fusion proteins in the FusOn-pLM test set and NOT in the CAID train set: {len(test_benchmark_seqs)}")

        # Finally, make a dataframe structured like train_df and test_df. Columns are: Sequence,IDs,Split,Source,Label
        # Let's make the IDs FusionGID
        test_benchmark_df = pd.DataFrame(
            # sorted so that row order (and the saved CSV) is reproducible across runs
            data = {'Sequence': sorted(test_benchmark_seqs)}
        )
        # Build the lookups from rows that have a structure, so a duplicate row
        # without one can never supply the ID or the pLDDTs
        seq_id_dict = dict(zip(structured_info['Fusion_Seq'],structured_info['FusionGID']))
        seq_plddts_dict = dict(zip(structured_info['Fusion_Seq'],structured_info['Fusion_AA_pLDDTs']))
        test_benchmark_df['IDs'] = test_benchmark_df['Sequence'].map(seq_id_dict)
        test_benchmark_df['Split'] = ['Fusion_Benchmark']*len(test_benchmark_df)
        test_benchmark_df['Source'] = ['FusionPDB_AlphaFold2']*len(test_benchmark_df)

        def plddts_to_disorder_label(plddt_str):
            # "91.2,40.1,..." -> "01...": disordered ('1') if pLDDT is < 68.8,
            # according to the AlphaFold-pLDDT published threshold
            return ''.join('1' if float(value) < 68.8 else '0' for value in plddt_str.split(","))

        test_benchmark_df['Label'] = test_benchmark_df['Sequence'].map(seq_plddts_dict).apply(plddts_to_disorder_label)

        # check lengths
        test_benchmark_df['SeqLen'] = test_benchmark_df['Sequence'].apply(lambda x: len(x))
        test_benchmark_df['LabelLen'] = test_benchmark_df['Label'].apply(lambda x: len(x))
        log_update(f"\tAll seq lengths and label lengths match: {(test_benchmark_df['SeqLen']==test_benchmark_df['LabelLen']).all()}")
        test_benchmark_df = test_benchmark_df.drop(columns=['SeqLen','LabelLen'])

        # convert to string; copy first so the truncation only touches the preview
        test_benchmark_df_str = test_benchmark_df.head(10).copy()
        test_benchmark_df_str['Sequence'] = test_benchmark_df_str['Sequence'].apply(lambda x: x[0:10]+'...')
        test_benchmark_df_str['Label'] = test_benchmark_df_str['Label'].apply(lambda x: x[0:10]+'...')
        test_benchmark_df_str = test_benchmark_df_str.to_string(index=False)
        test_benchmark_df_str = "\t" + test_benchmark_df_str.replace("\n","\n\t")
        log_update(f"\nPreview of benchmarking set:\n{test_benchmark_df_str}")
        test_benchmark_df.to_csv('splits/fusion_bench_df.csv',index=False)

        # Add the benchmarking sequences to split
        log_update(f"\nAdding benchmarking sequences to splits_df.csv:\n\tLength before adding bench seqs: {len(splits_df)}")
        splits_df = pd.concat([splits_df,test_benchmark_df])
        log_update(f"\tLength after adding bench seqs: {len(splits_df)}")
        # rename_axis + reset_index(name=...) yields the same two columns on pandas 1.x and 2.x
        split_str = splits_df['Split'].value_counts().rename_axis('Split').reset_index(name='count').to_string(index=False)
        split_str = "\t" + split_str.replace("\n","\n\t")
        log_update(f"Distribution among splits:\n{split_str}")
        splits_df.to_csv(f"{splits_path}/splits.csv",index=False)

# Run the pipeline only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()