import os
import numpy as np
import re
import pandas as pd
import requests
from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.utils.constants import DELIMITERS, VALID_AAS
from fuson_plm.utils.data_cleaning import check_columns_for_listlike, find_invalid_chars
from fuson_plm.benchmarking.caid.scrape_fusionpdb import scrape_fusionpdb_level_2_3
from fuson_plm.benchmarking.caid.process_fusion_structures import process_fusions_and_hts
def download_fasta(uniprotid, includeIsoform, output_file):
try:
url = f"https://rest.uniprot.org/uniprotkb/search?format=fasta&includeIsoform={includeIsoform}&query=accession%3A{uniprotid}&size=500&sort=accession+asc"
# Send a GET request to the URL
response = requests.get(url)
# Raise an exception if the request was unsuccessful
response.raise_for_status()
# Append the response text to the output file in text mode
with open(output_file, 'a+') as file:
file.write(response.text)
log_update(f"FASTA file for {uniprotid} successfully downloaded and added to '{output_file}'")
except requests.exceptions.RequestException as e:
log_update(f"An error occurred: {e}")
# Test Sequences (CAID-2 Disorder-NOX)
def parse_caid_txt(fasta_file):
'''
Parses a FASTA-formatted text file where each record consists of:
Line 1: ID
Line 2: Sequence
Line 3: Label
'''
seq_to_label = {}
id_to_sequence = {}
with open(fasta_file, 'r') as file:
label = None
sequence = ""
seq_id = None
reading_sequence = False
for line in file:
line = line.strip()
if line.startswith(">"):
if label is not None and sequence:
seq_to_label[sequence] = (label, seq_id)
seq_id = line[1:] # Capture the ID without the '>'
label = None
sequence = ""
reading_sequence = True
elif reading_sequence:
if all(c in "01-" for c in line):
label = line
reading_sequence = False
else:
sequence += line
if label is not None and sequence:
seq_to_label[sequence] = (label, seq_id)
return seq_to_label
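# Illustrative input record and return value for parse_caid_txt (the ID, sequence, and
# label below are made up; the parser accepts '0', '1', and '-' in label lines):
#   >example_caid_id
#   MSEDKLPA
#   00111-00
# would yield {"MSEDKLPA": ("00111-00", "example_caid_id")}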
def check_df_for_mismatched_labels(sd):
log_update("\tChecking dataframe for mismatched sequences and labels...")
counter=0
for idx, row in sd.iterrows():
seq = row['Sequence']
label = row['Label']
if len(seq) != len(label):
counter+=1
log_update(f"\t\tLength mismatch at index {idx}: sequence length = {len(seq)}, label length = {len(label)}")
log_update(f"\t\tTotal mismatched lengths/labels: {counter}")
def process_caid2_disorder_nox_test(caid_path):
"""
Processes the CAID-2_Disorder_NOX_Testing_Sequences.fasta file
"""
log_update("Processing CAID-2-Disorder-NOX Testing Dataset")
# Parse the fasta file
caid_dict = parse_caid_txt(caid_path)
# Gather the sequences
caid_seqs = {}
for k, (v, seq_id) in caid_dict.items():
caid_seqs[seq_id] = (k, v)
log_update(f"\tTotal sequences: {len(caid_seqs)}")
# Form dataframe from processed data
caid_df = pd.DataFrame({
'ID': list(caid_seqs.keys()),
'Sequence': [seq for seq, _ in caid_seqs.values()],
'Label': [lbl for _, lbl in caid_seqs.values()],
'Split': 'Test'
})
check_df_for_mismatched_labels(caid_df)
return caid_df
# Training Sequences (flDPnn and IDP-CRF)
# flDPnn Training Sequences
def parse_fldpnn_fasta(file_path):
"""
Parse flDPnn_Training_Dataset.txt, where each record has one sequence line followed by 5 annotation lines. We only keep the first annotation (intrinsic disorder):
>DisProt ID
Amino acid sequence
Experimental annotation for intrinsic disorder
Experimental annotation for disordered protein binding
Experimental annotation for disordered DNA binding
Experimental annotation for disordered RNA binding
Experimental annotation for disordered flexible linkers
"""
sequences = []
labels = []
ids = []
with open(file_path, 'r') as file:
lines = file.readlines()
seq_id = ""
current_sequence = ""
seen_label_lines = 0 # should go up to 5 for each
current_labels = []
is_label = False
for line in lines:
line = line.strip()
if line.startswith('>'):
if current_sequence and current_labels:
assert seen_label_lines==5 # we should've seen 5 labels, otherwise something is wrong
ids.append(seq_id)
sequences.append(current_sequence)
labels.append(''.join(current_labels))
seq_id = line[1:] # Capture the ID without the '>'
current_sequence = ""
current_labels = []
is_label = False
seen_label_lines = 0
elif re.match('^[A-Z]+$', line): # Sequence lines
current_sequence += line
else: # Label lines
seen_label_lines+=1
if seen_label_lines==1:
current_labels.append(line)
is_label = True
# Add the last sequence and labels
if current_sequence and current_labels:
sequences.append(current_sequence)
labels.append(''.join(current_labels))
ids.append(seq_id)
return ids, sequences, labels
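# Illustrative call to parse_fldpnn_fasta; the path matches the one constructed in main():
#   ids, seqs, labels = parse_fldpnn_fasta("raw_data/caid2_train_and_test_data/flDPnn_Training_Dataset.txt")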
def parse_idp_crf_fasta(file_path):
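"""
Parse IDP-CRF_Training_Dataset.txt: each record is a '>' header line, one or more
uppercase amino-acid sequence lines, and one or more label lines, which are concatenated.
Returns (ids, sequences, labels).
"""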
sequences = []
labels = []
ids = []
with open(file_path, 'r') as file:
lines = file.readlines()
seq_id = ""
current_sequence = ""
current_labels = []
is_label = False
for line in lines:
line = line.strip()
if line.startswith('>'):
if current_sequence and current_labels:
ids.append(seq_id)
sequences.append(current_sequence)
labels.append(''.join(current_labels))
seq_id = line[1:] # Capture the ID without the '>'
current_sequence = ""
current_labels = []
is_label = False
elif re.match('^[A-Z]+$', line): # Sequence lines
current_sequence += line
else: # Label lines
current_labels.append(line)
is_label = True
# Add the last sequence and labels
if current_sequence and current_labels:
sequences.append(current_sequence)
labels.append(''.join(current_labels))
ids.append(seq_id)
return ids, sequences, labels
def process_fldpnn(fldpnn_path, split="training"):
"""
Process the flDPnn training or validation dataset (controlled by the split argument)
"""
log_update(f"\nProcessing flDPnn {split} dataset")
# Parse fasta
fldpnn_ids, fldpnn_seqs, fldpnn_labels = parse_fldpnn_fasta(fldpnn_path)
# Collect cleaned labels (the parser already keeps only the first annotation line, intrinsic disorder, per sequence)
cleaned_fldpnn_labels = list(fldpnn_labels)
log_update(f"\tTotal labels: {len(cleaned_fldpnn_labels)}, total sequences: {len(fldpnn_seqs)}, total IDs: {len(fldpnn_ids)}")
fldpnn_df = pd.DataFrame({'Sequence': fldpnn_seqs,
'Label': cleaned_fldpnn_labels,
"Split": "Train" if split=="training" else "Val",
"ID": fldpnn_ids})
check_df_for_mismatched_labels(fldpnn_df)
return fldpnn_df
def combine_fldpnn_train_val(fldpnn_train_df, fldpnn_val_df):
log_update("\nJoining flDPnn train and val sets into one training set for CAID predictor")
combined = pd.concat([fldpnn_train_df,fldpnn_val_df])
# check for duplicates
duplicates = combined[combined['Sequence'].duplicated()]['Sequence'].unique().tolist()
n_rows_with_duplicates = len(combined[combined['Sequence'].isin(duplicates)])
log_update(f"\t{len(duplicates)} sequences in both train and val datasets, corresponding to {n_rows_with_duplicates} rows")
for dup in duplicates:
train_id = combined.loc[(combined['Sequence']==dup) & (combined['Split']=='Train')]['ID'].reset_index(drop=True).iloc[0]
val_id = combined.loc[(combined['Sequence']==dup) & (combined['Split']=='Val')]['ID'].reset_index(drop=True).iloc[0]
train_label = combined.loc[(combined['Sequence']==dup) & (combined['Split']=='Train')]['Label'].reset_index(drop=True).iloc[0]
val_label = combined.loc[(combined['Sequence']==dup) & (combined['Split']=='Val')]['Label'].reset_index(drop=True).iloc[0]
log_update(f"\t\tTrain ID: {train_id}\tVal ID: {val_id}\tSame labels: {train_label==val_label}\tSequence: {dup}")
# if the labels are not equal, get rid of it completely. Otherwise just get rid of the val case
if not(train_label==val_label):
log_update(f"\t\t\tSince labels are not equal, removing sequence completely")
combined = combined[combined['Sequence']!=dup].reset_index(drop=True)
else:
log_update(f"\t\t\tSince labels are equal, removing validation copy")
combined = combined.loc[(combined['Sequence']!=dup) |
((combined['Sequence']==dup) & (combined['Split']=='Train'))]
# Report the size of the joined data after duplicate handling
log_update(f"\tLength of joined flDPnn data: {len(combined)}")
return combined
def process_idp_crf_train(idp_crf_train_path):
"""
Process IDP-CRF_Training_Dataset
Args:
idp_crf_train_path: path to IDP-CRF_Training_Dataset.txt
"""
log_update("\nProcessing IDP-CRF training dataset")
# Parse the fasta, get sequences and labels
idp_crf_ids, idp_crf_seqs, idp_crf_labels = parse_idp_crf_fasta(idp_crf_train_path)
log_update(f"\tTotal labels: {len(idp_crf_labels)}, total sequences: {len(idp_crf_seqs)}, total IDs: {len(idp_crf_ids)}")
# Clean the labels
cleaned_idp_ids, cleaned_idp_seqs, cleaned_idp_labels = [], [], []
counter = 0
log_update("\tCleaning labels and counting length-mismatched examples...")
for i, label in enumerate(idp_crf_labels):
# If length of sequence and labels doesn't match, log it
if len(idp_crf_seqs[i]) != len(idp_crf_labels[i]):
log_update(f"\t\tLength mismatch at index {i}: sequence length = {len(idp_crf_seqs[i])}, label length = {len(idp_crf_labels[i])}")
counter += 1
# Else, "clean" the labels by mapping them to ints and converting them to a list
else:
cleaned_idp_ids.append(idp_crf_ids[i])
cleaned_idp_labels.append(label)
cleaned_idp_seqs.append(idp_crf_seqs[i])
log_update(f"\t\tMismatched lengths/labels: {counter}")
# Confirm that final database has no mismatched labels
idp_crf_df = pd.DataFrame({'Sequence': cleaned_idp_seqs,
'Label': cleaned_idp_labels,
"Split": "Train",
"ID": cleaned_idp_ids})
check_df_for_mismatched_labels(idp_crf_df)
return idp_crf_df
def find_agreeing_labels(row, lab1="", lab2=""):
"""
If only one label is present, return it. If both are present and agree, return the shared label; if they disagree, return np.nan
"""
val1 = row[lab1]
val2 = row[lab2]
# If one of them is NaN, there is no conflict, so return the other value
if type(val1)==float and np.isnan(val1):
return val2
elif type(val2)==float and np.isnan(val2):
return val1
else:
if val1==val2:
return val1
else:
return np.nan
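# Illustrative behavior of find_agreeing_labels on rows with the label columns used in
# make_train_df (the label strings are made up):
#   {"IDP-CRF Label": "0011", "flDPnn Label": "0011"} -> "0011"
#   {"IDP-CRF Label": "0011", "flDPnn Label": np.nan} -> "0011"
#   {"IDP-CRF Label": "0011", "flDPnn Label": "0001"} -> np.nan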
def get_unique_ids(row):
source_to_id = {
"IDP-CRF": row["IDP-CRF ID"],
"flDPnn": row["flDPnn ID"],
"CAID-2_Disorder_NOX": row["CAID-2_Disorder_NOX ID"]
}
all_sources = row["Source"].split(",")
all_ids = []
# they are already in the desired order so just iterate through them
for source in all_sources:
candidate_id = source_to_id[source]
if not(candidate_id in all_ids):
all_ids.append(candidate_id)
return ",".join(all_ids)
def parse_caid2_results(processed_caid2_df,lines):
# iterate through the lines
all_caid2_disorder_nox_ids = processed_caid2_df['ID'].tolist()
all_caid2_disorder_nox_sequences = processed_caid2_df['Sequence'].tolist()
cur_id = None
results = {
}
for i, line in enumerate(lines):
# If line starts with >, that means we have a new ID
if line[0]==">":
# If we are currently on a different cur_id, finish that one out
if not(cur_id is None):
results[cur_id]['prob_1'] = ",".join(results[cur_id]['prob_1'])
results[cur_id]['pred_labels'] = ",".join(results[cur_id]['pred_labels'])
sequence = results[cur_id]['sequence']
# get the true labels from the CAID2 dataset - IF POSSIBLE
if (cur_id not in all_caid2_disorder_nox_ids) and (sequence not in all_caid2_disorder_nox_sequences):
results[cur_id]['labels'] = np.nan
else:
true_labels = processed_caid2_df.loc[
processed_caid2_df['ID']==cur_id,'Label'
].item()
true_labels = ",".join(list(true_labels))
results[cur_id]['labels'] = true_labels
# Now process the new one
cur_id = line[1::].strip('\t').strip('\n')
results[cur_id] = {
'sequence': '',
'prob_1': [],
'pred_labels': []
}
# Otherwise, the line is a per-residue data row for the current ID
else:
# if we have a cur id to process, process it
if not(cur_id is None):
# Extract the information - not every .caid file has predicted labels!
lsplit = line.strip('\n').split('\t')
label=''
idx, aa, prob = lsplit[0], lsplit[1], lsplit[2]
if len(lsplit)==4: label=lsplit[3]
# Add to dict
results[cur_id]['sequence']+=aa
results[cur_id]['prob_1'].append(prob)
results[cur_id]['pred_labels'].append(label)
# if we're on the last line, combine
if i==len(lines)-1:
results[cur_id]['prob_1'] = ",".join(results[cur_id]['prob_1'])
results[cur_id]['pred_labels'] = ",".join(results[cur_id]['pred_labels'])
sequence = results[cur_id]['sequence']
# get the true labels from the CAID2 dataset - IF POSSIBLE
if (cur_id not in all_caid2_disorder_nox_ids) and (sequence not in all_caid2_disorder_nox_sequences):
results[cur_id]['labels'] = np.nan
else:
true_labels = processed_caid2_df.loc[
processed_caid2_df['ID']==cur_id,'Label'
].item()
true_labels = ",".join(list(true_labels))
results[cur_id]['labels'] = true_labels
df = pd.DataFrame.from_dict(results,orient='index').reset_index().rename(columns={'index':'seq_id'})
df = df.loc[df['labels'].notna()].reset_index(drop=True)
# drop pred_labels if it's empty
if set(','.join(df['pred_labels'].tolist()))=={','}:
df = df.drop(columns=['pred_labels'])
log_update(f"\t\tno predicted labels provided for this dataset; only probabilities")
log_update(f"\t\t{len(df)}/{len(all_caid2_disorder_nox_sequences)} total CAID2-Nox sequences")
return df
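# Illustrative use of parse_caid2_results; the file name is an assumption, but this mirrors
# how parse_all_caid2_results (below) iterates over raw_data/caid2_competition_results:
#   with open("raw_data/caid2_competition_results/ExamplePredictor.caid", "r") as f:
#       results_df = parse_caid2_results(processed_caid2_df, f.readlines())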
def parse_all_caid2_results(processed_caid2_df, caid_raw_folder="raw_data/caid2_competition_results"):
save_dir ="processed_data/caid2_competition_results"
os.makedirs(save_dir,exist_ok=True)
log_update(f"\nExtracting all CAID-2_Disorder_NOX results from CAID2 competition results files...")
all_caid_files = os.listdir(caid_raw_folder)
for caid_file in all_caid_files:
# Read the raw .caid results file
with open(f"{caid_raw_folder}/{caid_file}", "r") as f:
lines = f.readlines()
log_update(f"\t{caid_file}:")
results_df = parse_caid2_results(processed_caid2_df,lines)
# save it
competitor_name = caid_file.split('.caid')[0]
results_df.to_csv(f"{save_dir}/{competitor_name}_CAID-2_Disorder_NOX.csv",index=False)
def make_train_df(fldpnn_df, idp_crf_df):
"""
Make the training dataframe by merging the two processed training sets on sequence.
"""
# Rename the label/ID columns so we can track where each sequence came from after merging
idp_crf_df = idp_crf_df.rename(columns={'Label':'IDP-CRF Label', 'ID': 'IDP-CRF ID'}).drop(columns=['Split'])
fldpnn_df = fldpnn_df.rename(columns={'Label':'flDPnn Label', 'ID': 'flDPnn ID'}).drop(columns=['Split'])
########### Combine fldpnn and idp crf
# Join
log_update("\nJoining flDPnn and IDP-CRF data by sequence make unified training set")
train_df = pd.merge(idp_crf_df,
fldpnn_df,
on='Sequence',
how='outer',
indicator=True)
train_df["Split"] = ["Train"]*len(train_df)
# Map _merge column to desired labels
train_df['Source'] = train_df['_merge'].map({
'left_only': 'IDP-CRF',
'right_only': 'flDPnn',
'both': 'IDP-CRF,flDPnn'
})
train_df = train_df.drop(columns=["_merge"])
log_update(f"\tIDP-CRF dataset size: {len(idp_crf_df)}\n\tfLDpnn dataset size: {len(fldpnn_df)}\n\tinitial train dataset size: {len(train_df)}")
# Check for duplicate sequences
log_update(f"\tChecking for sequences in both datasets...")
duplicates = train_df[train_df["Source"].str.contains(",")]['Sequence'].unique().tolist()
n_rows_with_duplicates = len(train_df[train_df['Sequence'].isin(duplicates)])
log_update(f"\t\t{len(duplicates)} sequences in both datasets, corresponding to {n_rows_with_duplicates} rows")
# Check for consistency between IDP-CRF Label and flDPnn label
train_df["Label"] = train_df.apply(lambda row: find_agreeing_labels(row,lab1="IDP-CRF Label",lab2="flDPnn Label"),axis=1)
train_df["No Label Conflicts"]= ~train_df["Label"].isna()
log_update(f"\tChecked for label inconsistencies between IDP-CRF and flDPnn on the same sequence:")
match_str = train_df['No Label Conflicts'].value_counts().reset_index().rename(columns={'index': 'No Label Conflicts','No Label Conflicts': 'count'}).to_string(index=False)
match_str = "\t\t" + match_str.replace("\n","\n\t\t")
log_update(match_str)
# Dropping rows where labels don't match
#train_df[train_df['No Label Conflicts']==False][['Sequence','Split','IDP-CRF ID','flDPnn ID','IDP-CRF Label','flDPnn Label','No Label Conflicts']].to_csv('mismatch.csv',index=False)
# Drop row with known conflict with disprot
conflict_seq="MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKRMRRRIESEDEEDSSQDALVPRTPSPRPSTSAADLAIAPKKKKKRPSPKPERPPSPEVIVDSEEEREDVALQMVGFSNPPVLIKHGKGGKRTVRRLNEDDPVARGMRTQEEEEEPSEAESEITVMNPLSVPIVSAWEKGMEAARALMDKYHVDNDLKANFKLLPDQVEALAAVCKTWLNEEHRGLQLTFTSKKTFVTMMGRFLQAYLQSFAEVTYKHHEPTGCALWLHRCAEIEGELKCLHGSIMINKEHVIEMDVTSENGQRALKEQSSKAKIVKNRWGRNVVQISNTDARCCVHDAACPANQFSGKSCGMFFSEGAKAQVAFKQIKAFMQALYPNAQTGHGHLLMPLRCECNSKPGHAPFLGRQLPKLTPFALSNAEDLDADLISDKSVLASVHHPALIVFQCCNPVYRNSRAQGGGPNCDFKISAPDLLNALVMVRSLWSENFTELPRMVVPEFKWSTKHQYRNVSLPVAHSDARQNPFDF"
train_df = train_df.loc[train_df['Sequence']!=conflict_seq].reset_index(drop=True)
log_update(f"\tDropping rows with label mismatch or known error (total={len(train_df[train_df['No Label Conflicts']==False])+1})")
train_df = train_df.loc[train_df['No Label Conflicts']].reset_index(drop=True)
# Make a new label column
train_df = train_df.drop(columns=["IDP-CRF Label","flDPnn Label"])
log_update(f"\t\tNew dataset size: {len(train_df)}")
######## Final checks
# Check for any invalid sequences or invalid characters
cols_of_interest = ['Sequence','Split','Label','IDP-CRF ID','flDPnn ID']
listlike_dict = check_columns_for_listlike(train_df, cols_of_interest, DELIMITERS)
# Check for invalid characters
train_df['invalid_chars'] = train_df['Sequence'].apply(lambda x: find_invalid_chars(x, VALID_AAS))
train_df[train_df['invalid_chars'].str.len()>0].sort_values(by='Sequence')
all_invalid_chars = set().union(*train_df['invalid_chars'])
log_update(f"\tchecking for invalid characters...\n\t\tset of all invalid characters discovered within train_df: {all_invalid_chars}")
# Dropping rows with invalid characters (should be none)
log_update(f"\tDropping rows with invalid characters (total={len(train_df[train_df['invalid_chars'].str.len()>0])})")
train_df = train_df.loc[train_df['invalid_chars'].str.len()==0].reset_index(drop=True)
train_df = train_df.drop(columns=['invalid_chars'])
log_update(f"\t\tNew dataset size: {len(train_df)}")
source_str = train_df['Source'].value_counts().reset_index().rename(columns={'index': 'Source','Source': 'count'}).to_string(index=False)
source_str = "\t\t" + source_str.replace("\n","\n\t\t")
log_update(f"\tSources:\n{source_str}")
return train_df
def make_train_and_test_df(train_df, test_df):
"""
Combine the training and testing dataframe into one
"""
log_update("\nMaking final dataframe with train and test splits")
# Concatenate proposed train and test
test_df["Source"] = ["CAID-2_Disorder_NOX"]*len(test_df)
splits_df = pd.concat([train_df.drop(columns=['No Label Conflicts']),
test_df.rename(columns={'ID':'CAID-2_Disorder_NOX ID', 'Label': 'CAID-2_Disorder_NOX Label'})])
split_str = splits_df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'}).to_string(index=False)
split_str = "\t\t" + split_str.replace("\n","\n\t\t")
log_update(f"\tTrain dataset size: {len(train_df)}\n\tTest dataset size: {len(test_df)}\n\tinitial combined dataset size: {len(splits_df)}")
# Check for duplicates - if we find any, REMOVE them from train and keep them in test
duplicates = splits_df[splits_df.duplicated('Sequence')]['Sequence'].unique().tolist()
n_rows_with_duplicates = len(splits_df[splits_df['Sequence'].isin(duplicates)])
log_update(f"\t\t{len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows")
for i, dup in enumerate(duplicates):
fldpnn_id = splits_df.loc[(splits_df['Sequence']==dup)&(splits_df['Split']=='Train')]['flDPnn ID'].item()
idp_crf_id = splits_df.loc[(splits_df['Sequence']==dup)&(splits_df['Split']=='Train')]['IDP-CRF ID'].item()
caid2_disorder_nox_id = splits_df.loc[(splits_df['Sequence']==dup)&(splits_df['Split']=='Test')]['CAID-2_Disorder_NOX ID'].item()
log_update(f"\t\t\t{i+1}: flDPnn ID: {fldpnn_id}\tIDP-CRF ID: {idp_crf_id}\tCAID-2_Disorder_NOX ID: {caid2_disorder_nox_id}\n\t\t\t\tSequence: {dup}")
# remove from train and keep in test
splits_df = splits_df.loc[
(~splits_df['Sequence'].isin(duplicates)) | # Either the sequence is NOT duplicated, or
((splits_df['Sequence'].isin(duplicates)) & (splits_df['Split']=='Test')) # Sequence is duplicated, and it's in test set
].reset_index(drop=True)
split_str = splits_df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'}).to_string(index=False)
split_str = "\t\t" + split_str.replace("\n","\n\t\t")
log_update(f"\tRemoved duplicate sequences from training split, kept in test split\n\t\tNew dataset size: {len(splits_df)}\n\n{split_str}")
# Every train row should already have a Label; only Test rows may be missing one (their labels are in the CAID-2_Disorder_NOX Label column)
assert splits_df[splits_df["Label"].isna()]["Split"].value_counts().reset_index()['index'].tolist()==['Test']
splits_df.loc[
splits_df["Split"]=="Test","Label"
] = splits_df.loc[
splits_df["Split"]=="Test","CAID-2_Disorder_NOX Label"
]
splits_df = splits_df.drop(columns=["CAID-2_Disorder_NOX Label"])
# Make sure there are no na's in label
assert len(splits_df[splits_df["Label"].isna()])==0
# Print out distribution of sources
source_str = splits_df['Source'].value_counts().reset_index().rename(columns={'index': 'Source','Source': 'count'}).to_string(index=False)
source_str = "\t\t" + source_str.replace("\n","\n\t\t")
total_sources = sum(splits_df['Source'].value_counts().reset_index()['Source'])
assert total_sources == len(splits_df)
log_update(f"\n\tSource distribution:\n{source_str}\n\n\t\t\t\t\t\tSum: {total_sources}")
# Print largest and smallest seq len in each set
longest_train = max(splits_df[splits_df['Split']=='Train']['Sequence'].apply(lambda x: len(x)).tolist())
shortest_train = min(splits_df[splits_df['Split']=='Train']['Sequence'].apply(lambda x: len(x)).tolist())
longest_test = max(splits_df[splits_df['Split']=='Test']['Sequence'].apply(lambda x: len(x)).tolist())
shortest_test = min(splits_df[splits_df['Split']=='Test']['Sequence'].apply(lambda x: len(x)).tolist())
log_update(f"\n\tLength distributions...\n\t\tTrain: max={longest_train}\tmin={shortest_train}\n\t\tTest: max={longest_test}\tmin={shortest_test}")
# Consolidate the IDs a bit
splits_df["IDs"] = splits_df.apply(lambda row: get_unique_ids(row),axis=1)
assert len(splits_df[splits_df["IDs"].isna()])==0
n_different_ids = len(splits_df.loc[splits_df["IDs"].str.contains(",")])
log_update(f"\n\tProvided comma-separated IDs in same listed order as Source\n\t\t- train: IDP-CRF first, flDPnn second ({n_different_ids} seqs have multiple distinct IDs)\n\t\t- test: CAID-2_Disorder_NOX")
# Keep only desired columns
splits_df = splits_df[[
'Sequence','IDs','Split','Source','Label'
]]
return splits_df
def main():
with open_logfile("data_cleaning_log.txt"):
rawdata_train_test_path = "raw_data/caid2_train_and_test_data"
# make directory to save processed data
processeddata_path = "processed_data"
splits_path = "splits"
os.makedirs(processeddata_path,exist_ok=True)
os.makedirs(splits_path,exist_ok=True)
# Process CAID-2_Disorder_NOX_Testing_Sequences dataset from fasta file
caid_path = f"{rawdata_train_test_path}/CAID-2_Disorder_NOX_Testing_Sequences.fasta"
caid_df = process_caid2_disorder_nox_test(caid_path)
caid_df.to_csv(f"{processeddata_path}/CAID-2_Disorder_NOX_Processed.csv", index=False)
# Process fldpnn Training and Validation Datasets
fldpnn_train_path = f"{rawdata_train_test_path}/flDPnn_Training_Dataset.txt"
fldpnn_val_path = f"{rawdata_train_test_path}/flDPnn_Validation_Annotation.txt"
fldpnn_train_df = process_fldpnn(fldpnn_train_path, split="training")
fldpnn_val_df = process_fldpnn(fldpnn_val_path, split="validation")
fldpnn_train_df.to_csv(f"{processeddata_path}/flDPnn_Training_Dataset.csv", index=False)
fldpnn_val_df.to_csv(f"{processeddata_path}/flDPnn_Validation_Dataset.csv", index=False)
# Combine train and val
fldpnn_df = combine_fldpnn_train_val(fldpnn_train_df, fldpnn_val_df)
# Process IDP-CRF_Training_Dataset
idp_crf_train_path = f"{rawdata_train_test_path}/IDP-CRF_Training_Dataset.txt"
idp_crf_df= process_idp_crf_train(idp_crf_train_path)
idp_crf_df.to_csv(f"{processeddata_path}/IDP-CRF_Training_Dataset.csv", index=False)
# Merge
train_df = make_train_df(fldpnn_df, idp_crf_df)
# Make a full splits file
splits_df = make_train_and_test_df(train_df, caid_df)
final_train_df = splits_df.loc[splits_df['Split']=='Train'].reset_index(drop=True)
final_test_df = splits_df.loc[splits_df['Split']=='Test'].reset_index(drop=True)
# Save final files
final_train_df.to_csv(f"{splits_path}/train_df.csv", index=False)
final_test_df.to_csv(f"{splits_path}/test_df.csv", index=False)
# Process the caid competition results and save them in a more accessible format
processed_caid2_df = pd.read_csv(f"{processeddata_path}/CAID-2_Disorder_NOX_Processed.csv")
parse_all_caid2_results(processed_caid2_df)
# Process data for visualizing fusion oncoproteins
# Scrape FusionPDB
scrape_fusionpdb_level_2_3()
# Process the structures that we downloaded from scraping
process_fusions_and_hts()
# Now, figure out which structures are in the test set and isolate those for benchmarking in splits/fusion_bench_df.csv
fusion_test_set = pd.read_csv("../../data/splits/test_df.csv")
# columns are sequence, member length, snp_probabilities
fusion_test_set = set(fusion_test_set['sequence'].tolist())
log_update(f"\nFinding level 2 and 3 fusion structures that are in the FusOn-pLM test set...\n\tTest set size: {len(fusion_test_set)} seqs")
level_2_3_info = pd.read_csv('processed_data/fusionpdb/FusionPDB_level2-3_cleaned_structure_info.csv')
# there are duplicate sequences in here, so keep only the first occurrence of each
level_2_3_seqs = level_2_3_info.drop_duplicates('Fusion_Seq').reset_index(drop=True)
level_2_3_seqs = set(level_2_3_seqs.loc[
level_2_3_info['Fusion_pLDDT'].notna() # make sure we've got a structure
]['Fusion_Seq'].tolist())
# if it has a structure, it's in the test set, and it's not in the caid train set, we can benchmark with it
test_benchmark_seqs = fusion_test_set.intersection(level_2_3_seqs)
log_update(f"\tTotal fusion proteins in the FusOn-pLM test set: {len(test_benchmark_seqs)}")
caid_train_set = set(pd.read_csv('splits/train_df.csv')['Sequence'].tolist())
test_benchmark_seqs = test_benchmark_seqs.difference(caid_train_set) # subtract off the caid train set to be sure
log_update(f"\tTotal fusion proteins in the FusOn-pLM test set and NOT in the CAID train set: {len(test_benchmark_seqs)}")
# Finally, make a dataframe structured like train_df and test_df. Columns are: Sequence,IDs,Split,Source,Label
# Let's make the IDs FusionGID
test_benchmark_df = pd.DataFrame(
data = {'Sequence': list(test_benchmark_seqs)}
)
seq_id_dict = dict(zip(level_2_3_info['Fusion_Seq'],level_2_3_info['FusionGID']))
seq_plddts_dict = dict(zip(level_2_3_info['Fusion_Seq'],level_2_3_info['Fusion_AA_pLDDTs']))
test_benchmark_df['IDs'] = test_benchmark_df['Sequence'].map(seq_id_dict)
test_benchmark_df['Split'] = ['Fusion_Benchmark']*len(test_benchmark_df)
test_benchmark_df['Source'] = ['FusionPDB_AlphaFold2']*len(test_benchmark_df)
test_benchmark_df['Label'] = test_benchmark_df['Sequence'].map(seq_plddts_dict)
# convert label to 1 or 0
test_benchmark_df['Label'] = test_benchmark_df['Label'].apply(lambda x: x.split(","))
test_benchmark_df['Label'] = test_benchmark_df['Label'].apply(lambda x: [float(y) for y in x]) # make it a float list of pLDDTs
test_benchmark_df['Label'] = test_benchmark_df['Label'].apply(lambda x: ['1' if y < 68.8 else '0' for y in x]) # disordered if pLDDT < 68.8, according to the published AlphaFold pLDDT disorder threshold
test_benchmark_df['Label'] = test_benchmark_df['Label'].apply(lambda x: ''.join(x)) # change ['1','1','0'] to '110'
# check lengths
test_benchmark_df['SeqLen'] = test_benchmark_df['Sequence'].apply(lambda x: len(x))
test_benchmark_df['LabelLen'] = test_benchmark_df['Label'].apply(lambda x: len(x))
log_update(f"\tAll seq lengths and label lengths match: {(test_benchmark_df['SeqLen']==test_benchmark_df['LabelLen']).all()}")
test_benchmark_df = test_benchmark_df.drop(columns=['SeqLen','LabelLen'])
# Build a truncated preview string of the first 10 rows
test_benchmark_df_str = test_benchmark_df.head(10).copy()
test_benchmark_df_str['Sequence'] = test_benchmark_df_str['Sequence'].apply(lambda x: x[0:10]+'...')
test_benchmark_df_str['Label'] = test_benchmark_df_str['Label'].apply(lambda x: x[0:10]+'...')
test_benchmark_df_str = test_benchmark_df_str.to_string(index=False)
test_benchmark_df_str = "\t" + test_benchmark_df_str.replace("\n","\n\t")
log_update(f"\nPreview of benchmarking set:\n{test_benchmark_df_str}")
test_benchmark_df.to_csv('splits/fusion_bench_df.csv',index=False)
# Add the benchmarking sequences to the combined splits file
log_update(f"\nAdding benchmarking sequences to splits.csv:\n\tLength before adding bench seqs: {len(splits_df)}")
splits_df = pd.concat([splits_df,test_benchmark_df])
log_update(f"\tLength after adding bench seqs: {len(splits_df)}")
split_str = splits_df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'}).to_string(index=False)
split_str = "\t" + split_str.replace("\n","\n\t")
log_update(f"Distribution among splits:\n{split_str}")
splits_df.to_csv(f"{splits_path}/splits.csv",index=False)
if __name__ == "__main__":
main()