import os |
import numpy as np |
import re |
import pandas as pd |
import requests |
from fuson_plm.utils.logging import open_logfile, log_update |
from fuson_plm.utils.constants import DELIMITERS, VALID_AAS |
from fuson_plm.utils.data_cleaning import check_columns_for_listlike, find_invalid_chars |
from fuson_plm.benchmarking.caid.scrape_fusionpdb import scrape_fusionpdb_level_2_3 |
from fuson_plm.benchmarking.caid.process_fusion_structures import process_fusions_and_hts |
def download_fasta(uniprotid, includeIsoform, output_file):
    """
    Download the FASTA record(s) for one UniProt accession and append them to a file.

    Args:
        uniprotid: accession inserted into the ``accession:`` query of the UniProt REST URL.
        includeIsoform: value forwarded verbatim as the ``includeIsoform`` URL parameter.
        output_file: path of the file the response text is appended to (opened in ``a+`` mode).

    Request failures (HTTP errors, connection errors, timeouts) are logged and swallowed;
    nothing is written to ``output_file`` in that case.
    """
    url = f"https://rest.uniprot.org/uniprotkb/search?format=fasta&includeIsoform={includeIsoform}&query=accession%3A{uniprotid}&size=500&sort=accession+asc"
    try:
        # Without an explicit timeout, requests waits indefinitely on a stalled connection.
        # A timeout raises requests.exceptions.Timeout, which is a RequestException.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        log_update(f"An error occurred: {e}")
        return

    with open(output_file, 'a+') as file:
        file.write(response.text)
    log_update(f"FASTA file for {uniprotid} successfully downloaded and added to '{output_file}'")
def parse_caid_txt(fast_file):
    '''
    Parse a FASTA-like text file whose records have the layout:
        Line 1: >ID
        Line 2: Sequence (may span several lines)
        Line 3: Label (a line made only of the characters 0, 1 and -)

    Blank lines are ignored. Only the first label line of a record is used; anything
    after it, up to the next header, is skipped.

    Returns:
        dict mapping sequence -> (label, seq_id). Because the dict is keyed by
        sequence, records with identical sequences collapse to the last one read.
        Records with no sequence or no label line are left out.
    '''
    seq_to_label = {}
    seq_id = None
    label = None
    sequence = ""
    reading_sequence = False

    with open(fast_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # An empty line would otherwise pass the "only 0/1/-" test below,
                # be taken as an empty label and end sequence reading early.
                continue
            if line.startswith(">"):
                # A new header closes the previous record, if it was complete.
                if label is not None and sequence:
                    seq_to_label[sequence] = (label, seq_id)
                seq_id = line[1:]
                label = None
                sequence = ""
                reading_sequence = True
            elif reading_sequence:
                if all(c in "01-" for c in line):
                    label = line
                    reading_sequence = False
                else:
                    sequence += line

    # The last record has no following header to flush it.
    if label is not None and sequence:
        seq_to_label[sequence] = (label, seq_id)
    return seq_to_label
def check_df_for_mismatched_labels(sd):
    """
    Log every row of `sd` whose 'Sequence' and 'Label' differ in length, then log the total.

    Nothing is returned and `sd` is not modified; this is a reporting step only.
    """
    log_update("\tChecking dataframe for mismatched sequences and labels...")
    n_mismatched = 0
    for idx, seq, label in zip(sd.index, sd['Sequence'], sd['Label']):
        if len(seq) == len(label):
            continue
        n_mismatched += 1
        log_update(f"\t\tLength mismatch at index {idx}: sequence length = {len(seq)}, label length = {len(label)}")
    log_update(f"\t\tTotal mismatched lengths/labels: {n_mismatched}")
def process_caid2_disorder_nox_test(caid_path):
    """
    Build the test dataframe from the CAID-2_Disorder_NOX_Testing_Sequences.fasta file.

    Returns a dataframe with columns ID, Sequence, Label and Split (always 'Test').
    Length mismatches between sequence and label are logged, not removed.
    """
    log_update("Processing CAID-2-Disorder-NOX Testing Dataset")

    # parse_caid_txt keys its result by sequence; re-key it by sequence ID.
    parsed = parse_caid_txt(caid_path)
    by_id = {seq_id: (sequence, label) for sequence, (label, seq_id) in parsed.items()}
    log_update(f"\tTotal sequences: {len(by_id)}")

    ids = list(by_id)
    sequences = [pair[0] for pair in by_id.values()]
    labels = [pair[1] for pair in by_id.values()]
    caid_df = pd.DataFrame({
        'ID': ids,
        'Sequence': sequences,
        'Label': labels,
        'Split': 'Test'
    })

    check_df_for_mismatched_labels(caid_df)
    return caid_df
def parse_fldpnn_fasta(file_path):
    """
    Parse flDPnn_Training_Dataset.txt. Each record has a header, a sequence and five
    annotation lines, of which only the first is kept:
        >Disprot ID
        Amino acid sequence
        Experimental annotation for intrinsic disorder          <- kept as the label
        Experimental annotation for disordered protein binding
        Experimental annotation for disordered DNA binding
        Experimental annotation for disordered RNA binding
        Experimental annotation for disordered flexible linkers

    A line of uppercase letters only is treated as sequence (multi-line sequences are
    concatenated); any other non-header line counts as an annotation line. Blank lines
    are ignored.

    Returns:
        (ids, sequences, labels): three parallel lists. Records with no sequence or no
        annotation line are left out.

    Raises:
        AssertionError: if a record that is followed by another header does not have
            exactly five annotation lines.
    """
    ids, sequences, labels = [], [], []
    seq_id = ""
    current_sequence = ""
    current_label = None  # first annotation line of the record being read
    seen_label_lines = 0

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line is not an annotation; counting it would break the
                # five-line check or store an empty label.
                continue
            if line.startswith('>'):
                if current_sequence and current_label is not None:
                    assert seen_label_lines == 5, (
                        f"record {seq_id!r} has {seen_label_lines} annotation lines, expected 5"
                    )
                    ids.append(seq_id)
                    sequences.append(current_sequence)
                    labels.append(current_label)
                seq_id = line[1:]
                current_sequence = ""
                current_label = None
                seen_label_lines = 0
            elif re.match('^[A-Z]+$', line):
                current_sequence += line
            else:
                seen_label_lines += 1
                if seen_label_lines == 1:
                    current_label = line

    # Flush the final record. As before, it is accepted without the five-line check.
    if current_sequence and current_label is not None:
        ids.append(seq_id)
        sequences.append(current_sequence)
        labels.append(current_label)

    return ids, sequences, labels
def parse_idp_crf_fasta(file_path):
    """
    Parse a FASTA-like file where each '>' header is followed by sequence lines
    (uppercase letters only) and label lines (every other non-header line).

    Sequence lines of one record are concatenated, and so are its label lines.

    Returns:
        (ids, sequences, labels): three parallel lists. A record is kept only if it
        has a non-empty sequence and at least one label line.
    """
    ids, sequences, labels = [], [], []

    with open(file_path, 'r') as handle:
        raw_lines = handle.readlines()
    # A sentinel header sends the final record through the same flush path as the rest.
    raw_lines.append('>')

    header, residues, label_chunks = "", "", []
    for raw in raw_lines:
        text = raw.strip()
        if text.startswith('>'):
            if residues and label_chunks:
                ids.append(header)
                sequences.append(residues)
                labels.append(''.join(label_chunks))
            header, residues, label_chunks = text[1:], "", []
        elif re.match('^[A-Z]+$', text):
            residues += text
        else:
            label_chunks.append(text)

    return ids, sequences, labels
def process_fldpnn(fldpnn_path, split="training"):
    """
    Load one flDPnn annotation file into a dataframe.

    Args:
        fldpnn_path: path to a file readable by parse_fldpnn_fasta.
        split: "training" marks the rows as 'Train'; any other value marks them as 'Val'.

    Returns a dataframe with columns Sequence, Label, Split and ID. Length mismatches
    between sequence and label are logged, not removed.
    """
    log_update(f"\nProcessing flDPnn {split} dataset")
    ids, seqs, parsed_labels = parse_fldpnn_fasta(fldpnn_path)

    # Labels are used exactly as parsed; no cleaning is applied.
    labels = list(parsed_labels)
    log_update(f"\tTotal labels: {len(labels)}, total sequences: {len(seqs)},total IDs: {len(ids)}")

    split_name = "Train" if split == "training" else "Val"
    fldpnn_df = pd.DataFrame({'Sequence': seqs,
                              'Label': labels,
                              "Split": split_name,
                              "ID": ids})
    check_df_for_mismatched_labels(fldpnn_df)
    return fldpnn_df
def combine_fldpnn_train_val(fldpnn_train_df, fldpnn_val_df):
    """
    Stack the flDPnn train and validation frames and resolve repeated sequences.

    For each repeated sequence, the first 'Train' row and the first 'Val' row are compared:
      - labels differ: every row with that sequence is removed;
      - labels agree: only the 'Train' rows with that sequence are kept.

    Returns the combined dataframe; the 'Split' values of the inputs are left as they were.
    """
    log_update("\nJoining flDPnn train and val sets into one training set for CAID predictor")
    combined = pd.concat([fldpnn_train_df, fldpnn_val_df])

    repeated = combined['Sequence'].duplicated()
    duplicates = combined.loc[repeated, 'Sequence'].unique().tolist()
    n_rows_with_duplicates = int(combined['Sequence'].isin(duplicates).sum())
    log_update(f"\t{len(duplicates)} sequences in both train and val datasets, corresponding to {n_rows_with_duplicates} rows")

    for dup in duplicates:
        is_dup = combined['Sequence'] == dup
        train_rows = combined.loc[is_dup & (combined['Split'] == 'Train')]
        val_rows = combined.loc[is_dup & (combined['Split'] == 'Val')]
        # .iloc[0] raises IndexError if the sequence is missing from either split.
        train_id, train_label = train_rows['ID'].iloc[0], train_rows['Label'].iloc[0]
        val_id, val_label = val_rows['ID'].iloc[0], val_rows['Label'].iloc[0]
        labels_agree = train_label == val_label
        log_update(f"\t\tTrain ID: {train_id}\tVal ID: {val_id}\tSame labels: {labels_agree}\tSequence: {dup}")

        if labels_agree:
            log_update(f"\t\t\tSince labels are equal, removing validation copy")
            combined = combined.loc[~is_dup | (combined['Split'] == 'Train')]
        else:
            log_update(f"\t\t\tSince labels are not equal, removing sequence completely")
            combined = combined[~is_dup].reset_index(drop=True)

    log_update(f"\tLength of joined flDPnn data: {len(combined)}")
    return combined
def process_idp_crf_train(idp_crf_train_path):
    """
    Load IDP-CRF_Training_Dataset into a dataframe, dropping length-mismatched records.

    Args:
        idp_crf_train_path: path to a file readable by parse_idp_crf_fasta.

    Returns a dataframe with columns Sequence, Label, Split (always 'Train') and ID,
    holding only the records whose sequence and label have equal length.
    """
    log_update("\nProcessing IDP-CRF training dataset")
    ids, seqs, labels = parse_idp_crf_fasta(idp_crf_train_path)
    log_update(f"\tTotal labels: {len(labels)}, total sequences: {len(seqs)}, total IDs: {len(ids)}")

    log_update("\tCleaning labels and counting length-mismatched examples...")
    kept = []
    n_mismatched = 0
    for position, (seq_id, seq, label) in enumerate(zip(ids, seqs, labels)):
        if len(seq) == len(label):
            kept.append((seq_id, seq, label))
            continue
        log_update(f"\t\tLength mismatch at index {position}: sequence length = {len(seq)}, label length = {len(label)}")
        n_mismatched += 1
    log_update(f"\t\tMismatched lengths/labels: {n_mismatched}")

    idp_crf_df = pd.DataFrame({'Sequence': [seq for _, seq, _ in kept],
                               'Label': [label for _, _, label in kept],
                               "Split": "Train",
                               "ID": [seq_id for seq_id, _, _ in kept]})
    check_df_for_mismatched_labels(idp_crf_df)
    return idp_crf_df
def find_agreeing_labels(row, lab1="", lab2=""):
    """
    Pick the label for a row that may carry labels from two sources.

    Args:
        row: mapping (e.g. a dataframe row) indexable by `lab1` and `lab2`.
        lab1, lab2: keys of the two candidate labels.

    Returns:
        - the other label if one of them is NaN (NaN if both are);
        - the shared label if both are present and equal;
        - np.nan if both are present and disagree.
    """
    def _is_missing(value):
        # isinstance rather than type(value)==float, so that numpy float NaNs
        # (np.float64 subclasses float) are also recognised as missing.
        return isinstance(value, float) and np.isnan(value)

    val1, val2 = row[lab1], row[lab2]
    if _is_missing(val1):
        return val2
    if _is_missing(val2):
        return val1
    return val1 if val1 == val2 else np.nan
def get_unique_ids(row):
    """
    Return the row's distinct IDs as a comma-separated string.

    The IDs are taken in the order the sources appear in row["Source"]
    (itself comma-separated); an ID shared by several sources is listed once.
    """
    id_for_source = {
        "IDP-CRF": row["IDP-CRF ID"],
        "flDPnn": row["flDPnn ID"],
        "CAID-2_Disorder_NOX": row["CAID-2_Disorder_NOX ID"]
    }
    # dict.fromkeys removes repeats while keeping first-seen order.
    ordered_unique = dict.fromkeys(id_for_source[name] for name in row["Source"].split(","))
    return ",".join(ordered_unique)
def parse_caid2_results(processed_caid2_df, lines):
    """
    Parse the lines of one CAID2 competitor results file into a per-sequence dataframe.

    `lines` is expected to hold '>ID' header lines, each followed by tab-separated
    per-residue rows with at least three fields (index, amino acid, probability) and
    an optional fourth field, the predicted label. Blank lines are ignored, as are
    residue rows that appear before the first header.

    Args:
        processed_caid2_df: dataframe with 'ID', 'Sequence' and 'Label' columns
            (the processed CAID-2_Disorder_NOX test set).
        lines: list of raw lines from the results file.

    Returns:
        Dataframe with columns seq_id, sequence, prob_1, pred_labels, labels. prob_1,
        pred_labels and labels are comma-separated per-residue strings. Only entries
        found in `processed_caid2_df` (by ID, otherwise by sequence) are kept, and
        'pred_labels' is dropped when the file provides no predicted labels.
    """
    # Lookup tables replace repeated list scans and the one-row .item() lookup.
    label_by_id = dict(zip(processed_caid2_df['ID'], processed_caid2_df['Label']))
    label_by_sequence = dict(zip(processed_caid2_df['Sequence'], processed_caid2_df['Label']))

    # First pass: gather the per-residue fields of every entry, keyed by its ID.
    results = {}
    cur_id = None
    for line in lines:
        if not line.strip():
            continue
        if line[0] == ">":
            cur_id = line[1::].strip('\t').strip('\n')
            results[cur_id] = {'sequence': '', 'prob_1': [], 'pred_labels': []}
        elif cur_id is not None:
            lsplit = line.strip('\n').split('\t')
            aa, prob = lsplit[1], lsplit[2]
            label = lsplit[3] if len(lsplit) == 4 else ''
            record = results[cur_id]
            record['sequence'] += aa
            record['prob_1'].append(prob)
            record['pred_labels'].append(label)

    # Second pass: finalise every entry the same way, including the last one.
    rows = []
    for seq_id, record in results.items():
        sequence = record['sequence']
        if seq_id in label_by_id:
            true_labels = ",".join(list(label_by_id[seq_id]))
        elif sequence in label_by_sequence:
            # The ID is unknown but the sequence is in the test set: look the label
            # up by sequence instead of failing on an empty ID lookup.
            true_labels = ",".join(list(label_by_sequence[sequence]))
        else:
            true_labels = np.nan
        rows.append({
            'seq_id': seq_id,
            'sequence': sequence,
            'prob_1': ",".join(record['prob_1']),
            'pred_labels': ",".join(record['pred_labels']),
            'labels': true_labels,
        })

    # Explicit columns keep the frame well-formed even when no entry was parsed.
    df = pd.DataFrame(rows, columns=['seq_id', 'sequence', 'prob_1', 'pred_labels', 'labels'])
    df = df.loc[df['labels'].notna()].reset_index(drop=True)

    # If the joined predicted labels contain nothing but separators, none were provided.
    if set(','.join(df['pred_labels'].tolist())) == {','}:
        df = df.drop(columns=['pred_labels'])
        log_update(f"\t\tno predicted labels provided for this dataset; only probabilities")

    log_update(f"\t\t{len(df)}/{len(processed_caid2_df)} total CAID2-Nox sequences")
    return df
def parse_all_caid2_results(processed_caid2_df, caid_raw_folder="raw_data/caid2_competition_results"):
    """
    Parse every file in `caid_raw_folder` with parse_caid2_results and save one CSV per file.

    Output goes to processed_data/caid2_competition_results/, named
    '<competitor>_CAID-2_Disorder_NOX.csv', where <competitor> is the part of the
    file name before '.caid'.
    """
    save_dir = "processed_data/caid2_competition_results"
    os.makedirs(save_dir, exist_ok=True)

    log_update(f"\nExtracting all CAID-2_Disorder_NOX results from CAID2 competition results files...")
    for results_name in os.listdir(caid_raw_folder):
        with open(f"{caid_raw_folder}/{results_name}", "r") as handle:
            raw_lines = handle.readlines()

        log_update(f"\t{results_name}:")
        parsed = parse_caid2_results(processed_caid2_df, raw_lines)

        competitor_name = results_name.split('.caid')[0]
        parsed.to_csv(f"{save_dir}/{competitor_name}_CAID-2_Disorder_NOX.csv", index=False)
def make_train_df(fldpnn_df, idp_crf_df, conflict_seq=None):
    """
    Make the training dataframe by merging the two processed training sets on sequence.

    Steps: outer-merge on 'Sequence', record where each row came from ('Source'),
    reconcile the two labels with find_agreeing_labels, drop rows whose labels
    conflict, then drop rows whose sequence contains invalid characters.

    Args:
        fldpnn_df: processed flDPnn dataframe (columns Sequence, Label, Split, ID).
        idp_crf_df: processed IDP-CRF dataframe (same columns).
        conflict_seq: optional sequence with a known annotation error; its rows are
            dropped before the label-conflict rows. The function body used to read a
            `conflict_seq` name that was never defined, so it is now an explicit
            argument and the step is skipped when it is None.
            NOTE(review): supply the intended sequence from the caller.

    Returns:
        Dataframe with columns Sequence, IDP-CRF ID, flDPnn ID, Split, Source, Label
        and 'No Label Conflicts' (all True after filtering).
    """
    idp_crf_df = idp_crf_df.rename(columns={'Label': 'IDP-CRF Label', 'ID': 'IDP-CRF ID'}).drop(columns=['Split'])
    fldpnn_df = fldpnn_df.rename(columns={'Label': 'flDPnn Label', 'ID': 'flDPnn ID'}).drop(columns=['Split'])

    log_update("\nJoining flDPnn and IDP-CRF data by sequence make unified training set")
    train_df = pd.merge(idp_crf_df,
                        fldpnn_df,
                        on='Sequence',
                        how='outer',
                        indicator=True)
    train_df["Split"] = "Train"
    # The merge indicator says which side(s) each sequence came from.
    train_df['Source'] = train_df['_merge'].map({
        'left_only': 'IDP-CRF',
        'right_only': 'flDPnn',
        'both': 'IDP-CRF,flDPnn'
    })
    train_df = train_df.drop(columns=["_merge"])
    log_update(f"\tIDP-CRF dataset size: {len(idp_crf_df)}\n\tfLDpnn dataset size: {len(fldpnn_df)}\n\tinitial train dataset size: {len(train_df)}")

    log_update(f"\tChecking for sequences in both datasets...")
    duplicates = train_df[train_df["Source"].str.contains(",")]['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(train_df[train_df['Sequence'].isin(duplicates)])
    log_update(f"\t\t{len(duplicates)} sequences in both datasets, corresponding to {n_rows_with_duplicates} rows")

    # A NaN label after reconciliation means the two sources disagree.
    train_df["Label"] = train_df.apply(
        lambda row: find_agreeing_labels(row, lab1="IDP-CRF Label", lab2="flDPnn Label"), axis=1
    )
    train_df["No Label Conflicts"] = train_df["Label"].notna()
    log_update(f"\tChecked for label inconsistencies between IDP-CRF and flDPnn on the same sequence:")
    # rename_axis + reset_index(name=...) gives the same two column names on pandas 1.x
    # and 2.x; the old reset_index().rename(...) pattern mislabels them on 2.x.
    match_str = (train_df['No Label Conflicts'].value_counts()
                 .rename_axis('No Label Conflicts')
                 .reset_index(name='count')
                 .to_string(index=False))
    match_str = "\t\t" + match_str.replace("\n", "\n\t\t")
    log_update(match_str)

    # Drop the known-error sequence (if one was given), then the label conflicts.
    n_known_error_rows = 0
    if conflict_seq is not None:
        is_known_error = train_df['Sequence'] == conflict_seq
        n_known_error_rows = int(is_known_error.sum())
        train_df = train_df.loc[~is_known_error].reset_index(drop=True)
    n_label_conflicts = int((~train_df['No Label Conflicts']).sum())
    log_update(f"\tDropping rows with label mismatch or known error (total={n_label_conflicts + n_known_error_rows})")
    train_df = train_df.loc[train_df['No Label Conflicts']].reset_index(drop=True)
    train_df = train_df.drop(columns=["IDP-CRF Label", "flDPnn Label"])
    log_update(f"\t\tNew dataset size: {len(train_df)}")

    # The return value was never used; the call is kept because the helper presumably
    # reports its own findings — TODO confirm.
    cols_of_interest = ['Sequence', 'Split', 'Label', 'IDP-CRF ID', 'flDPnn ID']
    check_columns_for_listlike(train_df, cols_of_interest, DELIMITERS)

    # Remove sequences containing characters outside VALID_AAS.
    train_df['invalid_chars'] = train_df['Sequence'].apply(lambda x: find_invalid_chars(x, VALID_AAS))
    has_invalid = train_df['invalid_chars'].str.len() > 0
    all_invalid_chars = set().union(*train_df['invalid_chars'])
    log_update(f"\tchecking for invalid characters...\n\t\tset of all invalid characters discovered within train_df: {all_invalid_chars}")
    log_update(f"\tDropping rows with invalid characters (total={int(has_invalid.sum())})")
    train_df = train_df.loc[~has_invalid].reset_index(drop=True)
    train_df = train_df.drop(columns=['invalid_chars'])
    log_update(f"\t\tNew dataset size: {len(train_df)}")

    source_str = (train_df['Source'].value_counts()
                  .rename_axis('Source')
                  .reset_index(name='count')
                  .to_string(index=False))
    source_str = "\t\t" + source_str.replace("\n", "\n\t\t")
    log_update(f"\tSources:\n{source_str}")

    return train_df
def make_train_and_test_df(train_df, test_df):
    """
    Combine the training and testing dataframes into one splits dataframe.

    Sequences that occur more than once in the combined frame are removed from the
    'Train' split and kept in the 'Test' split. Test rows take their label from the
    CAID-2_Disorder_NOX label column.

    Args:
        train_df: output of make_train_df (must contain 'No Label Conflicts').
        test_df: processed CAID-2_Disorder_NOX dataframe (ID, Sequence, Label, Split).
            It is not modified.

    Returns:
        Dataframe with columns Sequence, IDs, Split, Source, Label.

    Raises:
        AssertionError: if a sanity check on labels, sources or IDs fails.
        ValueError: from `.item()` if a repeated sequence does not have exactly one
            'Train' row and one 'Test' row.
    """
    log_update("\nMaking final dataframe with train and test splits")

    # assign() returns a new frame, so the caller's test_df does not gain a column.
    test_df = test_df.assign(Source="CAID-2_Disorder_NOX")
    splits_df = pd.concat([
        train_df.drop(columns=['No Label Conflicts']),
        test_df.rename(columns={'ID': 'CAID-2_Disorder_NOX ID', 'Label': 'CAID-2_Disorder_NOX Label'})
    ])
    log_update(f"\tTrain dataset size: {len(train_df)}\n\tTest dataset size: {len(test_df)}\n\tinitial combined dataset size: {len(splits_df)}")

    # Report every repeated sequence, then drop its training copy.
    duplicates = splits_df[splits_df.duplicated('Sequence')]['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(splits_df[splits_df['Sequence'].isin(duplicates)])
    log_update(f"\t\t{len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows")
    in_train = splits_df['Split'] == 'Train'
    in_test = splits_df['Split'] == 'Test'
    for i, dup in enumerate(duplicates):
        is_dup = splits_df['Sequence'] == dup
        train_rows = splits_df.loc[is_dup & in_train]
        test_rows = splits_df.loc[is_dup & in_test]
        fldpnn_id = train_rows['flDPnn ID'].item()
        idp_crf_id = train_rows['IDP-CRF ID'].item()
        caid2_disorder_nox_id = test_rows['CAID-2_Disorder_NOX ID'].item()
        log_update(f"\t\t\t{i+1}: flDPnn ID: {fldpnn_id}\tIDP-CRF ID: {idp_crf_id}\tCAID-2_Disorder_NOX ID: {caid2_disorder_nox_id}\n\t\t\t\tSequence: {dup}")

    is_duplicate = splits_df['Sequence'].isin(duplicates)
    splits_df = splits_df.loc[~is_duplicate | (splits_df['Split'] == 'Test')].reset_index(drop=True)

    # rename_axis + reset_index(name=...) labels the table the same way on pandas 1.x and 2.x.
    split_str = (splits_df['Split'].value_counts()
                 .rename_axis('Split')
                 .reset_index(name='count')
                 .to_string(index=False))
    split_str = "\t\t" + split_str.replace("\n", "\n\t\t")
    log_update(f"\tRemoved duplicate sequences from training split, kept in test split\n\t\tNew dataset size: {len(splits_df)}\n\n{split_str}")

    # At this point only test rows may lack a label; they get theirs from the CAID column.
    splits_missing_label = splits_df.loc[splits_df["Label"].isna(), "Split"].unique().tolist()
    assert splits_missing_label == ['Test']
    is_test = splits_df["Split"] == "Test"
    splits_df.loc[is_test, "Label"] = splits_df.loc[is_test, "CAID-2_Disorder_NOX Label"]
    splits_df = splits_df.drop(columns=["CAID-2_Disorder_NOX Label"])
    assert len(splits_df[splits_df["Label"].isna()]) == 0

    source_counts = splits_df['Source'].value_counts()
    source_str = (source_counts
                  .rename_axis('Source')
                  .reset_index(name='count')
                  .to_string(index=False))
    source_str = "\t\t" + source_str.replace("\n", "\n\t\t")
    # Sum the counts directly; the column holding them is named differently across pandas versions.
    total_sources = int(source_counts.sum())
    assert total_sources == len(splits_df)
    log_update(f"\n\tSource distribution:\n{source_str}\n\n\t\t\t\t\t\tSum: {total_sources}")

    train_lengths = splits_df.loc[splits_df['Split'] == 'Train', 'Sequence'].str.len()
    test_lengths = splits_df.loc[splits_df['Split'] == 'Test', 'Sequence'].str.len()
    longest_train, shortest_train = train_lengths.max(), train_lengths.min()
    longest_test, shortest_test = test_lengths.max(), test_lengths.min()
    log_update(f"\n\tLength distributions...\n\t\tTrain: max={longest_train}\tmin={shortest_train}\n\t\tTest: max={longest_test}\tmin={shortest_test}")

    # Collapse the per-source ID columns into one comma-separated column.
    splits_df["IDs"] = splits_df.apply(get_unique_ids, axis=1)
    assert len(splits_df[splits_df["IDs"].isna()]) == 0
    n_different_ids = len(splits_df.loc[splits_df["IDs"].str.contains(",")])
    log_update(f"\n\tProvided comma-separated IDs in same listed order as Source\n\t\t- train: IDP-CRF first, flDPnn second ({n_different_ids} seqs have multiple distinct IDs)\n\t\t- test: CAID-2_Disorder_NOX")

    splits_df = splits_df[[
        'Sequence', 'IDs', 'Split', 'Source', 'Label'
    ]]
    return splits_df
def _plddts_to_disorder_label(plddt_string, threshold=68.8):
    """Convert a comma-separated string of per-residue pLDDTs into a 0/1 string ('1' where pLDDT < threshold)."""
    return ''.join('1' if float(value) < threshold else '0' for value in plddt_string.split(","))


def main():
    """
    Run the whole cleaning pipeline, logging to data_cleaning_log.txt:

      1. process the CAID-2_Disorder_NOX test set and the flDPnn / IDP-CRF training sets;
      2. build the train/test splits and write them under splits/;
      3. extract per-competitor CAID2 results for the test sequences;
      4. scrape and process FusionPDB structures, then build a fusion benchmark set from
         sequences that are in the FusOn-pLM test set, have a fusion pLDDT, and are not
         in the CAID training split;
      5. write the benchmark set and the combined splits file.

    All paths are relative to the current working directory.
    """
    with open_logfile("data_cleaning_log.txt"):
        rawdata_train_test_path = "raw_data/caid2_train_and_test_data"
        processeddata_path = "processed_data"
        splits_path = "splits"
        os.makedirs(processeddata_path, exist_ok=True)
        os.makedirs(splits_path, exist_ok=True)

        # --- CAID-2 test set ---
        caid_path = f"{rawdata_train_test_path}/CAID-2_Disorder_NOX_Testing_Sequences.fasta"
        caid_df = process_caid2_disorder_nox_test(caid_path)
        caid_df.to_csv(f"{processeddata_path}/CAID-2_Disorder_NOX_Processed.csv", index=False)

        # --- flDPnn train + validation, merged into one training source ---
        fldpnn_train_path = f"{rawdata_train_test_path}/flDPnn_Training_Dataset.txt"
        fldpnn_val_path = f"{rawdata_train_test_path}/flDPnn_Validation_Annotation.txt"
        fldpnn_train_df = process_fldpnn(fldpnn_train_path, split="training")
        fldpnn_val_df = process_fldpnn(fldpnn_val_path, split="validation")
        fldpnn_train_df.to_csv(f"{processeddata_path}/flDPnn_Training_Dataset.csv", index=False)
        fldpnn_val_df.to_csv(f"{processeddata_path}/flDPnn_Validation_Dataset.csv", index=False)
        fldpnn_df = combine_fldpnn_train_val(fldpnn_train_df, fldpnn_val_df)

        # --- IDP-CRF training set ---
        idp_crf_train_path = f"{rawdata_train_test_path}/IDP-CRF_Training_Dataset.txt"
        idp_crf_df = process_idp_crf_train(idp_crf_train_path)
        idp_crf_df.to_csv(f"{processeddata_path}/IDP-CRF_Training_Dataset.csv", index=False)

        # --- train/test splits ---
        train_df = make_train_df(fldpnn_df, idp_crf_df)
        splits_df = make_train_and_test_df(train_df, caid_df)
        final_train_df = splits_df.loc[splits_df['Split'] == 'Train'].reset_index(drop=True)
        final_test_df = splits_df.loc[splits_df['Split'] == 'Test'].reset_index(drop=True)
        final_train_df.to_csv(f"{splits_path}/train_df.csv", index=False)
        final_test_df.to_csv(f"{splits_path}/test_df.csv", index=False)

        # --- CAID2 competition results, restricted to the processed test sequences ---
        processed_caid2_df = pd.read_csv(f"{processeddata_path}/CAID-2_Disorder_NOX_Processed.csv")
        parse_all_caid2_results(processed_caid2_df)

        # --- FusionPDB structures ---
        # These two steps presumably produce the processed_data/fusionpdb/ CSV read below — TODO confirm.
        scrape_fusionpdb_level_2_3()
        process_fusions_and_hts()

        fusion_test_set = pd.read_csv("../../data/splits/test_df.csv")
        fusion_test_set = set(fusion_test_set['sequence'].tolist())
        log_update(f"\nFinding level 2 and 3 fusion structures that are in the FusOn-pLM test set...\n\tTest set size: {len(fusion_test_set)} seqs")

        level_2_3_info = pd.read_csv('processed_data/fusionpdb/FusionPDB_level2-3_cleaned_structure_info.csv')
        # The pLDDT mask must come from the de-duplicated frame itself: a mask built on the
        # full frame no longer lines up with the rows once duplicates are dropped and the
        # index is reset.
        unique_fusions = level_2_3_info.drop_duplicates('Fusion_Seq').reset_index(drop=True)
        level_2_3_seqs = set(
            unique_fusions.loc[unique_fusions['Fusion_pLDDT'].notna(), 'Fusion_Seq'].tolist()
        )

        test_benchmark_seqs = fusion_test_set.intersection(level_2_3_seqs)
        log_update(f"\tTotal fusion proteins in the FusOn-pLM test set: {len(test_benchmark_seqs)}")

        # Exclude anything the CAID predictor is trained on.
        caid_train_set = set(pd.read_csv('splits/train_df.csv')['Sequence'].tolist())
        test_benchmark_seqs = test_benchmark_seqs.difference(caid_train_set)
        log_update(f"\tTotal fusion proteins in the FusOn-pLM test set and NOT in the CAID train set: {len(test_benchmark_seqs)}")

        # --- benchmark dataframe, same columns as splits_df ---
        test_benchmark_df = pd.DataFrame(
            data={'Sequence': list(test_benchmark_seqs)}
        )
        seq_id_dict = dict(zip(level_2_3_info['Fusion_Seq'], level_2_3_info['FusionGID']))
        seq_plddts_dict = dict(zip(level_2_3_info['Fusion_Seq'], level_2_3_info['Fusion_AA_pLDDTs']))
        test_benchmark_df['IDs'] = test_benchmark_df['Sequence'].map(seq_id_dict)
        test_benchmark_df['Split'] = 'Fusion_Benchmark'
        test_benchmark_df['Source'] = 'FusionPDB_AlphaFold2'
        # Per-residue label: '1' where pLDDT < 68.8, else '0'.
        test_benchmark_df['Label'] = (
            test_benchmark_df['Sequence'].map(seq_plddts_dict).apply(_plddts_to_disorder_label)
        )

        lengths_match = (
            test_benchmark_df['Sequence'].str.len() == test_benchmark_df['Label'].str.len()
        ).all()
        log_update(f"\tAll seq lengths and label lengths match: {lengths_match}")

        # Truncate a copy for the log preview so the real frame is left untouched.
        preview_df = test_benchmark_df.head(10).copy()
        preview_df['Sequence'] = preview_df['Sequence'].apply(lambda x: x[0:10] + '...')
        preview_df['Label'] = preview_df['Label'].apply(lambda x: x[0:10] + '...')
        test_benchmark_df_str = preview_df.to_string(index=False)
        test_benchmark_df_str = "\t" + test_benchmark_df_str.replace("\n", "\n\t")
        log_update(f"\nPreview of benchmarking set:\n{test_benchmark_df_str}")

        test_benchmark_df.to_csv('splits/fusion_bench_df.csv', index=False)

        # --- combined splits file ---
        log_update(f"\nAdding benchmarking sequences to splits_df.csv:\n\tLength before adding bench seqs: {len(splits_df)}")
        splits_df = pd.concat([splits_df, test_benchmark_df])
        log_update(f"\tLength after adding bench seqs: {len(splits_df)}")
        # rename_axis + reset_index(name=...) labels the table the same way on pandas 1.x and 2.x.
        split_str = (splits_df['Split'].value_counts()
                     .rename_axis('Split')
                     .reset_index(name='count')
                     .to_string(index=False))
        split_str = "\t" + split_str.replace("\n", "\n\t")
        log_update(f"Distribution among splits:\n{split_str}")

        splits_df.to_csv(f"{splits_path}/splits.csv", index=False)
# Run the cleaning pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()