import os import numpy as np import re import pandas as pd import requests from fuson_plm.utils.logging import open_logfile, log_update from fuson_plm.utils.constants import DELIMITERS, VALID_AAS from fuson_plm.utils.data_cleaning import check_columns_for_listlike, find_invalid_chars from fuson_plm.benchmarking.caid.scrape_fusionpdb import scrape_fusionpdb_level_2_3 from fuson_plm.benchmarking.caid.process_fusion_structures import process_fusions_and_hts def download_fasta(uniprotid, includeIsoform, output_file): try: url = f"https://rest.uniprot.org/uniprotkb/search?format=fasta&includeIsoform={includeIsoform}&query=accession%3A{uniprotid}&size=500&sort=accession+asc" # Send a GET request to the URL response = requests.get(url) # Raise an exception if the request was unsuccessful response.raise_for_status() # Write the content to a file in text mode with open(output_file, 'a+') as file: file.write(response.text) log_update(f"FASTA file for {uniprotid} successfully downloaded and added to '{output_file}'") except requests.exceptions.RequestException as e: log_update(f"An error occurred: {e}") # Test Sequences (CAID-2 Disorder-NOX) def parse_caid_txt(fast_file): ''' Parses correctly fasta-formatted text file with conditions: Line 1: ID Line 2: Sequence Line 3: Label ''' seq_to_label = {} id_to_sequence = {} with open(fast_file, 'r') as file: label = None sequence = "" seq_id = None reading_sequence = False for line in file: line = line.strip() if line.startswith(">"): if label is not None and sequence: seq_to_label[sequence] = (label, seq_id) seq_id = line[1:] # Capture the ID without the '>' label = None sequence = "" reading_sequence = True elif reading_sequence: if all(c in "01-" for c in line): label = line reading_sequence = False else: sequence += line if label is not None and sequence: seq_to_label[sequence] = (label, seq_id) return seq_to_label def check_df_for_mismatched_labels(sd): log_update("\tChecking dataframe for mismatched sequences and labels...") counter=0 for idx, row in sd.iterrows(): seq = row['Sequence'] label = row['Label'] if len(seq) != len(label): counter+=1 log_update(f"\t\tLength mismatch at index {idx}: sequence length = {len(seq)}, label length = {len(label)}") log_update(f"\t\tTotal mismatched lengths/labels: {counter}") def process_caid2_disorder_nox_test(caid_path): """ Processes the CAID-2_Disorder_NOX_Testing_Sequences.fasta file """ log_update("Processing CAID-2-Disorder-NOX Testing Dataset") # Parse the fasta file caid_dict = parse_caid_txt(caid_path) # Gather the sequences caid_seqs = {} for k, (v, seq_id) in caid_dict.items(): caid_seqs[seq_id] = (k, v) log_update(f"\tTotal sequences: {len(caid_seqs)}") # Form dataframe from processed data caid_df = pd.DataFrame({ 'ID': list(caid_seqs.keys()), 'Sequence': [seq for seq, _ in caid_seqs.values()], 'Label': [lbl for _, lbl in caid_seqs.values()], 'Split': 'Test' }) check_df_for_mismatched_labels(caid_df) return caid_df # Training Squences (fldpnn and IDP-CRF) # fldpnn Training Sequences def parse_fldpnn_fasta(file_path): """ Parse flDPnn_Training_Dataset.txt, where there are 5 sequence lines. We only want the first >Disprot ID Amino acid sequence Experimental annotation for intrinsic disorder Experimental annotation for disordered protein binding Experimental annotation for disordered DNA binding Experimental annotation for disordered RNA binding Experimental annotation for disordered flexible linkers """ sequences = [] labels = [] ids = [] with open(file_path, 'r') as file: lines = file.readlines() seq_id = "" current_sequence = "" seen_label_lines = 0 # should go up to 5 for each current_labels = [] is_label = False for line in lines: line = line.strip() if line.startswith('>'): if current_sequence and current_labels: assert seen_label_lines==5 # we should've seen 5 labels, otherwise something is wrong ids.append(seq_id) sequences.append(current_sequence) labels.append(''.join(current_labels)) seq_id = line[1:] # Capture the ID without the '>' current_sequence = "" current_labels = [] is_label = False seen_label_lines = 0 elif re.match('^[A-Z]+$', line): # Sequence lines current_sequence += line else: # Label lines seen_label_lines+=1 if seen_label_lines==1: current_labels.append(line) is_label = True # Add the last sequence and labels if current_sequence and current_labels: sequences.append(current_sequence) labels.append(''.join(current_labels)) ids.append(seq_id) return ids, sequences, labels def parse_idp_crf_fasta(file_path): sequences = [] labels = [] ids = [] with open(file_path, 'r') as file: lines = file.readlines() seq_id = "" current_sequence = "" current_labels = [] is_label = False for line in lines: line = line.strip() if line.startswith('>'): if current_sequence and current_labels: ids.append(seq_id) sequences.append(current_sequence) labels.append(''.join(current_labels)) seq_id = line[1:] # Capture the ID without the '>' current_sequence = "" current_labels = [] is_label = False elif re.match('^[A-Z]+$', line): # Sequence lines current_sequence += line else: # Label lines current_labels.append(line) is_label = True # Add the last sequence and labels if current_sequence and current_labels: sequences.append(current_sequence) labels.append(''.join(current_labels)) ids.append(seq_id) return ids, sequences, labels def process_fldpnn(fldpnn_path, split="training"): """ Process the fldpnn_Training_Dataset """ log_update(f"\nProcessing flDPnn {split} dataset") # Parse fasta fldpnn_ids, fldpnn_seqs, fldpnn_labels = parse_fldpnn_fasta(fldpnn_path) # Collect cleaned labels cleaned_fldpnn_ids = [] cleaned_fldpnn_labels = [] for i in range(len(fldpnn_seqs)): seq_len = len(fldpnn_seqs[i]) label = fldpnn_labels[i] # Should only be the first set of labels id = fldpnn_ids[i] cleaned_fldpnn_labels.append(label) log_update(f"\tTotal labels: {len(cleaned_fldpnn_labels)}, total sequences: {len(fldpnn_seqs)},total IDs: {len(fldpnn_ids)}") fldpnn_df = pd.DataFrame({'Sequence': fldpnn_seqs, 'Label': cleaned_fldpnn_labels, "Split": "Train" if split=="training" else "Val", "ID": fldpnn_ids}) check_df_for_mismatched_labels(fldpnn_df) return fldpnn_df def combine_fldpnn_train_val(fldpnn_train_df, fldpnn_val_df): log_update("\nJoining flDPnn train and val sets into one training set for CAID predictor") combined = pd.concat([fldpnn_train_df,fldpnn_val_df]) # check for duplicates duplicates = combined[combined['Sequence'].duplicated()]['Sequence'].unique().tolist() n_rows_with_duplicates = len(combined[combined['Sequence'].isin(duplicates)]) log_update(f"\t{len(duplicates)} sequences in both train and val datasets, corresponding to {n_rows_with_duplicates} rows") for dup in duplicates: train_id = combined.loc[(combined['Sequence']==dup) & (combined['Split']=='Train')]['ID'].reset_index(drop=True).iloc[0] val_id = combined.loc[(combined['Sequence']==dup) & (combined['Split']=='Val')]['ID'].reset_index(drop=True).iloc[0] train_label = combined.loc[(combined['Sequence']==dup) & (combined['Split']=='Train')]['Label'].reset_index(drop=True).iloc[0] val_label = combined.loc[(combined['Sequence']==dup) & (combined['Split']=='Val')]['Label'].reset_index(drop=True).iloc[0] log_update(f"\t\tTrain ID: {train_id}\tVal ID: {val_id}\tSame labels: {train_label==val_label}\tSequence: {dup}") # if the labels are not equal, get rid of it completely. Otherwise just get rid of the val case if not(train_label==val_label): log_update(f"\t\t\tSince labels are not equal, removing sequence completely") combined = combined[combined['Sequence']!=dup].reset_index(drop=True) else: log_update(f"\t\t\tSince labels are equal, removing validation copy") combined = combined.loc[(combined['Sequence']!=dup) | ((combined['Sequence']==dup) & (combined['Split']=='Train'))] # drop duplicates log_update(f"\tLength of joined flDPnn data: {len(combined)}") return combined def process_idp_crf_train(idp_crf_train_path): """ Process IDP-CRF_Training_Dataset Args: idp_crf_train_path """ log_update("\nProcessing IDP-CRF training dataset") # Parse the fasta, get sequences and labels idp_crf_ids, idp_crf_seqs, idp_crf_labels = parse_idp_crf_fasta(idp_crf_train_path) log_update(f"\tTotal labels: {len(idp_crf_labels)}, total sequences: {len(idp_crf_seqs)}, total IDs: {len(idp_crf_ids)}") # Clean the labels cleaned_idp_ids, cleaned_idp_seqs, cleaned_idp_labels = [], [], [] counter = 0 log_update("\tCleaning labels and counting length-mismatched examples...") for i, label in enumerate(idp_crf_labels): # If length of sequence and labels doesn't match, log it if len(idp_crf_seqs[i]) != len(idp_crf_labels[i]): log_update(f"\t\tLength mismatch at index {i}: sequence length = {len(idp_crf_seqs[i])}, label length = {len(idp_crf_labels[i])}") counter += 1 # Else, "clean" the labels by mapping them to ints and converting them to a list else: cleaned_idp_ids.append(idp_crf_ids[i]) cleaned_idp_labels.append(label) cleaned_idp_seqs.append(idp_crf_seqs[i]) log_update(f"\t\tMismatched lengths/labels: {counter}") # Confirm that final database has no mismatched labels idp_crf_df = pd.DataFrame({'Sequence': cleaned_idp_seqs, 'Label': cleaned_idp_labels, "Split": "Train", "ID": cleaned_idp_ids}) check_df_for_mismatched_labels(idp_crf_df) return idp_crf_df def find_agreeing_labels(row, lab1="", lab2=""): """ If there's only one possible label, return that label. If the two labels disagree, return np.nan """ val1 = row[lab1] val2 = row[lab2] # If one of them is nan, then they won't match anyway, so return True because there is no conflict if type(val1)==float and np.isnan(val1): return val2 elif type(val2)==float and np.isnan(val2): return val1 else: if val1==val2: return val1 else: return np.nan def get_unique_ids(row): source_to_id = { "IDP-CRF": row["IDP-CRF ID"], "flDPnn": row["flDPnn ID"], "CAID-2_Disorder_NOX": row["CAID-2_Disorder_NOX ID"] } all_sources = row["Source"].split(",") all_ids = [] # they are already in the desired order so just iterate through them for source in all_sources: candidate_id = source_to_id[source] if not(candidate_id in all_ids): all_ids.append(candidate_id) return ",".join(all_ids) def parse_caid2_results(processed_caid2_df,lines): # iterate through the lines all_caid2_disorder_nox_ids = processed_caid2_df['ID'].tolist() all_caid2_disorder_nox_sequences = processed_caid2_df['Sequence'].tolist() cur_id = None results = { } for i, line in enumerate(lines): # If line starts with >, that means we have a new ID if line[0]==">": # If we are currently on a different cur_id, finish that one out if not(cur_id is None): results[cur_id]['prob_1'] = ",".join(results[cur_id]['prob_1']) results[cur_id]['pred_labels'] = ",".join(results[cur_id]['pred_labels']) sequence = results[cur_id]['sequence'] # get the true labels from the CAID2 dataset - IF POSSIBLE if (cur_id not in all_caid2_disorder_nox_ids) and (sequence not in all_caid2_disorder_nox_sequences): results[cur_id]['labels'] = np.nan else: true_labels = processed_caid2_df.loc[ processed_caid2_df['ID']==cur_id,'Label' ].item() true_labels = ",".join(list(true_labels)) results[cur_id]['labels'] = true_labels # Now process the new one cur_id = line[1::].strip('\t').strip('\n') results[cur_id] = { 'sequence': '', 'prob_1': [], 'pred_labels': [] } # if cur id is not None else: # if we have a cur id to process, process it if not(cur_id is None): # Extract the information - not every .caid file as predicted labels!! lsplit = line.strip('\n').split('\t') label='' idx, aa, prob = lsplit[0], lsplit[1], lsplit[2] if len(lsplit)==4: label=lsplit[3] # Add to dict results[cur_id]['sequence']+=aa results[cur_id]['prob_1'].append(prob) results[cur_id]['pred_labels'].append(label) # if we're on the last line, combine if i==len(lines)-1: results[cur_id]['prob_1'] = ",".join(results[cur_id]['prob_1']) results[cur_id]['pred_labels'] = ",".join(results[cur_id]['pred_labels']) sequence = results[cur_id]['sequence'] # get the true labels from the CAID2 dataset - IF POSSIBLE if (cur_id not in all_caid2_disorder_nox_ids) and (sequence not in all_caid2_disorder_nox_sequences): results[cur_id]['labels'] = np.nan else: true_labels = processed_caid2_df.loc[ processed_caid2_df['ID']==cur_id,'Label' ].item() true_labels = ",".join(list(true_labels)) results[cur_id]['labels'] = true_labels df = pd.DataFrame.from_dict(results,orient='index').reset_index().rename(columns={'index':'seq_id'}) df = df.loc[df['labels'].notna()].reset_index(drop=True) # drop pred_labels if it's empty if set(','.join(df['pred_labels'].tolist()))=={','}: df = df.drop(columns=['pred_labels']) log_update(f"\t\tno predicted labels provided for this dataset; only probabilities") log_update(f"\t\t{len(df)}/{len(all_caid2_disorder_nox_sequences)} total CAID2-Nox sequences") return df def parse_all_caid2_results(processed_caid2_df, caid_raw_folder="raw_data/caid2_competition_results"): save_dir ="processed_data/caid2_competition_results" os.makedirs(save_dir,exist_ok=True) log_update(f"\nExtracting all CAID-2_Disorder_NOX results from CAID2 competition results files...") all_caid_files = os.listdir(caid_raw_folder) for caid_file in all_caid_files: # figure out how to parse .caid files with open(f"{caid_raw_folder}/{caid_file}", "r") as f: lines = f.readlines() log_update(f"\t{caid_file}:") results_df = parse_caid2_results(processed_caid2_df,lines) # save it competitor_name = caid_file.split('.caid')[0] results_df.to_csv(f"{save_dir}/{competitor_name}_CAID-2_Disorder_NOX.csv",index=False) def make_train_df(fldpnn_df, idp_crf_df): """ Make training dataframe by concatenating the two processed training sets. """ # Add source columns so we can track where each sequence came from idp_crf_df = idp_crf_df.rename(columns={'Label':'IDP-CRF Label', 'ID': 'IDP-CRF ID'}).drop(columns=['Split']) fldpnn_df = fldpnn_df.rename(columns={'Label':'flDPnn Label', 'ID': 'flDPnn ID'}).drop(columns=['Split']) ########### Combine fldpnn and idp crf # Join log_update("\nJoining flDPnn and IDP-CRF data by sequence make unified training set") train_df = pd.merge(idp_crf_df, fldpnn_df, on='Sequence', how='outer', indicator=True) train_df["Split"] = ["Train"]*len(train_df) # Map _merge column to desired labels train_df['Source'] = train_df['_merge'].map({ 'left_only': 'IDP-CRF', 'right_only': 'flDPnn', 'both': 'IDP-CRF,flDPnn' }) train_df = train_df.drop(columns=["_merge"]) log_update(f"\tIDP-CRF dataset size: {len(idp_crf_df)}\n\tfLDpnn dataset size: {len(fldpnn_df)}\n\tinitial train dataset size: {len(train_df)}") # Check for duplicate sequences log_update(f"\tChecking for sequences in both datasets...") duplicates = train_df[train_df["Source"].str.contains(",")]['Sequence'].unique().tolist() n_rows_with_duplicates = len(train_df[train_df['Sequence'].isin(duplicates)]) log_update(f"\t\t{len(duplicates)} sequences in both datasets, corresponding to {n_rows_with_duplicates} rows") # Check for consistency between IDP-CRF Label and flDPnn label train_df["Label"] = train_df.apply(lambda row: find_agreeing_labels(row,lab1="IDP-CRF Label",lab2="flDPnn Label"),axis=1) train_df["No Label Conflicts"]= ~train_df["Label"].isna() log_update(f"\tChecked for label inconsistencies between IDP-CRF and flDPnn on the same sequence:") match_str = train_df['No Label Conflicts'].value_counts().reset_index().rename(columns={'index': 'No Label Conflicts','No Label Conflicts': 'count'}).to_string(index=False) match_str = "\t\t" + match_str.replace("\n","\n\t\t") log_update(match_str) # Dropping rows where labels don't match #train_df[train_df['No Label Conflicts']==False][['Sequence','Split','IDP-CRF ID','flDPnn ID','IDP-CRF Label','flDPnn Label','No Label Conflicts']].to_csv('mismatch.csv',index=False) # Drop row with known conflict with disprot conflict_seq="MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKRMRRRIESEDEEDSSQDALVPRTPSPRPSTSAADLAIAPKKKKKRPSPKPERPPSPEVIVDSEEEREDVALQMVGFSNPPVLIKHGKGGKRTVRRLNEDDPVARGMRTQEEEEEPSEAESEITVMNPLSVPIVSAWEKGMEAARALMDKYHVDNDLKANFKLLPDQVEALAAVCKTWLNEEHRGLQLTFTSKKTFVTMMGRFLQAYLQSFAEVTYKHHEPTGCALWLHRCAEIEGELKCLHGSIMINKEHVIEMDVTSENGQRALKEQSSKAKIVKNRWGRNVVQISNTDARCCVHDAACPANQFSGKSCGMFFSEGAKAQVAFKQIKAFMQALYPNAQTGHGHLLMPLRCECNSKPGHAPFLGRQLPKLTPFALSNAEDLDADLISDKSVLASVHHPALIVFQCCNPVYRNSRAQGGGPNCDFKISAPDLLNALVMVRSLWSENFTELPRMVVPEFKWSTKHQYRNVSLPVAHSDARQNPFDF" train_df = train_df.loc[train_df['Sequence']!=conflict_seq].reset_index(drop=True) log_update(f"\tDropping rows with label mismatch or known error (total={len(train_df[train_df['No Label Conflicts']==False])+1})") train_df = train_df.loc[train_df['No Label Conflicts']].reset_index(drop=True) # Make a new label column train_df = train_df.drop(columns=["IDP-CRF Label","flDPnn Label"]) log_update(f"\t\tNew dataset size: {len(train_df)}") ######## Final checks # Check for any invalid sequences or invalid characters cols_of_interest = ['Sequence','Split','Label','IDP-CRF ID','flDPnn ID'] listlike_dict = check_columns_for_listlike(train_df, cols_of_interest, DELIMITERS) # Check for invalid characters train_df['invalid_chars'] = train_df['Sequence'].apply(lambda x: find_invalid_chars(x, VALID_AAS)) train_df[train_df['invalid_chars'].str.len()>0].sort_values(by='Sequence') all_invalid_chars = set().union(*train_df['invalid_chars']) log_update(f"\tchecking for invalid characters...\n\t\tset of all invalid characters discovered within train_df: {all_invalid_chars}") # Dropping rows where invalid characters(should be none) log_update(f"\tDropping rows with invalid characters (total={len(train_df[train_df['invalid_chars'].str.len()>0])})") train_df = train_df.loc[train_df['invalid_chars'].str.len()==0].reset_index(drop=True) train_df = train_df.drop(columns=['invalid_chars']) log_update(f"\t\tNew dataset size: {len(train_df)}") source_str = train_df['Source'].value_counts().reset_index().rename(columns={'index': 'Source','Source': 'count'}).to_string(index=False) source_str = "\t\t" + source_str.replace("\n","\n\t\t") log_update(f"\tSources:\n{source_str}") return train_df def make_train_and_test_df(train_df, test_df): """ Combine the training and testing dataframe into one """ log_update("\nMaking final dataframe with train and test splits") # Concatenate proposed train and test test_df["Source"] = ["CAID-2_Disorder_NOX"]*len(test_df) splits_df = pd.concat([train_df.drop(columns=['No Label Conflicts']), test_df.rename(columns={'ID':'CAID-2_Disorder_NOX ID', 'Label': 'CAID-2_Disorder_NOX Label'})]) split_str = splits_df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'}).to_string(index=False) split_str = "\t\t" + split_str.replace("\n","\n\t\t") log_update(f"\tTrain dataset size: {len(train_df)}\n\tTest dataset size: {len(test_df)}\n\tinitial combined dataset size: {len(splits_df)}") # Check for duplicates - if we find any, REMOVE them from train and keep them in test duplicates = splits_df[splits_df.duplicated('Sequence')]['Sequence'].unique().tolist() n_rows_with_duplicates = len(splits_df[splits_df['Sequence'].isin(duplicates)]) log_update(f"\t\t{len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows") for i, dup in enumerate(duplicates): fldpnn_id = splits_df.loc[(splits_df['Sequence']==dup)&(splits_df['Split']=='Train')]['flDPnn ID'].item() idp_crf_id = splits_df.loc[(splits_df['Sequence']==dup)&(splits_df['Split']=='Train')]['IDP-CRF ID'].item() caid2_disorder_nox_id = splits_df.loc[(splits_df['Sequence']==dup)&(splits_df['Split']=='Test')]['CAID-2_Disorder_NOX ID'].item() log_update(f"\t\t\t{i+1}: flDPnn ID: {fldpnn_id}\tIDP-CRF ID: {idp_crf_id}\tCAID-2_Disorder_NOX ID: {caid2_disorder_nox_id}\n\t\t\t\tSequence: {dup}") # remove from train and keep in test splits_df = splits_df.loc[ (~splits_df['Sequence'].isin(duplicates)) | # Either the sequence is NOT duplicated, or ((splits_df['Sequence'].isin(duplicates)) & (splits_df['Split']=='Test')) # Sequence is duplicated, and it's in test set ].reset_index(drop=True) split_str = splits_df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'}).to_string(index=False) split_str = "\t\t" + split_str.replace("\n","\n\t\t") log_update(f"\tRemoved duplicate sequences from training split, kept in test split\n\t\tNew dataset size: {len(splits_df)}\n\n{split_str}") # Everything in the train set should have a label; nothing in the test set should assert splits_df[splits_df["Label"].isna()]["Split"].value_counts().reset_index()['index'].tolist()==['Test'] splits_df.loc[ splits_df["Split"]=="Test","Label" ] = splits_df.loc[ splits_df["Split"]=="Test","CAID-2_Disorder_NOX Label" ] splits_df = splits_df.drop(columns=["CAID-2_Disorder_NOX Label"]) # Make sure there are no na's in label assert len(splits_df[splits_df["Label"].isna()])==0 # Print out distribution of sources source_str = splits_df['Source'].value_counts().reset_index().rename(columns={'index': 'Source','Source': 'count'}).to_string(index=False) source_str = "\t\t" + source_str.replace("\n","\n\t\t") total_sources = sum(splits_df['Source'].value_counts().reset_index()['Source']) assert total_sources == len(splits_df) log_update(f"\n\tSource distribution:\n{source_str}\n\n\t\t\t\t\t\tSum: {total_sources}") # Print largest and smallest seq len in each set longest_train = max(splits_df[splits_df['Split']=='Train']['Sequence'].apply(lambda x: len(x)).tolist()) shortest_train = min(splits_df[splits_df['Split']=='Train']['Sequence'].apply(lambda x: len(x)).tolist()) longest_test = max(splits_df[splits_df['Split']=='Test']['Sequence'].apply(lambda x: len(x)).tolist()) shortest_test = min(splits_df[splits_df['Split']=='Test']['Sequence'].apply(lambda x: len(x)).tolist()) log_update(f"\n\tLength distributions...\n\t\tTrain: max={longest_train}\tmin={shortest_train}\n\t\tTest: max={longest_test}\tmin={shortest_test}") # Consolidate the IDs a bit splits_df["IDs"] = splits_df.apply(lambda row: get_unique_ids(row),axis=1) assert len(splits_df[splits_df["IDs"].isna()])==0 n_different_ids = len(splits_df.loc[splits_df["IDs"].str.contains(",")]) log_update(f"\n\tProvided comma-separated IDs in same listed order as Source\n\t\t- train: IDP-CRF first, flDPnn second ({n_different_ids} seqs have multiple distinct IDs)\n\t\t- test: CAID-2_Disorder_NOX") # Keep only desired columns splits_df = splits_df[[ 'Sequence','IDs','Split','Source','Label' ]] return splits_df def main(): with open_logfile("data_cleaning_log.txt"): rawdata_train_test_path = "raw_data/caid2_train_and_test_data" # make directory to save processed data processeddata_path = "processed_data" splits_path = "splits" os.makedirs(processeddata_path,exist_ok=True) os.makedirs(splits_path,exist_ok=True) # Process CAID-2_Disorder_NOX_Testing_Sequences dataset from fasta file caid_path = f"{rawdata_train_test_path}/CAID-2_Disorder_NOX_Testing_Sequences.fasta" caid_df = process_caid2_disorder_nox_test(caid_path) caid_df.to_csv(f"{processeddata_path}/CAID-2_Disorder_NOX_Processed.csv", index=False) # Process fldpnn Training and Validation Datasets fldpnn_train_path = f"{rawdata_train_test_path}/flDPnn_Training_Dataset.txt" fldpnn_val_path = f"{rawdata_train_test_path}/flDPnn_Validation_Annotation.txt" fldpnn_train_df = process_fldpnn(fldpnn_train_path, split="training") fldpnn_val_df = process_fldpnn(fldpnn_val_path, split="validation") fldpnn_train_df.to_csv(f"{processeddata_path}/flDPnn_Training_Dataset.csv", index=False) fldpnn_val_df.to_csv(f"{processeddata_path}/flDPnn_Validation_Dataset.csv", index=False) # Combine train and val fldpnn_df = combine_fldpnn_train_val(fldpnn_train_df, fldpnn_val_df) # Process IDP-CRF_Training_Dataset idp_crf_train_path = f"{rawdata_train_test_path}/IDP-CRF_Training_Dataset.txt" idp_crf_df= process_idp_crf_train(idp_crf_train_path) idp_crf_df.to_csv(f"{processeddata_path}/IDP-CRF_Training_Dataset.csv", index=False) # Merge train_df = make_train_df(fldpnn_df, idp_crf_df) # Make a full splits file splits_df = make_train_and_test_df(train_df, caid_df) final_train_df = splits_df.loc[splits_df['Split']=='Train'].reset_index(drop=True) final_test_df = splits_df.loc[splits_df['Split']=='Test'].reset_index(drop=True) # Save final files final_train_df.to_csv(f"{splits_path}/train_df.csv", index=False) final_test_df.to_csv(f"{splits_path}/test_df.csv", index=False) # Process the caid competition results and save them in a more accessible format processed_caid2_df = pd.read_csv(f"{processeddata_path}/CAID-2_Disorder_NOX_Processed.csv") parse_all_caid2_results(processed_caid2_df) # Process data for visualizing fusion oncoproteins # Scrape FusionPDB scrape_fusionpdb_level_2_3() # Process the structures that we downloaded from scraping process_fusions_and_hts() # Now, figure out which structures are in the test set and isolate those for benchmarking in splits/fusion_bench_df.csv fusion_test_set = pd.read_csv("../../data/splits/test_df.csv") # columns are sequence, member length, snp_probabilities fusion_test_set = set(fusion_test_set['sequence'].tolist()) log_update(f"\nFinding level 2 and 3 fusion structures that are in the FusOn-pLM test set...\n\tTest set size: {len(fusion_test_set)} seqs") level_2_3_info = pd.read_csv('processed_data/fusionpdb/FusionPDB_level2-3_cleaned_structure_info.csv') # there are duplicate sequences in here so drop duplicates randomly level_2_3_seqs = level_2_3_info.drop_duplicates('Fusion_Seq').reset_index(drop=True) level_2_3_seqs = set(level_2_3_seqs.loc[ level_2_3_info['Fusion_pLDDT'].notna() # make sure we've got a structure ]['Fusion_Seq'].tolist()) # if it has a structure, it's in the test set, and it's not in the caid train set, we can benchmark with it test_benchmark_seqs = fusion_test_set.intersection(level_2_3_seqs) log_update(f"\tTotal fusion proteins in the FusOn-pLM test set: {len(test_benchmark_seqs)}") caid_train_set = set(pd.read_csv('splits/train_df.csv')['Sequence'].tolist()) test_benchmark_seqs = test_benchmark_seqs.difference(caid_train_set) # subtract off the caid train set to be sure log_update(f"\tTotal fusion proteins in the FusOn-pLM test set and NOT in the CAID train set: {len(test_benchmark_seqs)}") # Finally, make a dataframe structured like train_df and test_df. Columns are: Sequence,IDs,Split,Source,Label # Let's make the IDs FusionGID test_benchmark_df = pd.DataFrame( data = {'Sequence': list(test_benchmark_seqs)} ) seq_id_dict = dict(zip(level_2_3_info['Fusion_Seq'],level_2_3_info['FusionGID'])) seq_plddts_dict = dict(zip(level_2_3_info['Fusion_Seq'],level_2_3_info['Fusion_AA_pLDDTs'])) test_benchmark_df['IDs'] = test_benchmark_df['Sequence'].map(seq_id_dict) test_benchmark_df['Split'] = ['Fusion_Benchmark']*len(test_benchmark_df) test_benchmark_df['Source'] = ['FusionPDB_AlphaFold2']*len(test_benchmark_df) test_benchmark_df['Label'] = test_benchmark_df['Sequence'].map(seq_plddts_dict) # convert label to 1 or 0 test_benchmark_df['Label'] = test_benchmark_df['Label'].apply(lambda x: x.split(",")) test_benchmark_df['Label'] = test_benchmark_df['Label'].apply(lambda x: [float(y) for y in x]) # make it a float list of pLDDTs test_benchmark_df['Label'] = test_benchmark_df['Label'].apply(lambda x: ['1' if y < 68.8 else '0' for y in x]) # disordered if pLDDT is < 68.8, accoridng to AlphaFold-pLDDT published threshold test_benchmark_df['Label'] = test_benchmark_df['Label'].apply(lambda x: ''.join(x)) # change ['1','1','0''] to '110' # check lengths test_benchmark_df['SeqLen'] = test_benchmark_df['Sequence'].apply(lambda x: len(x)) test_benchmark_df['LabelLen'] = test_benchmark_df['Label'].apply(lambda x: len(x)) log_update(f"\tAll seq lengths and label lengths match: {(test_benchmark_df['SeqLen']==test_benchmark_df['LabelLen']).all()}") test_benchmark_df = test_benchmark_df.drop(columns=['SeqLen','LabelLen']) # convert to string test_benchmark_df_str = test_benchmark_df.head(10) test_benchmark_df_str['Sequence'] = test_benchmark_df_str['Sequence'].apply(lambda x: x[0:10]+'...') test_benchmark_df_str['Label'] = test_benchmark_df_str['Label'].apply(lambda x: x[0:10]+'...') test_benchmark_df_str = test_benchmark_df_str.to_string(index=False) test_benchmark_df_str = "\t" + test_benchmark_df_str.replace("\n","\n\t") log_update(f"\nPreview of benchmarking set:\n{test_benchmark_df_str}") test_benchmark_df.to_csv('splits/fusion_bench_df.csv',index=False) # Add the benchmarking sequences to split log_update(f"\nAdding benchmarking sequences to splits_df.csv:\n\tLength before adding bench seqs: {len(splits_df)}") splits_df = pd.concat([splits_df,test_benchmark_df]) log_update(f"\tLength after adding bench seqs: {len(splits_df)}") split_str = splits_df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'}).to_string(index=False) split_str = "\t" + split_str.replace("\n","\n\t") log_update(f"Distribution among splits:\n{split_str}") splits_df.to_csv(f"{splits_path}/splits.csv",index=False) if __name__ == "__main__": main()