import pandas as pd import numpy as np import os from fuson_plm.utils.logging import open_logfile, log_update from fuson_plm.utils.constants import DELIMITERS, VALID_AAS from fuson_plm.utils.data_cleaning import check_columns_for_listlike, find_invalid_chars from fuson_plm.benchmarking.idr_prediction.plot import plot_all_values_hist_grid, plot_all_train_val_test_values_hist_grid def process_raw_albatross(df): # return a version of the df with first column split, duplicates cleaned,columns checked for weird characters and invalids # first, look at the splits split_str = df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'}) tot_prots = sum(split_str['count']) split_str['pcnt'] = round(100*split_str['count']/tot_prots,2) split_str = split_str.to_string(index=False) split_str = "\t\t" + split_str.replace("\n","\n\t\t") log_update(f"\tTotal proteins: {tot_prots}\n\tSplits:\n{split_str}") # format: IDR_19076_tr___A0A8M9PNM5___A0A8M9PNM5_DANRE # or: synth_test_sequence0 df['temp'] = df['ID'].str.split("_") df['ID'] = df['temp'].apply(lambda x: f"{x[0]}" if len(x)==1 else f"{x[0]}_{x[1]}" if len(x)<3 else f"{x[0]}_{x[1]}_{x[2]}") # Not ever column has UniProt IDs and Names, so we have to allow np.nan if this info is missing. df['UniProt_ID'] = df['temp'].apply(lambda x: x[5].strip() if len(x)>=5 else np.nan) df['UniProt_Name'] = df['temp'].apply(lambda x: f"{x[8].strip()}_{x[9].strip()}" if len(x)>=8 else np.nan) df = df.drop(columns=['temp']) cols_to_check = list(df.columns) cols_to_check.remove('Value') # don't check this one because it shouldn't be string # Investigate the colimns we just created and make sure they don't have any invalid features. # make sure value is float type assert df['Value'].dtype == 'float64' check_columns_for_listlike(df, cols_of_interest=cols_to_check, delimiters=DELIMITERS) # Check for invalid AAs df['invalid_chars'] = df['Sequence'].apply(lambda x: find_invalid_chars(x, VALID_AAS)) df[df['invalid_chars'].str.len()>0].sort_values(by='Sequence') all_invalid_chars = set().union(*df['invalid_chars']) log_update(f"\tchecking for invalid characters...\n\t\tset of all invalid characters discovered within train_df: {all_invalid_chars}") # Assert no invalid AAs assert (df['invalid_chars'].str.len()==0).all() df = df.drop(columns=['invalid_chars']) # Check for duplicates - if we find any, REMOVE them from train and keep them in test duplicates = df[df.duplicated('Sequence')]['Sequence'].unique().tolist() n_rows_with_duplicates = len(df[df['Sequence'].isin(duplicates)]) log_update(f"\t{len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows") # Look for distribution of duplicates WITHIN train, WITHIN test, and BETWEEN train and test # Train only duplicates = df.loc[ (df['Split']=='Train') ] duplicates = duplicates[duplicates.duplicated('Sequence')]['Sequence'].unique().tolist() n_rows_with_duplicates = len(df.loc[ (df['Sequence'].isin(duplicates)) & (df['Split']=='Train') ]) log_update(f"\t\twithin TRAIN only: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} Train rows") # Test only duplicates = df.loc[ (df['Split']=='Test') ] duplicates = duplicates[duplicates.duplicated('Sequence')]['Sequence'].unique().tolist() n_rows_with_duplicates = len(df.loc[ (df['Sequence'].isin(duplicates)) & (df['Split']=='Test') ]) log_update(f"\t\twithin TEST only: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} Test rows") # Between train and test duplicates_df = df.groupby('Sequence').agg({ 'Split': lambda x: ','.join(set(x)) }).reset_index() duplicates_df = duplicates_df.loc[duplicates_df['Split'].str.contains(',')].reset_index(drop=True) duplicates = duplicates_df['Sequence'].unique().tolist() n_rows_with_duplicates = len(df[df['Sequence'].isin(duplicates)]) log_update(f"\t\tduplicates in BOTH TRAIN AND TEST: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows") log_update(f"\t\tprinting portion of dataframe with train+test shared seqs:\n{duplicates_df.head(5)}") log_update("\tGrouping by sequence, averaging values, and keeping any Train/Test duplicates in the Test set...") df = df.replace(np.nan, '') df = df.groupby('Sequence').agg( Value=('Value', 'mean'), Value_STD=('Value', 'std'), IDs=('ID', lambda x: ','.join(x)), UniProt_IDs=('UniProt_ID', lambda x: ','.join(x)), UniProt_Names=('UniProt_Name', lambda x: ','.join(x)), Split=('Split', lambda x: ','.join(x)) ).reset_index() for col in ['IDs','UniProt_IDs','UniProt_Names','Split']: df[col] = df[col].apply(lambda x: [y for y in x.split(',')]) df[col] = df[col].apply(lambda x: ','.join(x)) df[col] = df[col].str.strip(',') # make sure there are no commas left assert len(df[df[col].str.contains(',,')])==0 # set Split to Test if test is in it df['Split'] = df['Split'].apply(lambda x: 'Test' if 'Test' in x else 'Train') # For anything that wasn't duplicated, Value_STD is nan log_update("\tChecking coefficients of variation for averaged rows") # calculate coefficient of variation, should be < 10 df['Value_CV'] = 100*df['Value_STD']/df['Value'] log_update(f"\t\tTotal rows with coefficient of variation (CV)\n\t\t\t<=10%: {len(df[df['Value_CV']<=10])}\n\t\t\t>10%: {len(df[df['Value_CV']>10])}\n\t\t\t>20%: {len(df[df['Value_CV']>20])}") # Ensure there are no duplicates assert len(df[df['Sequence'].duplicated()])==0 log_update(f"\tNo remaining duplicates: {len(df[df['Sequence'].duplicated()])==0}") # Print the final distribution of train and test values split_str = df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'}) tot_prots = sum(split_str['count']) split_str['pcnt'] = round(100*split_str['count']/tot_prots,2) split_str = split_str.to_string(index=False) split_str = "\t\t" + split_str.replace("\n","\n\t\t") log_update(f"\tTotal proteins: {tot_prots}\n\tSplits:\n{split_str}") return df def combine_albatross_seqs(asph, scaled_re, scaled_rg, scaling_exp): log_update("\nCombining all four dataframes into one file of ALBATROSS sequences") asph = asph[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'asph'}) scaled_re = scaled_re[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaled_re'}) scaled_rg = scaled_rg[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaled_rg'}) scaling_exp = scaling_exp[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaling_exp'}) combined = asph.merge(scaled_re, on='Sequence',how='outer',suffixes=('_asph', '_scaledre'))\ .merge(scaled_rg, on='Sequence',how='outer',suffixes=('_scaledre', '_scaledrg'))\ .merge(scaling_exp, on='Sequence',how='outer',suffixes=('_scaledrg', '_scalingexp')).fillna('') # Make sure something that's in train for one is in train for all, and not test combined['IDs'] = combined['IDs_asph']+','+combined['IDs_scaledre']+','+combined['IDs_scaledrg']+','+combined['IDs_scalingexp'] combined['UniProt_IDs'] = combined['UniProt_IDs_asph']+','+combined['UniProt_IDs_scaledre']+','+combined['UniProt_IDs_scaledrg']+','+combined['UniProt_IDs_scalingexp'] combined['UniProt_Names'] = combined['UniProt_Names_asph']+','+combined['UniProt_Names_scaledre']+','+combined['UniProt_Names_scaledrg']+','+combined['UniProt_Names_scalingexp'] combined['Split'] = combined['Split_asph']+','+combined['Split_scaledre']+','+combined['Split_scaledrg']+','+combined['Split_scalingexp'] # Make the lists clean for col in ['IDs','UniProt_IDs','UniProt_Names','Split']: combined[col] = combined[col].apply(lambda x: [y.strip() for y in x.split(',') if len(y)>0]) combined[col] = combined[col].apply(lambda x: ','.join(set(x))) combined[col] = combined[col].str.strip(',') # make sure there are no commas left assert len(combined[combined[col].str.contains(',,')])==0 combined = combined[['Sequence','IDs','UniProt_IDs','UniProt_Names','Split','asph','scaled_re','scaled_rg','scaling_exp']] # drop unneeded merge relics combined = combined.replace('',np.nan) # Make sure there are no sequences where split is both train and test log_update("\tChecking for any cases where a protein is Train for one IDR prediction task and Test for another (should NOT happen!)") duplicates_df = combined.groupby('Sequence').agg({ 'Split': lambda x: ','.join(set(x)) }).reset_index() duplicates_df = duplicates_df.loc[duplicates_df['Split'].str.contains(',')].reset_index(drop=True) duplicates = duplicates_df['Sequence'].unique().tolist() n_rows_with_duplicates = len(combined[combined['Sequence'].isin(duplicates)]) log_update(f"\t\tsequences in BOTH TRAIN AND TEST: {len(duplicates)} sequences, corresponding to {n_rows_with_duplicates} rows") if len(duplicates)>0: log_update(f"\t\tprinting portion of assert len(combined[combined['asph'].notna()])==len(asph)dataframe with train+test shared seqs:\n{duplicates_df.head(5)}") # Now, get rid of duplicates combined = combined.drop_duplicates().reset_index(drop=True) duplicates = combined[combined.duplicated('Sequence')]['Sequence'].unique().tolist() log_update(f"\tDropped duplicates.\n\tTotal duplicate sequences: {len(duplicates)}\n\tTotal sequences: {len(combined)}") assert len(duplicates)==0 # See how many columns have multiple entries for each log_update(f"\tChecking how many sequences have multiple of the following: ID, UniProt ID, UniProt Name") for col in ['IDs','UniProt_IDs','UniProt_Names','Split']: n_multiple = len(combined.loc[(combined[col].notna()) & (combined[col].str.contains(','))]) log_update(f"\t\t{col}: {n_multiple}") # See how many entries there are of each cproperty (should match length of original database) assert len(combined[combined['asph'].notna()])==len(asph) assert len(combined[combined['scaled_re'].notna()])==len(scaled_re) assert len(combined[combined['scaled_rg'].notna()])==len(scaled_rg) assert len(combined[combined['scaling_exp'].notna()])==len(scaling_exp) log_update("\tSequences with values for each property:") for property in ['asph','scaled_re','scaled_rg','scaling_exp']: log_update(f"\t\t{property}: {len(combined[combined[property].notna()])}") log_update(f"\nPreview of combined database with columns: {combined.columns}\n{combined.head(10)}") return combined def main(): with open_logfile("data_cleaning_log.txt"): # Read in all of the raw data raw_data_folder = 'raw_data' dtype_dict = {0:str,1:str,2:float} rename_dict = {0:'ID',1:'Sequence',2:'Value'} # Read in the test data asph_test = pd.read_csv(f"{raw_data_folder}/asph_nat_meth_test.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict) scaled_re_test = pd.read_csv(f"{raw_data_folder}/scaled_re_nat_meth_test.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict) scaled_rg_test = pd.read_csv(f"{raw_data_folder}/scaled_rg_nat_meth_test.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict) scaling_exp_test = pd.read_csv(f"{raw_data_folder}/scaling_exp_nat_meth_test.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict) # Read in the train data asph_train = pd.read_csv(f"{raw_data_folder}/asph_bio_synth_training_data_cleaned_05_09_2023.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict) scaled_re_train = pd.read_csv(f"{raw_data_folder}/scaled_re_bio_synth_training_data_cleaned_05_09_2023.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict) scaled_rg_train = pd.read_csv(f"{raw_data_folder}/scaled_rg_bio_synth_training_data_cleaned_05_09_2023.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict) scaling_exp_train = pd.read_csv(f"{raw_data_folder}/scaling_exp_bio_synth_training_data_cleaned_05_09_2023.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict) # Concatenate - include columns for split asph_test['Split'] = ['Test']*len(asph_test) scaled_re_test['Split'] = ['Test']*len(scaled_re_test) scaled_rg_test['Split'] = ['Test']*len(scaled_rg_test) scaling_exp_test['Split'] = ['Test']*len(scaling_exp_test) asph_train['Split'] = ['Train']*len(asph_train) scaled_re_train['Split'] = ['Train']*len(scaled_re_train) scaled_rg_train['Split'] = ['Train']*len(scaled_rg_train) scaling_exp_train['Split'] = ['Train']*len(scaling_exp_train) asph = pd.concat([asph_test, asph_train]) scaled_re = pd.concat([scaled_re_test, scaled_re_train]) scaled_rg = pd.concat([scaled_rg_test, scaled_rg_train]) scaling_exp = pd.concat([scaling_exp_test, scaling_exp_train]) log_update("Initial counts:") log_update(f"\tAsphericity: total entries={len(asph)}, not nan entries={len(asph.loc[asph['Value'].notna()])}") log_update(f"\tScaled re: total entries={len(scaled_re)}, not nan entries={len(scaled_re.loc[scaled_re['Value'].notna()])}") log_update(f"\tScaled rg: total entries={len(scaled_rg)}, not nan entries={len(scaled_rg.loc[scaled_rg['Value'].notna()])}") # change any scaled_rg rows with values less than 1 to np.nan, as done in the paper scaled_rg = scaled_rg.loc[ scaled_rg['Value']>=1].reset_index(drop=True) log_update(f"\t\tAfter dropping Rg values < 1: total entries={len(scaled_rg)}") log_update(f"\tScaling exp: total entries={len(scaling_exp)}, not nan entries={len(scaling_exp.loc[scaling_exp['Value'].notna()])}") # Process the raw data log_update(f"Example raw download: asphericity\n{asph.head()}") log_update(f"\nCleaning Asphericity") asph = process_raw_albatross(asph) log_update(f"\nProcessed data: asphericity\n{asph.head()}") log_update(f"\nCleaning Scaled Re") scaled_re = process_raw_albatross(scaled_re) log_update(f"\nProcessed data: scaled re\n{scaled_re.head()}") log_update(f"\nCleaning Scaled Rg") scaled_rg = process_raw_albatross(scaled_rg) log_update(f"\nProcessed data: scaled rg\n{scaled_rg.head()}") log_update(f"\nCleaning Scaling Exp") scaling_exp = process_raw_albatross(scaling_exp) log_update(f"\nProcessed data: scaling exp\n{scaling_exp.head()}") # Give some stats about each dataset log_update("\nStats:") log_update(f"# Asphericity sequences: {len(asph)}\n\tRange: {min(asph['Value']):.4f}-{max(asph['Value']):.4f}") log_update(f"# Scaled Re sequences: {len(scaled_re)}\n\tRange: {min(scaled_re['Value']):.4f}-{max(scaled_re['Value']):.4f}") log_update(f"# Scaled Rg sequences: {len(scaled_rg)}\n\tRange: {min(scaled_rg['Value']):.4f}-{max(scaled_rg['Value']):.4f}") log_update(f"# Scaling Exponent sequences: {len(scaling_exp)}\n\tRange: {min(scaling_exp['Value']):.4f}-{max(scaling_exp['Value']):.4f}") # Combine combined = combine_albatross_seqs(asph, scaled_re, scaled_rg, scaling_exp) # Save processed data proc_folder = "processed_data" os.makedirs(proc_folder,exist_ok=True) combined.to_csv(f"{proc_folder}/all_albatross_seqs_and_properties.csv",index=False) # Plot the data distribution and save it values_dict = { 'Asphericity': asph['Value'].tolist(), 'End-to-End Distance (Re)': scaled_re['Value'].tolist(), 'Radius of Gyration (Rg)': scaled_rg['Value'].tolist(), 'Scaling Exponent': scaling_exp['Value'].tolist() } train_test_values_dict = { 'Asphericity': { 'train': asph[asph['Split']=='Train']['Value'].tolist(), 'test': asph[asph['Split']=='Test']['Value'].tolist()}, 'End-to-End Distance (Re)': { 'train': scaled_re[scaled_re['Split']=='Train']['Value'].tolist(), 'test': scaled_re[scaled_re['Split']=='Test']['Value'].tolist()}, 'Radius of Gyration (Rg)': { 'train': scaled_rg[scaled_rg['Split']=='Train']['Value'].tolist(), 'test': scaled_rg[scaled_rg['Split']=='Test']['Value'].tolist()}, 'Scaling Exponent': { 'train': scaling_exp[scaling_exp['Split']=='Train']['Value'].tolist(), 'test': scaling_exp[scaling_exp['Split']=='Test']['Value'].tolist()}, } plot_all_values_hist_grid(values_dict, save_path="processed_data/value_histograms.png") plot_all_train_val_test_values_hist_grid(train_test_values_dict, save_path="processed_data/train_test_value_histograms.png") if __name__ == "__main__": main()