File size: 8,454 Bytes
8d9d9da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# Cleans raw data to prepare FO labels and embeddings
from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.utils.data_cleaning import find_invalid_chars
from fuson_plm.utils.constants import VALID_AAS
import pandas as pd
import numpy as np
import pickle
def find_localization(row):
    """Return the punctum localization label for one FO row.

    Only rows with Puncta_Status=='YES' get a label: 'Both' if both the
    cytoplasm and nucleus columns read 'Punctate', otherwise whichever single
    compartment is punctate. Everything else (including non-punctate positives)
    maps to np.nan.
    """
    if row['Puncta_Status'] != 'YES':
        return np.nan
    in_cytoplasm = row['Cytoplasm'] == 'Punctate'
    in_nucleus = row['Nucleus'] == 'Punctate'
    if in_cytoplasm and in_nucleus:
        return 'Both'
    if in_cytoplasm:
        return 'Cytoplasm'
    if in_nucleus:
        return 'Nucleus'
    return np.nan
def clean_s5(df):
    """Extract the low-MI physicochemical feature names from FOdb Supp. Table 5.

    Keeps only rows flagged as used by the FO-Puncta ML model and returns
    their parameter labels, sorted alphabetically. Logs each retained feature.
    """
    log_update("Cleaning FOdb Supplementary Table 5")
    # 'Yet' is accepted alongside 'Yes' to tolerate a known typo in this column
    used_in_model = df['Low MI Set: Used In ML Model'].isin(['Yes', 'Yet'])
    retained_features = sorted(
        df.loc[used_in_model, 'Parameter Label (Sup Table 2 & Matlab Scripts)'].tolist()
    )
    log_update(f'\tIsolated the {len(retained_features)} low-MI features used to train ML model')
    for rank, feature in enumerate(retained_features, start=1):
        log_update(f'\t\t{rank}. {feature}')
    return retained_features
def make_label_df(df):
    """
    Build the label/split table from the cleaned Supplementary Table 4.

    Input df should be cleaned s4. Returns a DataFrame with columns
    [fusiongene, aa_seq, dataset, split, nucleus, cytoplasm, formation],
    where split is 'train' for the Expressed_Set and 'test' for the
    Verification_Set, and the last three columns are 0/1 labels.
    """
    dataset_to_split = {'Expressed_Set': 'train', 'Verification_Set': 'test'}
    label_df = df[['FO_Name', 'AAseq', 'Localization', 'Puncta_Status', 'Dataset']].rename(
        columns={'FO_Name': 'fusiongene', 'AAseq': 'aa_seq', 'Dataset': 'dataset'}
    )
    # KeyError on an unknown dataset name is intentional (fail fast)
    label_df['split'] = label_df['dataset'].apply(lambda d: dataset_to_split[d])
    # NaN localizations fall through isin() to False -> 0, matching the negatives
    label_df['nucleus'] = label_df['Localization'].isin(['Nucleus', 'Both']).astype(int)
    label_df['cytoplasm'] = label_df['Localization'].isin(['Cytoplasm', 'Both']).astype(int)
    label_df['formation'] = (label_df['Puncta_Status'] == 'YES').astype(int)
    return label_df[['fusiongene', 'aa_seq', 'dataset', 'split', 'nucleus', 'cytoplasm', 'formation']]
def make_embeddings(df, physicochemical_features):
    """Map each unique AA sequence to its physicochemical feature vector.

    Args:
        df: cleaned Supplementary Table 4 with an 'AAseq' column and one
            column per feature in physicochemical_features.
        physicochemical_features: feature column names; this order defines
            the order of values in each embedding vector.

    Returns:
        dict of {aa_seq: list of feature values}. For duplicated sequences,
        the first matching row's values are used.
    """
    # Log the fixed feature order; numbering is 1-based for consistency with clean_s5
    feat_string = '\n\t' + '\n\t'.join(
        f'{i}. {feat}' for i, feat in enumerate(physicochemical_features, start=1)
    )
    log_update(f"\nMaking physicochemical feature vectors.\nFeature Order: {feat_string}")
    embeddings = {}
    for seq in df['AAseq'].unique():
        # .iloc[0] takes the first matching row (replaces the opaque .T[0] transpose trick)
        embeddings[seq] = df.loc[df['AAseq'] == seq, physicochemical_features].iloc[0].tolist()
    return embeddings
def _log_dataset_breakdown(df, puncta_positive):
    """Log per-dataset FO counts, their puncta-status split, and (for positives) localization."""
    dataset_vc = df['Dataset'].value_counts()
    for dataset, n in dataset_vc.items():
        pcnt = 100 * n / dataset_vc.sum()
        log_update(f'\t\t{dataset}: \t{n} ({pcnt:.2f}%)')
        if dataset not in ('Expressed_Set', 'Verification_Set'):
            continue
        statuses = df.loc[df['Dataset'] == dataset]['Puncta_Status'].value_counts()
        positive_locs = puncta_positive.loc[
            puncta_positive['Dataset'] == dataset
        ]['Localization'].value_counts()
        for status, count in statuses.items():
            pcnt = 100 * count / n
            log_update(f'\t\t\t{status}: \t{count} ({pcnt:.2f}%)')
            if status == 'YES':
                log_update('\t\t\t\tLocalizations...')
                for loc, loc_count in positive_locs.items():
                    pcnt = 100 * loc_count / count
                    log_update(f'\t\t\t\t\t{loc}: \t{loc_count} ({pcnt:.2f}%)')

def clean_s4(df, retained_features):
    """Clean FOdb Supplementary Table 4.

    Steps: keep only FOs with a definitive puncta call (YES/NO), report
    duplicate sequences and invalid characters, strip '-' from sequences,
    reformat FO names to Head::Tail, derive a Localization label per row,
    verify the retained low-MI features are present (NaNs are logged), and
    subset to the metadata + retained feature columns.

    Args:
        df: raw Supplementary Table 4 DataFrame.
        retained_features: low-MI feature column names from clean_s5.

    Returns:
        Cleaned DataFrame with metadata columns followed by retained_features.
    """
    log_update("Cleaning FOdb Supplementary Table 4")
    # Drop FOs whose status is neither YES nor NO (e.g. "Other", "Nucleolar")
    df = df.loc[
        df['Puncta_Status'].isin(['YES', 'NO'])
    ].reset_index(drop=True)
    log_update(f'\tRemoved invalid FOs (puncta status = "Other" or "Nucleolar"). Remaining FOs: {len(df)}')
    # check for duplicate sequences (logged only; duplicates are kept)
    dup_seqs = df.loc[df['AAseq'].duplicated()]['AAseq'].unique()
    log_update(f"\tTotal duplicated sequences: {len(dup_seqs)}")
    # check for invalid characters
    df['invalid_chars'] = df['AAseq'].apply(lambda x: find_invalid_chars(x, VALID_AAS))
    all_invalid_chars = set().union(*df['invalid_chars'])
    log_update(f"\tChecking for invalid characters...\n\t\tFound {len(all_invalid_chars)} invalid characters")
    for c in all_invalid_chars:
        # regex=False: the invalid char must be matched literally, not as a regex
        subset = df.loc[df['AAseq'].str.contains(c, regex=False)]['AAseq'].tolist()
        for seq in subset:
            log_update(f"\t\tInvalid char {c} at index {seq.index(c)}/{len(seq)-1} of sequence {seq}")
    df = df.drop(columns=['invalid_chars'])
    # Strip "-" from every sequence containing it. Vectorized replacement:
    # the previous .item() call assumed exactly one such sequence and raised otherwise.
    dash_mask = df['AAseq'].str.contains('-', regex=False)
    df.loc[dash_mask, 'AAseq'] = df.loc[dash_mask, 'AAseq'].str.replace('-', '', regex=False)
    # change FO format to ::
    df['FO_Name'] = df['FO_Name'].apply(lambda x: x.replace('_', '::'))
    log_update('\tChanged FO names to Head::Tail format')
    # Derive localization labels; only YES-status rows get a non-NaN value
    df['Localization'] = df.apply(find_localization, axis=1)
    puncta_positive = df.loc[
        df['Puncta_Status'] == 'YES'
    ].reset_index(drop=True)
    # Only keeping retained features
    mi_feats_included = set(retained_features).intersection(set(df.columns))
    log_update(f"\tChecking for the {len(retained_features)} low-MI features... {len(mi_feats_included)} found")
    # make sure all of these are no-na
    for rf in retained_features:
        # if there's NaN, log it. Make sure the only instances of np.nan are for Verification Set FOs.
        if df[rf].isna().sum() > 0:
            nas = df.loc[df[rf].isna()]
            log_update(f"\t\tFeature {rf} has {len(nas)} np.nan values in the following datasets:")
            for k, v in nas['Dataset'].value_counts().items():
                # fixed: this previously used print(), bypassing the log file
                log_update(f'\t\t\t{k}: {v}')
    df = df[['FO_Name', 'Nucleus', 'Nucleolus', 'Cytoplasm', 'Puncta_Status', 'Dataset', 'Localization', 'AAseq',
             'Puncta.pred', 'Puncta.prob'] + retained_features]
    # Quantify localization among puncta-forming FOs
    log_update(f'\n\tPuncta localization for {len(puncta_positive)} FOs where Puncta_Status==YES')
    loc_vc = puncta_positive['Localization'].value_counts()
    for k, v in loc_vc.items():
        pcnt = 100 * v / loc_vc.sum()
        log_update(f'\t\t{k}: \t{v} ({pcnt:.2f}%)')
    log_update("\tDataset breakdown...")
    _log_dataset_breakdown(df, puncta_positive)
    return df
def main():
    """Run the full cleaning pipeline.

    Reads the raw FOdb tables, extracts the low-MI feature list (S5), cleans
    the FO table (S4), then writes three outputs to the working directory:
    cleaned_dataset_s4.csv, splits.csv, and FOdb_physicochemical_embeddings.pkl.
    All progress is logged to cleaning_log.txt.
    """
    LOG_PATH = 'cleaning_log.txt'
    FODB_S4_PATH = '../../data/raw_data/FOdb_puncta.csv'
    FODB_S5_PATH = '../../data/raw_data/FOdb_SD5.csv'
    with open_logfile(LOG_PATH):
        s4 = pd.read_csv(FODB_S4_PATH)
        s5 = pd.read_csv(FODB_S5_PATH)
        retained_features = clean_s5(s5)
        cleaned_s4 = clean_s4(s4, retained_features)
        label_df = make_label_df(cleaned_s4)
        embeddings = make_embeddings(cleaned_s4, retained_features)
        # save the results
        cleaned_s4.to_csv('cleaned_dataset_s4.csv', index=False)
        # fixed: message previously said "table S5" but this is the cleaned S4 table
        log_update("\nSaved cleaned table S4 to cleaned_dataset_s4.csv")
        label_df.to_csv('splits.csv', index=False)
        log_update("\nSaved train-test splits with nucleus, cytoplasm, and formation labels to splits.csv")
        with open('FOdb_physicochemical_embeddings.pkl', 'wb') as f:
            pickle.dump(embeddings, f)
        log_update("\nSaved physicochemical embeddings as a dictionary to FOdb_physicochemical_embeddings.pkl")
# Script entry point (removed a stray trailing "|" artifact that was a syntax error)
if __name__ == '__main__':
    main()