|
|
|
from fuson_plm.utils.logging import open_logfile, log_update |
|
from fuson_plm.utils.data_cleaning import find_invalid_chars |
|
from fuson_plm.utils.constants import VALID_AAS |
|
import pandas as pd |
|
import numpy as np |
|
import pickle |
|
|
|
def find_localization(row):
    """
    Decide where puncta were observed for one row of the puncta table.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing the keys
            'Puncta_Status', 'Cytoplasm' and 'Nucleus'.

    Returns:
        'Both', 'Cytoplasm' or 'Nucleus' when 'Puncta_Status' is 'YES' and the
        corresponding compartment value(s) equal 'Punctate'; np.nan otherwise
        (status is not 'YES', or neither compartment is 'Punctate').
    """
    # Read every field up front so a missing key fails regardless of status.
    status = row['Puncta_Status']
    in_cytoplasm = row['Cytoplasm'] == 'Punctate'
    in_nucleus = row['Nucleus'] == 'Punctate'

    if status != 'YES':
        return np.nan
    if in_cytoplasm and in_nucleus:
        return 'Both'
    if in_cytoplasm:
        return 'Cytoplasm'
    if in_nucleus:
        return 'Nucleus'
    # Puncta reported, but neither compartment is marked 'Punctate'.
    return np.nan
|
|
|
def clean_s5(df):
    """
    Extract the names of the low-MI features flagged as used in the ML model.

    Args:
        df: DataFrame with the columns 'Low MI Set: Used In ML Model' and
            'Parameter Label (Sup Table 2 & Matlab Scripts)'.

    Returns:
        Alphabetically sorted list of parameter labels whose usage flag is
        'Yes' or 'Yet'. The list is also written to the log, numbered from 1.
    """
    log_update("Cleaning FOdb Supplementary Table 5")

    # NOTE(review): 'Yet' is accepted alongside 'Yes' -- presumably to tolerate
    # a typo in the source table; confirm against the raw data.
    usage_flags = df['Low MI Set: Used In ML Model']
    is_used = usage_flags.isin(['Yes', 'Yet'])
    labels = df.loc[is_used, 'Parameter Label (Sup Table 2 & Matlab Scripts)']
    retained_features = sorted(labels.tolist())

    log_update(f'\tIsolated the {len(retained_features)} low-MI features used to train ML model')
    for position, name in enumerate(retained_features, start=1):
        log_update(f'\t\t{position}. {name}')

    return retained_features
|
|
|
def make_label_df(df):
    """
    Build the train/test label table from the cleaned s4 DataFrame.

    Args:
        df: cleaned s4 table with the columns 'FO_Name', 'AAseq',
            'Localization', 'Puncta_Status' and 'Dataset'.

    Returns:
        New DataFrame with columns, in order: 'fusiongene', 'aa_seq',
        'dataset', 'split', 'nucleus', 'cytoplasm', 'formation'.
        'split' is 'train' for 'Expressed_Set' and 'test' for
        'Verification_Set' (any other dataset value raises KeyError); the
        last three columns are 0/1 flags.
    """
    split_by_dataset = {'Expressed_Set': 'train', 'Verification_Set': 'test'}
    renamed_columns = {'FO_Name': 'fusiongene', 'AAseq': 'aa_seq', 'Dataset': 'dataset'}
    output_columns = ['fusiongene', 'aa_seq', 'dataset', 'split', 'nucleus', 'cytoplasm', 'formation']

    def to_split(dataset_name):
        # Plain indexing on purpose: an unknown dataset name must raise.
        return split_by_dataset[dataset_name]

    def in_nucleus(localization):
        return 1 if localization in ['Nucleus', 'Both'] else 0

    def in_cytoplasm(localization):
        return 1 if localization in ['Cytoplasm', 'Both'] else 0

    def forms_puncta(status):
        return 1 if status == 'YES' else 0

    source_columns = ['FO_Name', 'AAseq', 'Localization', 'Puncta_Status', 'Dataset']
    labels = df[source_columns].rename(columns=renamed_columns)

    labels['split'] = labels['dataset'].apply(to_split)
    labels['nucleus'] = labels['Localization'].apply(in_nucleus)
    labels['cytoplasm'] = labels['Localization'].apply(in_cytoplasm)
    labels['formation'] = labels['Puncta_Status'].apply(forms_puncta)

    return labels[output_columns]
|
|
|
def make_embeddings(df, physicochemical_features):
    """
    Map each unique amino-acid sequence to its physicochemical feature vector.

    Args:
        df: DataFrame with an 'AAseq' column plus one column per entry of
            `physicochemical_features`.
        physicochemical_features: list of column names; its order defines the
            order of the values inside every vector.

    Returns:
        dict {sequence: list of feature values}. When a sequence occurs in
        several rows, the values of its first row are used. Keys follow the
        order of first appearance in `df`.
    """
    feat_string = '\n\t' + '\n\t'.join(
        f'{i}. {feat}' for i, feat in enumerate(physicochemical_features)
    )
    log_update(f"\nMaking physicochemical feature vectors.\nFeature Order: {feat_string}")

    # One pass instead of re-filtering the whole frame for every unique
    # sequence: keep the first row per sequence, then read all vectors at once.
    first_rows = df.drop_duplicates(subset='AAseq', keep='first')
    vectors = first_rows[physicochemical_features].values.tolist()

    return dict(zip(first_rows['AAseq'], vectors))
|
|
|
def _log_count_percentages(counts, total, indent):
    """Log each entry of a value_counts Series as '<label>: <count> (<pct>%)', prefixed by `indent`."""
    for label, count in counts.items():
        log_update(f'{indent}{label}: \t{count} ({100*count/total:.2f}%)')


def _log_dataset_breakdown(df, puncta_positive, dataset, n_in_dataset):
    """Log the puncta-status counts of one dataset and the localizations of its YES rows."""
    status_counts = df.loc[df['Dataset'] == dataset, 'Puncta_Status'].value_counts()
    positive_loc_counts = puncta_positive.loc[
        puncta_positive['Dataset'] == dataset, 'Localization'
    ].value_counts()

    for status, n_status in status_counts.items():
        log_update(f'\t\t\t{status}: \t{n_status} ({100*n_status/n_in_dataset:.2f}%)')
        if status == 'YES':
            log_update('\t\t\t\tLocalizations...')
            # Percentages are relative to all YES rows of this dataset.
            _log_count_percentages(positive_loc_counts, n_status, '\t\t\t\t\t')


def clean_s4(df, retained_features):
    """
    Clean the puncta table (logged as "FOdb Supplementary Table 4").

    Steps, in order:
        1. keep only rows whose 'Puncta_Status' is 'YES' or 'NO';
        2. log the number of duplicated sequences and every invalid character
           reported by find_invalid_chars;
        3. strip '-' from all sequences;
        4. rewrite 'FO_Name' by replacing '_' with '::';
        5. add a 'Localization' column computed by find_localization;
        6. log NaN counts per retained feature and dataset/localization
           statistics;
        7. restrict the columns to the fixed metadata columns plus
           `retained_features`.

    Args:
        df: raw table; the caller's object is not modified.
        retained_features: list of feature column names to keep.

    Returns:
        Cleaned DataFrame with columns 'FO_Name', 'Nucleus', 'Nucleolus',
        'Cytoplasm', 'Puncta_Status', 'Dataset', 'Localization', 'AAseq',
        'Puncta.pred', 'Puncta.prob' followed by `retained_features`.

    Raises:
        KeyError: if any retained feature is not a column of `df`.
    """
    log_update("Cleaning FOdb Supplementary Table 4")
    # reset_index returns a new frame, so later column assignments never touch
    # the caller's DataFrame.
    df = df.loc[df['Puncta_Status'].isin(['YES', 'NO'])].reset_index(drop=True)
    log_update(f'\tRemoved invalid FOs (puncta status = "Other" or "Nucleolar"). Remaining FOs: {len(df)}')

    # Duplicates are only reported here, not removed.
    dup_seqs = df.loc[df['AAseq'].duplicated()]['AAseq'].unique()
    log_update(f"\tTotal duplicated sequences: {len(dup_seqs)}")

    # Report invalid characters.
    invalid_per_seq = df['AAseq'].apply(lambda x: find_invalid_chars(x, VALID_AAS))
    all_invalid_chars = set().union(*invalid_per_seq)
    log_update(f"\tChecking for invalid characters...\n\t\tFound {len(all_invalid_chars)} invalid characters")
    # Sorted for a reproducible log order. regex=False matches the character
    # literally, so symbols such as '*' or '.' are not parsed as a pattern.
    for c in sorted(all_invalid_chars):
        affected = df.loc[df['AAseq'].str.contains(c, regex=False), 'AAseq'].tolist()
        for seq in affected:
            log_update(f"\t\tInvalid char {c} at index {seq.index(c)}/{len(seq)-1} of sequence {seq}")

    # Strip '-' from every sequence. The previous implementation used
    # Series.item(), which raises unless exactly one sequence contains '-'.
    df['AAseq'] = df['AAseq'].str.replace('-', '', regex=False)

    df['FO_Name'] = df['FO_Name'].apply(lambda x: x.replace('_', '::'))
    log_update(f'\tChanged FO names to Head::Tail format')

    df['Localization'] = df.apply(find_localization, axis=1)
    puncta_positive = df.loc[df['Puncta_Status'] == 'YES'].reset_index(drop=True)

    # Check that every retained feature is available before using it.
    mi_feats_included = set(retained_features).intersection(df.columns)
    log_update(f"\tChecking for the {len(retained_features)} low-MI features... {len(mi_feats_included)} found")
    missing_feats = [rf for rf in retained_features if rf not in df.columns]
    if missing_feats:
        raise KeyError(f"Retained features missing from the table: {missing_feats}")

    for rf in retained_features:
        na_rows = df.loc[df[rf].isna()]
        if len(na_rows) > 0:
            log_update(f"\t\tFeature {rf} has {len(na_rows)} np.nan values in the following datasets:")
            for dataset, count in na_rows['Dataset'].value_counts().items():
                # Routed through log_update (was print) like every other message.
                log_update(f'\t\t\t{dataset}: {count}')

    df = df[['FO_Name', 'Nucleus', 'Nucleolus', 'Cytoplasm', 'Puncta_Status', 'Dataset', 'Localization', 'AAseq',
             'Puncta.pred', 'Puncta.prob'] + retained_features]

    # Localization summary over all puncta-positive rows. value_counts skips
    # NaN, so percentages are relative to rows with a known localization.
    log_update(f'\n\tPuncta localization for {len(puncta_positive)} FOs where Puncta_Status==YES')
    localization_counts = puncta_positive['Localization'].value_counts()
    _log_count_percentages(localization_counts, localization_counts.sum(), '\t\t')

    log_update("\tDataset breakdown...")
    dataset_vc = df['Dataset'].value_counts()
    n_rows = dataset_vc.sum()
    for dataset, n_in_dataset in dataset_vc.items():
        log_update(f'\t\t{dataset}: \t{n_in_dataset} ({100*n_in_dataset/n_rows:.2f}%)')
        # Only these two datasets get a detailed per-status breakdown.
        if dataset in ('Expressed_Set', 'Verification_Set'):
            _log_dataset_breakdown(df, puncta_positive, dataset, n_in_dataset)

    return df
|
|
|
def main():
    """
    Run the full cleaning pipeline and write its three output files.

    Reads the two raw CSV tables, derives the retained feature list
    (clean_s5), cleans the puncta table (clean_s4), builds the label/split
    table and the per-sequence feature vectors, then saves:
        - cleaned_dataset_s4.csv: the cleaned puncta table,
        - splits.csv: the label/split table,
        - FOdb_physicochemical_embeddings.pkl: pickled {sequence: vector} dict.
    All paths are relative to the current working directory.
    """
    LOG_PATH = 'cleaning_log.txt'
    FODB_S4_PATH = '../../data/raw_data/FOdb_puncta.csv'
    FODB_S5_PATH = '../../data/raw_data/FOdb_SD5.csv'
    CLEANED_S4_PATH = 'cleaned_dataset_s4.csv'
    SPLITS_PATH = 'splits.csv'
    EMBEDDINGS_PATH = 'FOdb_physicochemical_embeddings.pkl'

    # Everything runs inside the logfile context because the helpers below
    # report their progress through log_update.
    with open_logfile(LOG_PATH):
        s4 = pd.read_csv(FODB_S4_PATH)
        s5 = pd.read_csv(FODB_S5_PATH)

        retained_features = clean_s5(s5)
        cleaned_s4 = clean_s4(s4, retained_features)

        label_df = make_label_df(cleaned_s4)
        embeddings = make_embeddings(cleaned_s4, retained_features)

        cleaned_s4.to_csv(CLEANED_S4_PATH, index=False)
        # The message previously said "S5" although the saved table is S4.
        log_update(f"\nSaved cleaned table S4 to {CLEANED_S4_PATH}")

        label_df.to_csv(SPLITS_PATH, index=False)
        log_update(f"\nSaved train-test splits with nucleus, cytoplasm, and formation labels to {SPLITS_PATH}")

        with open(EMBEDDINGS_PATH, 'wb') as f:
            pickle.dump(embeddings, f)
        log_update(f"\nSaved physicochemical embeddings as a dictionary to {EMBEDDINGS_PATH}")
|
|
|
# Run the cleaning pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()