File size: 8,454 Bytes
8d9d9da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# Cleans raw data to prepare FO labels and embeddings
from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.utils.data_cleaning import find_invalid_chars
from fuson_plm.utils.constants import VALID_AAS
import pandas as pd
import numpy as np
import pickle

def find_localization(row):
    """Derive a localization label from a row's puncta annotations.

    Returns 'Both', 'Cytoplasm', or 'Nucleus' for puncta-positive rows
    (Puncta_Status == 'YES'); np.nan otherwise, including YES rows where
    neither compartment is marked 'Punctate'.
    """
    if row['Puncta_Status'] != 'YES':
        return np.nan

    in_cytoplasm = row['Cytoplasm'] == 'Punctate'
    in_nucleus = row['Nucleus'] == 'Punctate'

    if in_cytoplasm and in_nucleus:
        return 'Both'
    if in_cytoplasm:
        return 'Cytoplasm'
    if in_nucleus:
        return 'Nucleus'
    return np.nan

def clean_s5(df):
    """Extract, from FOdb Supplementary Table 5, the sorted list of low-MI
    physicochemical features that the FO-Puncta ML model was trained on.
    """
    log_update("Cleaning FOdb Supplementary Table 5")

    # Rows flagged as belonging to the low-MI set used by the ML model.
    # 'Yet' is accepted alongside 'Yes' to tolerate a known typo in this table.
    in_model = df['Low MI Set: Used In ML Model'].isin(['Yes','Yet'])
    retained_features = sorted(
        df.loc[in_model, 'Parameter Label (Sup Table 2 & Matlab Scripts)'].tolist()
    )

    # Log the result (1-based numbering)
    log_update(f'\tIsolated the {len(retained_features)} low-MI features used to train ML model')
    for i, feat in enumerate(retained_features):
        log_update(f'\t\t{i+1}. {feat}')

    return retained_features

def make_label_df(df):
    """Build the label/split table from a cleaned Supplementary Table 4.

    Input df should be cleaned s4. Returns a DataFrame with columns:
    fusiongene, aa_seq, dataset, split, nucleus, cytoplasm, formation.
    """
    labels = df[['FO_Name','AAseq','Localization','Puncta_Status','Dataset']].rename(
        columns={'FO_Name': 'fusiongene', 'AAseq': 'aa_seq', 'Dataset': 'dataset'}
    )

    # Expressed_Set FOs train the model; Verification_Set FOs are held out for testing
    split_map = {'Expressed_Set': 'train', 'Verification_Set': 'test'}
    labels['split'] = labels['dataset'].apply(lambda d: split_map[d])

    # Binary localization / formation labels
    labels['nucleus'] = labels['Localization'].apply(lambda loc: int(loc in ('Nucleus','Both')))
    labels['cytoplasm'] = labels['Localization'].apply(lambda loc: int(loc in ('Cytoplasm','Both')))
    labels['formation'] = labels['Puncta_Status'].apply(lambda status: int(status == 'YES'))

    return labels[['fusiongene','aa_seq','dataset','split','nucleus','cytoplasm','formation']]

def make_embeddings(df, physicochemical_features):
    """Build a {sequence: feature-vector} dictionary of physicochemical embeddings.

    Parameters:
        df: cleaned s4 DataFrame containing 'AAseq' plus the feature columns.
        physicochemical_features: ordered list of feature column names; the
            returned vectors follow this order.

    Returns:
        dict mapping each unique amino-acid sequence to a list of its feature
        values (taken from the first row matching that sequence).
    """
    feat_string = '\n\t' + '\n\t'.join([str(i)+'. '+feat for i,feat in enumerate(physicochemical_features)])
    # Fixed typo in log message ("phyisochemical" -> "physicochemical")
    log_update(f"\nMaking physicochemical feature vectors.\nFeature Order: {feat_string}")
    embeddings = {}
    for seq in df['AAseq'].unique():
        # First matching row's feature values, in the requested column order
        # (replaces the original .reset_index().T[0] round-trip with .iloc[0])
        embeddings[seq] = df.loc[df['AAseq'] == seq, physicochemical_features].iloc[0].tolist()

    return embeddings

def clean_s4(df, retained_features):
    """Clean FOdb Supplementary Table 4 (per-FO puncta annotations).

    Parameters:
        df: raw S4 DataFrame.
        retained_features: low-MI feature column names (from clean_s5) to keep.

    Returns:
        DataFrame restricted to FOs with a definitive puncta call (YES/NO),
        with a derived 'Localization' column, Head::Tail FO names, and only
        the annotation columns plus the retained features.
    """
    log_update("Cleaning FOdb Supplementary Table 4")
    # Keep only FOs with a definitive puncta call
    df = df.loc[
        df['Puncta_Status'].isin(['YES','NO'])
    ].reset_index(drop=True)
    log_update(f'\tRemoved invalid FOs (puncta status = "Other" or "Nucleolar"). Remaining FOs: {len(df)}')
    
    # check for duplicate sequences
    dup_seqs = df.loc[df['AAseq'].duplicated()]['AAseq'].unique()
    log_update(f"\tTotal duplicated sequences: {len(dup_seqs)}")
    
    # check for invalid characters (anything outside VALID_AAS)
    df['invalid_chars'] = df['AAseq'].apply(lambda x: find_invalid_chars(x, VALID_AAS))
    all_invalid_chars = set().union(*df['invalid_chars'])
    log_update(f"\tChecking for invalid characters...\n\t\tFound {len(all_invalid_chars)} invalid characters")
    for c in all_invalid_chars:
        subset = df.loc[df['AAseq'].str.contains(c)]['AAseq'].tolist()
        for seq in subset:
            log_update(f"\t\tInvalid char {c} at index {seq.index(c)}/{len(seq)-1} of sequence {seq}")
    # going to just remove the "-" from the special sequence
    # NOTE(review): .item() assumes exactly one sequence contains '-'; confirm this holds if raw data changes
    df = df.drop(columns=['invalid_chars'])
    df.loc[
        df['AAseq'].str.contains('-'),'AAseq'
    ] = df.loc[df['AAseq'].str.contains('-'),'AAseq'].item().replace('-','')
    
    # change FO format to ::
    df['FO_Name'] = df['FO_Name'].apply(lambda x: x.replace('_','::'))
    log_update(f'\tChanged FO names to Head::Tail format')
    
    # Derive localization and isolate the puncta-positive set
    # (removed a dead `df['Localization'] = ['']*len(df)` that was immediately overwritten,
    #  and an unused `puncta_negative` local)
    df['Localization'] = df.apply(find_localization, axis=1)
    puncta_positive = df.loc[
        df['Puncta_Status']=='YES'
    ].reset_index(drop=True)
    
    # Only keeping retained features
    mi_feats_included = set(retained_features).intersection(set(df.columns))
    log_update(f"\tChecking for the {len(retained_features)} low-MI features... {len(mi_feats_included)} found")
    # make sure all of these are no-na
    for rf in retained_features:
        # if there's NaN, log it. Make sure the only instances of np.nan are for Verification Set FOs. 
        if df[rf].isna().sum()>0: 
            nas = df.loc[df[rf].isna()]
            log_update(f"\t\tFeature {rf} has {len(nas)} np.nan values in the following datasets:")
            for k,v in nas['Dataset'].value_counts().items():
                # bug fix: was print(), which bypassed the log file used everywhere else
                log_update(f'\t\t\t{k}: {v}')
    
    df = df[['FO_Name', 'Nucleus', 'Nucleolus', 'Cytoplasm','Puncta_Status', 'Dataset', 'Localization', 'AAseq', 
             'Puncta.pred', 'Puncta.prob']+retained_features]
    
    # Quantify localization among puncta-positive FOs
    log_update(f'\n\tPuncta localization for {len(puncta_positive)} FOs where Puncta_Status==YES')
    for k, v in puncta_positive['Localization'].value_counts().items():
        pcnt = 100*v/sum(puncta_positive['Localization'].value_counts())
        log_update(f'\t\t{k}: \t{v} ({pcnt:.2f}%)')
        
    # Per-dataset breakdown: puncta statuses, and localizations of the positives.
    # The Expressed_Set and Verification_Set branches were duplicated; collapsed into one loop.
    log_update("\tDataset breakdown...")
    dataset_vc = df['Dataset'].value_counts()
    statuses_by_dataset = {
        ds: df.loc[df['Dataset']==ds]['Puncta_Status'].value_counts()
        for ds in ('Expressed_Set','Verification_Set')
    }
    locs_by_dataset = {
        ds: puncta_positive.loc[puncta_positive['Dataset']==ds]['Localization'].value_counts()
        for ds in ('Expressed_Set','Verification_Set')
    }
    for k, v in dataset_vc.items():
        pcnt = 100*v/sum(dataset_vc)
        log_update(f'\t\t{k}: \t{v} ({pcnt:.2f}%)')
        if k in statuses_by_dataset:
            for key, val in statuses_by_dataset[k].items():
                pcnt = 100*val/v
                log_update(f'\t\t\t{key}: \t{val} ({pcnt:.2f}%)')
                if key=='YES':
                    log_update('\t\t\t\tLocalizations...')
                    for key2, val2 in locs_by_dataset[k].items():
                        pcnt = 100*val2/val
                        log_update(f'\t\t\t\t\t{key2}: \t{val2} ({pcnt:.2f}%)')
    
    return df
    
def main():
    """Clean the raw FOdb tables and write the cleaned table, label splits,
    and physicochemical embeddings to disk, logging all steps."""
    LOG_PATH = 'cleaning_log.txt'
    FODB_S4_PATH = '../../data/raw_data/FOdb_puncta.csv'
    FODB_S5_PATH = '../../data/raw_data/FOdb_SD5.csv'
    
    with open_logfile(LOG_PATH):
        # Load both raw supplementary tables
        s4 = pd.read_csv(FODB_S4_PATH)
        s5 = pd.read_csv(FODB_S5_PATH)
        
        # S5 defines which low-MI features the ML model used; S4 holds the FO data
        retained_features = clean_s5(s5)
        cleaned_s4 = clean_s4(s4, retained_features)
        
        label_df = make_label_df(cleaned_s4)
        embeddings = make_embeddings(cleaned_s4, retained_features)
        
        # save the results
        cleaned_s4.to_csv('cleaned_dataset_s4.csv', index=False)
        # bug fix: message previously said "table S5" but the saved table is cleaned S4
        log_update("\nSaved cleaned table S4 to cleaned_dataset_s4.csv")
        
        label_df.to_csv('splits.csv', index=False)
        log_update("\nSaved train-test splits with nucleus, cytoplasm, and formation labels to splits.csv")
        
        with open('FOdb_physicochemical_embeddings.pkl','wb') as f:
            pickle.dump(embeddings, f)
        log_update("\nSaved physicochemical embeddings as a dictionary to FOdb_physicochemical_embeddings.pkl")
    
if __name__ == '__main__':
    main()