Fill-Mask
Transformers
Safetensors
esm
svincoff's picture
mutation prediction discovery and recovery
3efa812
### Clean the Salokas data, find TF and Kinase fusions in the test set
import pandas as pd
import os
def get_gene_type(gene, d):
    """Classify a gene as 'Kinase', 'TF', or 'Other'.

    Args:
        gene: Gene identifier to look up in ``d``.
        d: Mapping from gene identifier to a label. Only the labels
            ``'kinase'`` and ``'tf'`` are recognised.

    Returns:
        ``'Kinase'`` if ``d[gene] == 'kinase'``, ``'TF'`` if
        ``d[gene] == 'tf'``, and ``'Other'`` in every other case
        (gene missing from ``d``, or mapped to an unrecognised label).
    """
    # .get() returns None for a missing gene, which matches neither label
    # below, so every path ends in an explicit string rather than an
    # implicit None.
    label = d.get(gene)
    if label == 'kinase':
        return 'Kinase'
    if label == 'tf':
        return 'TF'
    return 'Other'
# Load TF and Kinase Fusions
def main():
    """Annotate fusion head/tail genes as kinase/TF and export test-set subsets.

    Reads:
        raw_data/salokas_2020_tableS3.csv   (columns 'Gene', 'Kinase or TF')
        ../../../data/fuson_db.csv          (column 'benchmark'; printed only)
        ../../../data/blast/fuson_ht_db.csv (columns 'fusiongenes', 'aa_seq', 'length')
        ../../../data/splits/test_df.csv    (column 'sequence')

    Writes:
        processed_data/test_seqs_tftf_kk.csv
        processed_data/domain_conservation_fusions_inputfile.csv
    """
    os.makedirs("processed_data", exist_ok=True)

    # Load the gene -> 'kinase'/'tf' annotation table
    tf_kinase_parts = pd.read_csv("raw_data/salokas_2020_tableS3.csv")
    print(tf_kinase_parts)
    ht_tf_kinase_dict = dict(zip(tf_kinase_parts['Gene'], tf_kinase_parts['Kinase or TF']))

    ## Inspect fuson_db (diagnostic output only; nothing below depends on it)
    fuson_db = pd.read_csv("../../../data/fuson_db.csv")
    print(fuson_db['benchmark'].value_counts())
    print(fuson_db.loc[fuson_db['benchmark'].notna()])
    print(fuson_db.columns)
    print(fuson_db)

    # This one has each row with one fusiongene name
    fuson_ht_db = pd.read_csv("../../../data/blast/fuson_ht_db.csv")
    print(fuson_ht_db.columns)
    print(fuson_ht_db)

    # Split "HEAD::TAIL" into separate head-gene / tail-gene columns
    fuson_ht_db[['hg', 'tg']] = fuson_ht_db['fusiongenes'].str.split("::", expand=True)
    print(fuson_ht_db.loc[fuson_ht_db['hg'] == 'PAX3'])
    print(fuson_ht_db)

    # Label each partner gene, then combine into e.g. "Kinase::TF"
    fuson_ht_db['hg_type'] = fuson_ht_db['hg'].apply(lambda x: get_gene_type(x, ht_tf_kinase_dict))
    fuson_ht_db['tg_type'] = fuson_ht_db['tg'].apply(lambda x: get_gene_type(x, ht_tf_kinase_dict))
    fuson_ht_db['fusion_type'] = fuson_ht_db['hg_type'] + '::' + fuson_ht_db['tg_type']
    fuson_ht_db['type'] = 'fusion'

    # Keep things in the test set
    test_set = pd.read_csv("../../../data/splits/test_df.csv")
    print(test_set.columns, len(test_set))
    in_test_set = fuson_ht_db['aa_seq'].isin(test_set['sequence'])
    fuson_ht_db = (
        fuson_ht_db.loc[in_test_set]
        .sort_values(by=['fusion_type'])
        .reset_index(drop=True)
    )
    fuson_ht_db.to_csv("processed_data/test_seqs_tftf_kk.csv", index=False)

    # Isolate a few fusions of interest and keep the longest sequence of each:
    # sorting by length descending within each fusion means drop_duplicates
    # (which keeps the first occurrence) retains the longest one.
    fusion_genes_of_interest = [
        "EWSR1::FLI1", "PAX3::FOXO1", "TRIM24::RET", "ETV6::NTRK3",
    ]
    df_of_interest = (
        fuson_ht_db.loc[fuson_ht_db['fusiongenes'].isin(fusion_genes_of_interest)]
        .sort_values(by=['fusiongenes', 'length'], ascending=[True, False])
        .drop_duplicates(subset='fusiongenes')
        .reset_index(drop=True)
    )

    # Build the input file for the downstream step
    # NOTE(review): the consumer of this file is not named here — presumably
    # the mutation-discovery script; confirm.
    # .copy() makes this an independent frame so the column assignments below
    # do not trigger pandas' SettingWithCopyWarning.
    discovery_input = df_of_interest[['fusiongenes', 'length', 'aa_seq']].copy()
    discovery_input['start_residue_index'] = 1
    discovery_input['n'] = 3
    discovery_input = discovery_input.rename(columns={
        'length': 'end_residue_index',
        'aa_seq': 'full_fusion_sequence',
        'fusiongenes': 'fusion_name',
    })
    output_columns = [
        'fusion_name', 'full_fusion_sequence',
        'start_residue_index', 'end_residue_index', 'n',
    ]
    discovery_input[output_columns].to_csv(
        "processed_data/domain_conservation_fusions_inputfile.csv", index=False
    )
    print(discovery_input)


if __name__ == "__main__":
    main()