|
|
|
import pandas as pd |
|
import os |
|
|
|
def get_gene_type(gene, d):
    """Classify *gene* as ``'Kinase'``, ``'TF'`` or ``'Other'``.

    Args:
        gene: Gene identifier to look up (any hashable value; a missing
            value such as ``None`` is simply treated as "not found").
        d: Mapping from gene identifier to a label. Only the labels
            ``'kinase'`` and ``'tf'`` are recognised.

    Returns:
        ``'Kinase'`` if ``d[gene] == 'kinase'``, ``'TF'`` if
        ``d[gene] == 'tf'``, and ``'Other'`` in every remaining case
        (gene absent from ``d`` or labelled with anything else).
    """
    # Single lookup; an absent gene yields None, which matches neither label.
    label = d.get(gene)
    if label == 'kinase':
        return 'Kinase'
    if label == 'tf':
        return 'TF'
    # Always fall through to a string so callers can safely concatenate the
    # result (a bare fall-off would return None).
    return 'Other'
|
|
|
|
|
def main():
    """Annotate test-set fusions by head/tail gene type and write two CSVs.

    Steps (all paths are relative to the current working directory):

    1. Load ``raw_data/salokas_2020_tableS3.csv`` and build a
       ``Gene -> 'Kinase or TF'`` lookup.
    2. Load ``../../../data/fuson_db.csv`` and print diagnostics about its
       ``benchmark`` column.
    3. Load ``../../../data/blast/fuson_ht_db.csv``, split ``fusiongenes``
       on ``"::"`` into head (``hg``) and tail (``tg``) genes, classify each
       with ``get_gene_type`` and combine them into ``fusion_type``.
    4. Keep only rows whose ``aa_seq`` appears in the ``sequence`` column of
       ``../../../data/splits/test_df.csv`` and write them to
       ``processed_data/test_seqs_tftf_kk.csv``.
    5. For four selected fusions, keep the longest sequence of each and
       write ``processed_data/domain_conservation_fusions_inputfile.csv``.

    Returns:
        None. Results are written to ``processed_data/`` and intermediate
        tables are printed to stdout.
    """
    os.makedirs("processed_data", exist_ok=True)

    # --- Gene -> 'kinase' / 'tf' label lookup -----------------------------
    tf_kinase_parts = pd.read_csv("raw_data/salokas_2020_tableS3.csv")
    print(tf_kinase_parts)
    ht_tf_kinase_dict = dict(
        zip(tf_kinase_parts['Gene'], tf_kinase_parts['Kinase or TF'])
    )

    # --- Full fusion database: diagnostics only ---------------------------
    fuson_db = pd.read_csv("../../../data/fuson_db.csv")
    print(fuson_db['benchmark'].value_counts())
    print(fuson_db.loc[fuson_db['benchmark'].notna()])
    print(fuson_db.columns)
    print(fuson_db)

    # --- Head/tail table: split fusion name and classify each partner -----
    fuson_ht_db = pd.read_csv("../../../data/blast/fuson_ht_db.csv")
    print(fuson_ht_db.columns)
    print(fuson_ht_db)
    fuson_ht_db[['hg', 'tg']] = fuson_ht_db['fusiongenes'].str.split("::", expand=True)
    print(fuson_ht_db.loc[fuson_ht_db['hg'] == 'PAX3'])
    print(fuson_ht_db)

    for side in ('hg', 'tg'):
        fuson_ht_db[f'{side}_type'] = fuson_ht_db[side].apply(
            lambda gene: get_gene_type(gene, ht_tf_kinase_dict)
        )
    fuson_ht_db['fusion_type'] = fuson_ht_db['hg_type'] + '::' + fuson_ht_db['tg_type']
    # Scalar assignment broadcasts to every row.
    fuson_ht_db['type'] = 'fusion'

    # --- Restrict to sequences present in the test split ------------------
    test_set = pd.read_csv("../../../data/splits/test_df.csv")
    print(test_set.columns, len(test_set))
    test_seqs = test_set['sequence'].tolist()
    fuson_ht_db = (
        fuson_ht_db.loc[fuson_ht_db['aa_seq'].isin(test_seqs)]
        .sort_values(by=['fusion_type'])
        .reset_index(drop=True)
    )
    fuson_ht_db.to_csv("processed_data/test_seqs_tftf_kk.csv", index=False)

    # --- Pick one (longest) sequence per fusion of interest ---------------
    fusion_genes_of_interest = [
        "EWSR1::FLI1", "PAX3::FOXO1", "TRIM24::RET", "ETV6::NTRK3",
    ]
    # Sorting by length descending within each fusion means drop_duplicates
    # (which keeps the first occurrence) retains the longest sequence.
    df_of_interest = (
        fuson_ht_db.loc[fuson_ht_db['fusiongenes'].isin(fusion_genes_of_interest)]
        .sort_values(by=['fusiongenes', 'length'], ascending=[True, False])
        .drop_duplicates(subset='fusiongenes')
        .reset_index(drop=True)
    )

    # --- Build the domain-conservation input file -------------------------
    # rename/assign return a new frame, so no columns are set on a slice of
    # df_of_interest (avoids pandas' SettingWithCopyWarning).
    discovery_input = (
        df_of_interest[['fusiongenes', 'length', 'aa_seq']]
        .rename(columns={
            'length': 'end_residue_index',
            'aa_seq': 'full_fusion_sequence',
            'fusiongenes': 'fusion_name',
        })
        .assign(start_residue_index=1, n=3)
    )
    output_columns = [
        'fusion_name',
        'full_fusion_sequence',
        'start_residue_index',
        'end_residue_index',
        'n',
    ]
    discovery_input[output_columns].to_csv(
        "processed_data/domain_conservation_fusions_inputfile.csv", index=False
    )
    print(discovery_input)
|
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()