'''
Process AlphaFold structures for FusionPDB fusion proteins and their head/tail
proteins: extract sequences, per-residue pLDDTs, and secondary structure
annotations from mmCIF files, then merge the results into cleaned CSVs.
'''

import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

import Bio.PDB as PDB
from Bio.PDB import MMCIFParser

from fuson_plm.utils.logging import log_update, open_logfile


class AlphaFoldStructure:
    '''
    Processes an mmCIF file, either uploaded directly or downloaded from the
    AlphaFold database, to provide comprehensive information: sequence,
    per-residue pLDDTs, and secondary structure annotations.
    '''
    def __init__(self, fold_path=None, uniprot_to_download=None, uniprot_output_dir=None, secondary_structure_types=None):
        self.file_path = None

        # If a local structure was provided, convert PDB input to mmCIF as needed.
        if fold_path is not None:
            fold_fname = fold_path.split('/')[-1]
            prefix, suffix = fold_fname.rsplit('.', 1)

            if suffix == 'pdb':
                conversion_path = 'mmcif_converted_files'
                if not os.path.exists(conversion_path):
                    os.makedirs(conversion_path)

                fold_path = self.__convert_pdb_to_mmcif__(fold_path, f'{conversion_path}/{prefix}.cif')

            self.file_path = fold_path

        # If a UniProt ID was provided, download its CIF file from the AlphaFold
        # database; this takes precedence over any fold_path that was also passed.
        if uniprot_to_download is not None:
            if fold_path is not None:
                log_update("WARNING: both a fold_path and a uniprot_to_download were provided. Running default: downloading the CIF file for the provided UniProt ID.")
            self.file_path = self.__download_mmCIF(uniprot_to_download, output_path=uniprot_output_dir)

        # Use the caller-supplied secondary structure vocabulary if given;
        # otherwise scrape it from the wwPDB mmCIF dictionary.
        if secondary_structure_types is None:
            self.secondary_structure_types = self.__pull_secondary_structure_types()
        else:
            self.secondary_structure_types = secondary_structure_types

        if self.file_path:
            self.cif_lines = self.__parse_cif()
            self.secondary_structures = self.__extract_secondary_structures()
            self.structure_dict = self.__calc_pLDDTs()
            self.sequence = self.structure_dict['seq']
            self.plddts = self.structure_dict['res_pLDDTs']
            self.avg_pLDDT = self.structure_dict['avg_pLDDT']
            self.residues_df = self.__create_residues_summary_dataframe()
            self.secondary_structures_df = self.__create_secondary_structures_summary_dataframe()
        else:
            log_update("ERROR: structure could not be created. No CIF file found.")

    def __convert_pdb_to_mmcif__(self, pdb_filename, mmcif_filename):
        '''
        Convert a PDB file to mmCIF format; return the path to the converted file.
        '''
        parser = PDB.PDBParser(QUIET=True)
        structure = parser.get_structure('structure', pdb_filename)

        io = PDB.MMCIFIO()
        io.set_structure(structure)
        io.save(mmcif_filename)
        return mmcif_filename

    def __download_mmCIF(self, uniprot_id, output_path=None):
        '''
        Download the mmCIF file for the provided uniprot_id, optionally into the
        directory output_path.

        Return: path to downloaded file if successful, None otherwise.
        '''
        full_file_name = f"AF-{uniprot_id}-F1-model_v4.cif"

        if output_path is None:
            output_path = full_file_name
        else:
            output_path = f"{output_path}/{full_file_name}"

        url = f"https://alphafold.ebi.ac.uk/files/{full_file_name}"
        response = requests.get(url)

        if response.status_code == 200:
            with open(output_path, 'wb') as file:
                file.write(response.content)
        else:
            log_update(f"Failed to download file. Status code: {response.status_code}")
            return None

        return output_path

    def __pull_secondary_structure_types(self):
        '''
        Pull a dictionary of secondary structure types and their descriptions from the
        wwPDB mmCIF dictionary website (necessary for annotating the CIF file).
        Only called if the user does not provide such a dictionary themselves.
        '''
        url = "https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Items/_struct_conf_type.id.html"
        response = requests.get(url)

        if response.status_code != 200:
            raise Exception("Failed to retrieve mmCIF dictionary")

        soup = BeautifulSoup(response.content, 'html.parser')

        # Locate the 'Controlled Vocabulary' header, which precedes the table of types.
        header = soup.find('h4', class_='panel-title')
        if header is None or 'Controlled Vocabulary' not in header.text:
            raise Exception("Could not find the 'Controlled Vocabulary' header")

        table = header.find_next('table')
        if table is None:
            raise Exception("Could not find the table following the 'Controlled Vocabulary' header")

        # Build {type_id: description}, keeping only protein secondary structures.
        secondary_structure_types = {}
        rows = table.find_all('tr')
        for row in rows[1:]:
            cols = row.find_all('td')
            if len(cols) > 1:
                type_id = cols[0].text.strip()
                description = cols[1].text.replace('\t', ' ').strip()

                # Collapse runs of whitespace left over from the HTML.
                description = re.sub(' +', ' ', description)

                if '(protein)' in description:
                    secondary_structure_types[type_id] = description

        return secondary_structure_types

    def get_secondary_structure_types(self):
        '''
        Display secondary structure types.
        '''
        log_update("Secondary Structure Types in mmCIF files:")
        for ss_type, description in self.secondary_structure_types.items():
            log_update(f"{ss_type}: {description}")

        return self.secondary_structure_types

    def __parse_cif(self):
        '''
        Read CIF file lines from self.file_path.
        '''
        with open(self.file_path, 'r') as file:
            lines = file.readlines()
        return lines

    def __extract_secondary_structures(self):
        '''
        Iterate through the lines of the CIF file to find each secondary structure.
        Returns a tuple for each amino acid that has a secondary structure annotation. Tuple contains:
        1. Structure Type (e.g. STRN)
        2. Structure ID (e.g. STRN1)
        3. Description (e.g. beta strand)
        4. Position (e.g. 3)
        '''
        secondary_structures = []
        parsing_secondary_structure = False

        for line in self.cif_lines:
            # The _struct_conf block holds the secondary structure annotations.
            if line.startswith("_struct_conf.conf_type_id"):
                parsing_secondary_structure = True
                continue

            if parsing_secondary_structure:
                # A '#' line marks the end of the block.
                if line.startswith("#"):
                    parsing_secondary_structure = False
                    continue

                columns = line.split()

                # The highest column index read below is 13, so a data row must
                # have at least 14 columns (shorter lines are field headers).
                if len(columns) >= 14:
                    sec_struc_type = columns[6]
                    sec_struc_id = columns[13]
                    start_res = int(columns[2])
                    end_res = int(columns[9])
                    sec_struc_name = self.secondary_structure_types.get(sec_struc_type, 'Unknown')

                    # Record every residue position covered by this structure.
                    for pos in range(start_res, end_res + 1):
                        secondary_structures.append((sec_struc_type, sec_struc_id, sec_struc_name, pos))

        return secondary_structures

    def __calc_pLDDTs(self):
        '''
        Iterate through the CIF file to return a dictionary with a few key pieces of info:
        1. Sequence
        2. pLDDTs for each residue
        3. Average pLDDT
        '''
        # Map 3-letter residue names to 1-letter codes.
        aa_dict = {
            "ALA": "A", "CYS": "C", "ASP": "D", "GLU": "E", "PHE": "F",
            "GLY": "G", "HIS": "H", "ILE": "I", "LYS": "K", "LEU": "L",
            "MET": "M", "ASN": "N", "PRO": "P", "GLN": "Q", "ARG": "R",
            "SER": "S", "THR": "T", "VAL": "V", "TRP": "W", "TYR": "Y"
        }

        parser = MMCIFParser(QUIET=True)
        data = parser.get_structure("structure", self.file_path)

        models = list(data.get_models())
        chains = list(models[0].get_chains())

        # AlphaFold predictions contain a single chain. If more are present,
        # pLDDTs are collected across all chains, but 'seq' keeps only the
        # last chain's sequence.
        all_pLDDTs = []
        for chain in chains:
            residues = list(chain.get_residues())
            seq = ''
            pLDDTs = [0] * len(residues)

            for i, r in enumerate(residues):
                try:
                    seq += aa_dict[r.get_resname()]
                except KeyError:
                    log_update('residue name invalid')
                    break

                # AlphaFold stores the per-residue pLDDT in the B-factor field;
                # every atom of a residue carries the same value, so read it
                # from the first atom.
                atoms = list(r.get_atoms())
                pLDDTs[i] = atoms[0].get_bfactor()

            all_pLDDTs.extend(pLDDTs)

        avg_pLDDT = np.mean(all_pLDDTs)
        return_dict = {
            'avg_pLDDT': round(avg_pLDDT, 2),
            'res_pLDDTs': all_pLDDTs,
            'seq': seq
        }
        return return_dict

    def __create_residues_summary_dataframe(self):
        '''
        Create a dataframe that summarizes the secondary structure information for each residue.
        Columns:
        1. Position: amino acid position (e.g. 3)
        2. Residue: amino acid 1-letter code (e.g. A)
        3. pLDDT: AlphaFold's pLDDT score for this residue to 2 decimal places (e.g. 77.54)
        4. Structure Type: type of secondary structure (e.g. STRN)
        5. Structure ID: ID of this secondary structure (e.g. STRN1)
        6. Description: description of this secondary structure (e.g. beta strand)
        7. Disordered: is this residue disordered or not? A residue is not disordered if it's in a HELX or STRN. (True/False)
        '''
        df_secondary_structures = pd.DataFrame(self.secondary_structures, columns=['Structure Type', 'Structure ID', 'Description', 'Position'])

        # Build a per-residue frame, then right-merge so every residue is kept,
        # whether annotated or not.
        df_temp = pd.DataFrame(
            data={
                'Position': list(range(1, len(self.sequence) + 1)),
                'Residue': list(self.sequence),
                'pLDDT': self.plddts
            })

        df_secondary_structures = pd.merge(df_secondary_structures, df_temp, on='Position', how='right')

        # A residue is ordered only if it falls in a helix (HELX) or strand (STRN).
        df_secondary_structures['Disordered'] = df_secondary_structures['Structure Type'].apply(
            lambda x: not (isinstance(x, str) and (('HELX' in x) or ('STRN' in x)))
        )

        return df_secondary_structures

    def __create_secondary_structures_summary_dataframe(self):
        '''
        Create a dataframe grouped by each Structure ID, providing a summary of each secondary structure in the chain.
        Columns:
        1. Structure ID: ID of this secondary structure (e.g. STRN1)
        2. Start: start position of this secondary structure (e.g. 3)
        3. End: end position of this secondary structure (e.g. 12)
        4. Start Residue: amino acid 1-letter code of the start position (e.g. A)
        5. End Residue: amino acid 1-letter code of the end position (e.g. L)
        6. Disordered: is this structure disordered or not? A structure is not disordered if it is a HELX or STRN. (True/False)
        7. Description: description of this secondary structure (e.g. beta strand)
        8. Structure Type: type of secondary structure (e.g. STRN)
        9. avg_pLDDT: average pLDDT over this secondary structure (e.g. 77.54)
        '''
        secondary_structures_df = self.residues_df.groupby('Structure ID').agg({
            'Position': ['first', 'last'],
            'Residue': ['first', 'last'],
            'Disordered': 'first',
            'Description': 'first',
            'Structure Type': 'first',
            'pLDDT': 'mean'
        }).reset_index()

        # Flatten the MultiIndex columns produced by the aggregation.
        secondary_structures_df.columns = ['Structure ID', 'Start', 'End', 'Start Residue', 'End Residue', 'Disordered', 'Description', 'Structure Type', 'avg_pLDDT']
        secondary_structures_df['avg_pLDDT'] = secondary_structures_df['avg_pLDDT'].round(2)

        return secondary_structures_df

    def get_residues_df(self):
        return self.residues_df

    def get_secondary_structures_df(self):
        return self.secondary_structures_df

    def get_full_sequence(self):
        return ''.join([res for res in self.residues_df['Residue']])

    def get_average_plddt(self):
        plddt_values = [plddt for plddt in self.residues_df['pLDDT'] if plddt is not None]
        return sum(plddt_values) / len(plddt_values) if plddt_values else None

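# Minimal usage sketch for AlphaFoldStructure (not part of the processing
# pipeline below). It assumes network access to the AlphaFold database, and
# "P04637" (p53) is only an illustrative accession.
def _example_alphafold_structure_usage():
    structure = AlphaFoldStructure(uniprot_to_download='P04637')
    print(structure.sequence)                               # 1-letter amino acid sequence
    print(structure.avg_pLDDT)                              # average pLDDT, rounded to 2 dp
    print(structure.get_residues_df().head())               # per-residue summary
    print(structure.get_secondary_structures_df().head())   # per-structure summary
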
def pull_secondary_structure_types():
    '''
    Module-level version of AlphaFoldStructure's dictionary scraper, so the
    secondary structure vocabulary can be fetched once and shared across many
    structures.
    '''
    url = "https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Items/_struct_conf_type.id.html"
    response = requests.get(url)

    if response.status_code != 200:
        raise Exception("Failed to retrieve mmCIF dictionary")

    soup = BeautifulSoup(response.content, 'html.parser')

    # Cache the raw HTML locally for inspection.
    with open('mmcif_dictionary.txt', 'w') as f:
        f.write(soup.prettify())

    # Locate the 'Controlled Vocabulary' header, which precedes the table of types.
    header = soup.find('h4', class_='panel-title')
    if header is None or 'Controlled Vocabulary' not in header.text:
        raise Exception("Could not find the 'Controlled Vocabulary' header")

    table = header.find_next('table')
    if table is None:
        raise Exception("Could not find the table following the 'Controlled Vocabulary' header")

    # Build {type_id: description}, keeping only protein secondary structures.
    secondary_structure_types = {}
    rows = table.find_all('tr')
    for row in rows[1:]:
        cols = row.find_all('td')
        if len(cols) > 1:
            type_id = cols[0].text.strip()
            description = cols[1].text.replace('\t', ' ').strip()

            # Collapse runs of whitespace left over from the HTML.
            description = re.sub(' +', ' ', description)

            if '(protein)' in description:
                secondary_structure_types[type_id] = description

    return secondary_structure_types

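# Illustrative shape of the returned mapping (the exact IDs and wording come
# from the live wwPDB dictionary page, so they may differ):
#   {'HELX_RH_AL_P': 'right-handed alpha helix (protein)',
#    'STRN': 'beta strand (protein)'}
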
def process_fusionpdb_fusion_files(files, level_2_3_structure_info, folder, save_path=None):
    '''
    Process each downloaded fusion structure in `folder`, filling in its extracted
    sequence and pLDDT information in level_2_3_structure_info and appending each
    result to the CSV at save_path so interrupted runs can resume.
    '''
    secondary_structure_types = pull_secondary_structure_types()

    # Initialize the columns this function fills in.
    level_2_3_structure_info['Fold AA seq'] = ['']*len(level_2_3_structure_info)
    level_2_3_structure_info['Avg pLDDT'] = [0]*len(level_2_3_structure_info)
    level_2_3_structure_info['pLDDTs'] = ['']*len(level_2_3_structure_info)

    # Skip structures that were already processed in a previous run.
    pre_loop_processed = []
    if save_path is not None and os.path.exists(save_path):
        pre_loop_processed = pd.read_csv(save_path)
        pre_loop_processed = pre_loop_processed['Structure Link'].tolist()
        pre_loop_processed = [x.split('/')[-1] for x in pre_loop_processed]
        log_update(f"Total structures already processed: {len(pre_loop_processed)}")

    log_update("\nProcessing fusion structures...")

    for i, structure in enumerate(files):
        log_update(f'\tProcessing #{i+1}: {structure}')

        if structure in pre_loop_processed:
            log_update("\t\tAlready processed. Continuing...")
            continue

        obj = AlphaFoldStructure(fold_path=f'{folder}/{structure}', secondary_structure_types=secondary_structure_types)
        aa_seq = obj.get_full_sequence()
        avg_plddt = obj.get_average_plddt()
        residues_df = obj.get_residues_df()
        all_plddts = ",".join(residues_df['pLDDT'].astype(str).tolist())

        log_update(f"\t\tAvg pLDDT: {round(avg_plddt,2)}\tFold AA seq: {aa_seq}\tFirst 5 pLDDTs: {','.join(all_plddts.split(',')[0:5])}")

        # Match the filename literally (not as a regex) against the structure links.
        mask = level_2_3_structure_info['Structure Link'].str.contains(f"/{structure}", regex=False)
        level_2_3_structure_info.loc[mask, 'Fold AA seq'] = aa_seq
        level_2_3_structure_info.loc[mask, 'Avg pLDDT'] = avg_plddt
        level_2_3_structure_info.loc[mask, 'pLDDTs'] = all_plddts

        # Append this structure's rows to the running results file.
        cur_df = level_2_3_structure_info.loc[mask].reset_index(drop=True)
        if os.path.exists(save_path):
            cur_df.to_csv(save_path, mode='a', header=False, index=False)
        else:
            cur_df.to_csv(save_path, index=False)

    # Re-read the results file so the returned frame reflects all runs.
    level_2_3_structure_info = pd.read_csv(save_path)
    return level_2_3_structure_info

def process_fusionpdb_head_tail_files(ht, save_path='heads_and_tails_structures_processed.csv'):
    '''
    Download and process the AlphaFold structure for each head/tail UniProt ID
    in `ht`, appending each result to the CSV at save_path so interrupted runs
    can resume.
    '''
    log_update("\nProcessing head and tail structures...")

    secondary_structure_types = pull_secondary_structure_types()

    os.makedirs('raw_data/fusionpdb/head_tail_af2db_structures', exist_ok=True)

    # Skip IDs that were already processed in a previous run.
    pre_loop_processed = []
    if os.path.exists(save_path):
        pre_loop_processed = pd.read_csv(save_path)
        pre_loop_processed = pre_loop_processed['UniProtID'].tolist()
        log_update(f"Heads and tails already processed: {len(pre_loop_processed)}")

    ht_structures_df = pd.DataFrame(
        data={
            'UniProtID': ['']*len(ht),
            'Avg pLDDT': ['']*len(ht),
            'All pLDDTs': ['']*len(ht),
            'Seq': ['']*len(ht)
        }
    )

    for i, uniprotid in enumerate(ht):
        log_update(f'\tProcessing #{i+1}: {uniprotid}')
        aa_seq, avg_plddt, all_plddts = None, None, None

        if uniprotid in pre_loop_processed:
            log_update("\t\tAlready processed. Continuing")
            continue

        # The download fails for proteins without an AlphaFold database entry;
        # those rows are left as None and patched later.
        try:
            obj = AlphaFoldStructure(uniprot_to_download=uniprotid, secondary_structure_types=secondary_structure_types,
                                     uniprot_output_dir='raw_data/fusionpdb/head_tail_af2db_structures')
            aa_seq = obj.get_full_sequence()
            avg_plddt = obj.get_average_plddt()
            residues_df = obj.get_residues_df()
            all_plddts = ",".join(residues_df['pLDDT'].astype(str).tolist())

            log_update(f"\t\tAvg pLDDT: {round(avg_plddt,2)}\tFold AA seq: {aa_seq}\tFirst 5 pLDDTs: {','.join(all_plddts.split(',')[0:5])}")
        except Exception:
            log_update(f"\t\tAvg pLDDT: {None}\tFold AA seq: {None}\tFirst 5 pLDDTs: {None}")

        ht_structures_df.loc[i, 'UniProtID'] = uniprotid
        ht_structures_df.loc[i, 'Avg pLDDT'] = avg_plddt
        ht_structures_df.loc[i, 'All pLDDTs'] = all_plddts
        ht_structures_df.loc[i, 'Seq'] = aa_seq

        # Append this row to the running results file.
        cur_df = ht_structures_df.iloc[[i]].reset_index(drop=True)
        if os.path.exists(save_path):
            cur_df.to_csv(save_path, mode='a', header=False, index=False)
        else:
            cur_df.to_csv(save_path, index=False)

    # Re-read all processed results and compare them against the full head/tail set.
    ht_structures_df = pd.read_csv(save_path)
    level_2_3 = pd.read_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv')
    level_2_3['FusionGene'] = level_2_3['FusionGene'].str.replace('-', '::')
    heads = level_2_3['HGUniProtAcc'].tolist()
    tails = level_2_3['TGUniProtAcc'].tolist()
    ht = heads + tails
    ht = {x for x in ht if isinstance(x, str)}
    # Some entries hold comma-separated accession lists; split them into individual IDs.
    ht = set(','.join(ht).split(','))

    log_update(f"total heads and tails: {len(ht)}")
    log_update(f"total processed: {len(ht_structures_df)}\t{len(ht_structures_df['UniProtID'].unique())}")

    missing = set(ht) - set(ht_structures_df['UniProtID'].unique())
    log_update(f"missing: {len(missing)}")
    log_update(missing)

    # Record the UniProt IDs that were not in the AlphaFold database and must be folded separately.
    ht_structures_df = ht_structures_df.replace('', np.nan)
    need_to_fold = ht_structures_df[ht_structures_df['Avg pLDDT'].isna()]['UniProtID'].tolist()
    with open('processed_data/fusionpdb/intermediates/uniprotids_not_in_afdb.txt', 'w') as f:
        for uniprotid in need_to_fold:
            f.write(f'{uniprotid}\n')

    # Pull sequences for the missing IDs from a UniProt ID-mapping export.
    idmap = pd.read_csv('raw_data/fusionpdb/not_in_afdb_idmap.txt', sep='\t')
    idmap = idmap[idmap['Entry'].isin(need_to_fold)].reset_index(drop=True)
    idmap = idmap[['Entry', 'Sequence']].rename(columns={'Entry': 'ID'})
    idmap['Length'] = idmap['Sequence'].apply(len)

    log_update("Investigating heads and tails that were not in the AF2 database:")
    log_update(f"\tMin length: {min(idmap['Length'])}")
    log_update(f"\tMax length: {max(idmap['Length'])}")
    idmap = idmap.sort_values(by='Length', ascending=True).reset_index(drop=True)

    # Manually patch in average pLDDTs and sequences for UniProt IDs that were
    # not in the AlphaFold database.
    manual_avg_plddts = {'Q9NNW7': 91.68, 'Q16881': 89.55, 'Q86V15': 48.14}
    for uid, avg_plddt in manual_avg_plddts.items():
        if uid in idmap['ID'].tolist():
            ht_structures_df.loc[ht_structures_df['UniProtID'] == uid, 'Avg pLDDT'] = avg_plddt
            ht_structures_df.loc[ht_structures_df['UniProtID'] == uid, 'Seq'] = idmap.loc[idmap['ID'] == uid, 'Sequence'].item()

    return ht_structures_df

def process_fusions_and_hts():
    '''
    Full pipeline: process all fusion structures, then all head/tail structures,
    then merge everything into the final cleaned structure and FusionGID CSVs.
    '''
    level_2_3_structure_info_og = pd.read_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv')

    folder = 'raw_data/fusionpdb/structures'

    files = os.listdir(folder)
    log_update(f"total pdbs: {len(files)}")
    log_update(f"examples: {files[:5]}")

    os.makedirs('processed_data/fusionpdb', exist_ok=True)

    # Process every downloaded fusion structure.
    level_2_3_structure_info = process_fusionpdb_fusion_files(files, level_2_3_structure_info_og, folder, save_path='processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structures_processed.csv')

    # Collect the unique set of head/tail UniProt accessions.
    level_2_3 = pd.read_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv')
    level_2_3['FusionGene'] = level_2_3['FusionGene'].str.replace('-', '::')

    heads = level_2_3['HGUniProtAcc'].tolist()
    tails = level_2_3['TGUniProtAcc'].tolist()
    ht = heads + tails
    ht = {x for x in ht if isinstance(x, str)}
    # Some entries hold comma-separated accession lists; split them into individual IDs.
    ht = set(','.join(ht).split(','))
    log_update(f"Unique heads/tails: {len(ht)}")

    heads_tails_analyzed = process_fusionpdb_head_tail_files(list(ht), save_path='processed_data/fusionpdb/heads_tails_structural_data.csv')

    # Map each FusionGene to its FusionGID using the curated level 2 and 3 tables.
    level_2 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level2_curated_09_05_2024.csv')
    level_3 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level3_curated_09_05_2024.csv')
    joined_23 = pd.concat([level_2, level_3]).reset_index(drop=True)
    joined_23['FusionGene'] = joined_23['FusionGene'].str.replace('-', '::')
    log_update(f"\nnumber of duplicated fusion gene rows: {len(joined_23[joined_23['FusionGene'].duplicated()])}")

    fo_gid_dict = dict(zip(joined_23['FusionGene'], joined_23['FusionGID']))
    log_update(len(fo_gid_dict))

    # Keep only structures whose sequence was successfully extracted.
    level_2_3_structure_info_clean = level_2_3_structure_info.replace('', np.nan)
    level_2_3_structure_info_clean = level_2_3_structure_info_clean.dropna(subset=['Fold AA seq']).reset_index(drop=True)
    log_update(f"length of processed structure file: {len(level_2_3_structure_info_clean)}")
    level_2_3_structure_info_clean['pLDDT'] = level_2_3_structure_info_clean['Avg pLDDT'].round(2)
    level_2_3_structure_info_clean = level_2_3_structure_info_clean.drop(columns=['Avg pLDDT'])
    level_2_3_structure_info_clean['FusionGene'] = level_2_3_structure_info_clean['FusionGene'].str.replace('-', '::')
    level_2_3_structure_info_clean['FusionGID'] = level_2_3_structure_info_clean['FusionGene'].apply(lambda x: fo_gid_dict[x])

log_update("Using FusionPDB as ground truth for sequences...") |
|
raw_download = pd.read_csv('../../data/raw_data/FusionPDB.txt',sep='\t',header=None) |
|
raw_download['FusionGene'] = raw_download[7]+ '::' + raw_download[11] |
|
raw_download = raw_download.rename(columns={18:'Raw Download AA Seq'}) |
|
log_update(f"FusionPDB raw download size: {len(raw_download)}") |
|
|
|
level_2_3_structure_info_clean_ids = set(level_2_3_structure_info_clean['FusionGene'].tolist()) |
|
level_2_3_structure_info_clean_seqs = set(level_2_3_structure_info_clean['Fold AA seq'].tolist()) |
|
raw_download_ids = set(raw_download['FusionGene'].tolist()) |
|
raw_download_seqs = set(raw_download['Raw Download AA Seq'].tolist()) |
|
log_update(f"Number of overlapping gene IDs: {len(level_2_3_structure_info_clean_ids.intersection(raw_download_ids))}") |
|
log_update(f"Number of overlapping sequences: {len(level_2_3_structure_info_clean_seqs.intersection(raw_download_seqs))}") |
|
|
|
|
|
    # Merge 1: structures whose AlphaFold-extracted sequence matches the raw download.
    test_merge_1 = pd.merge(
        level_2_3_structure_info_clean.rename(columns={'Fold AA seq': 'Raw Download AA Seq'}),
        raw_download,
        on=['FusionGene', 'Raw Download AA Seq'],
        how='inner'
    )
    test_merge_1 = test_merge_1.drop(columns=['AA seq'])
    test_merge_1['Seq Source'] = ['AlphaFold,Raw Download']*len(test_merge_1)
    log_update(f"Merge on AlphaFold AA Seq and Raw Download AA Seq. len={len(test_merge_1)}")

    # Merge 2: structures whose webpage sequence matches the raw download.
    test_merge_2 = pd.merge(
        level_2_3_structure_info_clean.rename(columns={'AA seq': 'Raw Download AA Seq'}),
        raw_download,
        on=['FusionGene', 'Raw Download AA Seq'],
        how='inner'
    )
    test_merge_2 = test_merge_2.drop(columns=['Fold AA seq'])
    test_merge_2['Seq Source'] = ['Webpage,Raw Download']*len(test_merge_2)
    log_update(f"Merge on Webpage AA Seq and Raw Download AA Seq. len={len(test_merge_2)}")

    test_merge = pd.concat([test_merge_1, test_merge_2])
    test_merge['Len(AA seq)'] = test_merge['Raw Download AA Seq'].apply(lambda x: len(x))

    test_merge = test_merge.drop_duplicates().reset_index(drop=True)

log_update(f"len test_merge before keeping CIFs over identical PDBs: {len(test_merge)}") |
|
test_merge = test_merge.sort_values(by='Structure Type',ascending=True).reset_index(drop=True).groupby(['Hgene', 'Hchr', 'Hbp', 'Hstrand', 'Tgene', 'Tchr', |
|
'Tbp', 'Tstrand', 'Len(AA seq)', 'FusionGene', |
|
'Level', 'Raw Download AA Seq', 'pLDDT', 'pLDDTs','FusionGID', 'Seq Source']).agg( |
|
{ |
|
'Structure Link': 'first', |
|
'Structure Type': 'first' |
|
} |
|
).reset_index() |
|
log_update(f"len after: {len(test_merge)}") |
|
|
|
|
|
log_update(f"len test_merge before combining seq sources: {len(test_merge)}") |
|
test_merge = test_merge.groupby(['Structure Link','Hgene', 'Hchr', 'Hbp', 'Hstrand', 'Tgene', 'Tchr', |
|
'Tbp', 'Tstrand', 'Len(AA seq)', 'FusionGene','Structure Type', |
|
'Level', 'Raw Download AA Seq', 'pLDDT', 'pLDDTs', 'FusionGID', ]).agg( |
|
{ |
|
'Seq Source': lambda x: ','.join(x) |
|
} |
|
).reset_index() |
|
test_merge['Seq Source'] = test_merge['Seq Source'].apply(lambda x: ','.join(set(x.split(',')))) |
|
log_update(f"len after: {len(test_merge)}") |
|
|
|
|
|
    # When one sequence has multiple folds, arbitrarily keep the first.
    log_update(f"len test_merge before choosing the first fold when one seq has multiple folds: {len(test_merge)}")
    test_merge = test_merge.groupby(
        ['Hgene', 'Hchr', 'Hbp', 'Hstrand', 'Tgene', 'Tchr',
         'Tbp', 'Tstrand', 'Len(AA seq)', 'FusionGene',
         'Level', 'Raw Download AA Seq', 'FusionGID']).agg(
        {
            'Structure Link': 'first',
            'Structure Type': 'first',
            'Seq Source': 'first',
            'pLDDT': 'first',
            'pLDDTs': 'first'
        }
    ).reset_index()
    log_update(f"len after: {len(test_merge)}")

    # Log the distribution of sequence sources.
    source_str = test_merge['Seq Source'].value_counts().reset_index().rename(columns={'index': 'Seq Source', 'Seq Source': 'count'}).to_string(index=False)
    source_str = "\t\t" + source_str.replace("\n", "\n\t\t")
    log_update(f"Distribution of sequence sources:\n{source_str}")

    # Keep only rows whose AlphaFold-extracted sequence matched the raw download.
    test_merge = test_merge.loc[test_merge['Seq Source'].str.contains('AlphaFold')].reset_index(drop=True)
    log_update(f"Dropped rows where AlphaFold sequence was incorrect. New DataFrame length: {len(test_merge)}")

    # Each (FusionGID, sequence) pair should now appear exactly once.
    assert len(test_merge[test_merge.duplicated(['FusionGID', 'Raw Download AA Seq'])]) == 0

    test_merge['pLDDT'] = test_merge['pLDDT'].round(2)

    # Reorder and rename columns for the final structure file.
    test_merge_v2 = test_merge[
        ['FusionGID', 'FusionGene', 'Raw Download AA Seq', 'Len(AA seq)', 'Hgene', 'Hchr', 'Hbp', 'Hstrand', 'Tgene', 'Tchr', 'Tbp', 'Tstrand',
         'Level', 'Structure Link', 'Structure Type', 'pLDDT', 'pLDDTs', 'Seq Source']
    ].rename(
        columns={
            'Raw Download AA Seq': 'Fusion_Seq',
            'Seq Source': 'Fusion_Seq_Source',
            'Structure Link': 'Fusion_Structure_Link',
            'Structure Type': 'Fusion_Structure_Type',
            'pLDDT': 'Fusion_pLDDT',
            'pLDDTs': 'Fusion_AA_pLDDTs',
            'Len(AA seq)': 'Fusion_Length'
        }
    )
    log_update(f"Unique FusionGIDs: {len(test_merge_v2['FusionGID'].unique())}")
    log_update(f"Number of structures: {len(test_merge_v2)}")

    # Report any sequences shared by multiple FusionGIDs.
    log_update("\nChecking for duplicate sequences...")
    log_update(f"\tThe structure-based fusion database of length {len(test_merge_v2)} has {len(test_merge_v2['Fusion_Seq'].unique())} unique fusion sequences.")
    dup_seqs = test_merge_v2[test_merge_v2['Fusion_Seq'].duplicated()]['Fusion_Seq'].tolist()
    dup_seqs_df = test_merge_v2.loc[test_merge_v2['Fusion_Seq'].isin(dup_seqs)].reset_index(drop=True)
    dup_seqs_df['FusionGID'] = dup_seqs_df['FusionGID'].astype(str)
    dup_seqs_df = dup_seqs_df.groupby('Fusion_Seq').agg({
        'FusionGID': lambda x: ','.join(x),
        'FusionGene': lambda x: ','.join(x)
    })
    dup_seqs_df_str = dup_seqs_df.to_string(index=False)
    dup_seqs_df_str = "\t" + dup_seqs_df_str.replace("\n", "\n\t")
    log_update(f"\tShowing FusionGIDs and FusionGenes for duplicated sequences below:\n{dup_seqs_df_str}")

    # Attach head and tail structural data to the level 2-3 table.
    heads_tails_analyzed['Avg pLDDT'] = heads_tails_analyzed['Avg pLDDT'].round(2)

    level_2_3_v2 = pd.merge(
        level_2_3,
        heads_tails_analyzed.rename(columns={'UniProtID': 'HGUniProtAcc', 'Avg pLDDT': 'HG_pLDDT', 'All pLDDTs': 'HG_AA_pLDDTs', 'Seq': 'HG_Seq'}),
        on='HGUniProtAcc',
        how='left'
    )

    level_2_3_v2 = pd.merge(
        level_2_3_v2,
        heads_tails_analyzed.rename(columns={'UniProtID': 'TGUniProtAcc', 'Avg pLDDT': 'TG_pLDDT', 'All pLDDTs': 'TG_AA_pLDDTs', 'Seq': 'TG_Seq'}),
        on='TGUniProtAcc',
        how='left'
    )

    # Save the final outputs.
    test_merge_v2.to_csv('processed_data/fusionpdb/FusionPDB_level2-3_cleaned_structure_info.csv', index=False)
    log_update("Saved file with all fusion structure pLDDTs to: processed_data/fusionpdb/FusionPDB_level2-3_cleaned_structure_info.csv")

    level_2_3_v2.to_csv('processed_data/fusionpdb/FusionPDB_level2-3_cleaned_FusionGID_info.csv', index=False)
    log_update("Saved file with all fusion protein heads and tails, and their structure pLDDTs, to: processed_data/fusionpdb/FusionPDB_level2-3_cleaned_FusionGID_info.csv")

def main():
    with open_logfile("process_fusion_structures_log.txt"):
        process_fusions_and_hts()


if __name__ == "__main__":
    main()