# FusOn-pLM/fuson_plm/benchmarking/caid/scrape_fusionpdb.py
# Script for fetching FusionPDB level 2 and 3 data
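# Usage (assumed): run from the benchmarking/caid directory so the relative raw_data/ and
# processed_data/ paths resolve, e.g. `python scrape_fusionpdb.py`.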
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import glob
import ast
import os
from pandas.errors import EmptyDataError
from fuson_plm.utils.logging import open_logfile, log_update, get_local_date_yr
def get_levels_dataframe(level, print_progress=False):
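    """Scrape the FusionPDB gene list for a given level (2 or 3) and return it as a DataFrame.

    The 'FusionGID' column is split into a plain ID column and a 'URL' column pointing to the
    gene's FusionPDB page. If print_progress is True, a few example rows are logged.
    """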
data, headers = scrape_level(level)
if print_progress:
# Output the extracted data - just a few rows
if level==2:
log_update(f'\nTable size {len(data)}; expected 2212')
if level==3:
log_update(f'\nTable size {len(data)}; expected 266')
        log_update('Example rows 1-5:')
        for i, row in enumerate(data):
            log_update(row)
            if i >= 4: break
df = pd.DataFrame(data, columns=headers)
df['URL'] = df['FusionGID'].apply(lambda x: x[1])
df['FusionGID'] = df['FusionGID'].apply(lambda x: x[0])
return df
def scrape_level(level):
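    """Request the FusionPDB level search page and parse the gene summary table.

    Returns (data, headers): a list of row values, where cells containing a link become
    (text, URL) tuples, plus the first six column headers of the 'geneList' table.
    """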
level = str(level)
# The URL of the website
url = f"https://compbio.uth.edu/FusionPDB/gene_search_result_0.cgi?type=chooseLevel&chooseLevel=level{level}"
# Sending a request to the website
response = requests.get(url)
# Parsing the HTML content of the website
soup = BeautifulSoup(response.content, 'html.parser')
# Find the specific <h1> tag
    if level == '2':
        specific_h1 = soup.find('h1', string='2212 Fusion gene(s) for your query: level2')
    if level == '3':
        specific_h1 = soup.find('h1', string='266 Fusion gene(s) for your query: level3')
# Find the specific table following the <h1> tag
table = specific_h1.find_next('table', class_='geneList')
# Extract headers (only first 6 fields)
headers = [header.get_text().strip() for header in table.find_all('td', class_='content_middle_gene_summary')][0:6]
# Extract rows
rows = table.find_all('tr')[1:] # Skip the header row
# Extract data from rows
data = []
for row in rows:
columns = row.find_all('td', class_='content_middle_gene_summary')
if not columns:
continue
row_data = []
for column in columns:
link = column.find('a')
if link:
href = link['href']
fusion_gid = link.get_text(strip=True)
full_url = f"https://compbio.uth.edu/FusionPDB/{href}"
row_data.append((fusion_gid, full_url))
else:
row_data.append(column.get_text(strip=True))
data.append(row_data)
return data, headers
def get_structure_link_dataframe(id, print_progress=False):
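    """Build a DataFrame of structure download links for one FusionPDB ID.

    Each structure link becomes its own row; a 'Structure Type' column marks whether the link
    points to a PDB or CIF file, and missing expected columns are filled with empty strings.
    Returns an empty DataFrame if the entry has no structures.
    """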
rows = get_structure_links(id)
# IF printing progress, output the extracted data - just a few rows
if print_progress:
log_update(f'\nTable size {len(rows)}')
        log_update('Example rows 1-5:')
        for i, row in enumerate(rows):
            log_update(row)
            if i >= 4: break
# Make the dataframe - new row for each link - ONLY if there's actually data
if len(rows)>0:
df = pd.DataFrame(rows)
df = df.rename(columns={
'Fusion protein PDB link (fusion AA seq ID in FusionPDB)': 'Structure Link'
})
# make a new row for each link
df = df.explode('Structure Link').reset_index(drop=True)
df['Structure Link'] = df['Structure Link'].apply(lambda x: 'https://compbio.uth.edu/FusionPDB/' + str(x))
df['Structure Type'] = df['Structure Link'].apply(lambda x: 'PDB' if 'pdb_files' in x else ('CIF' if 'cif_files' in x else 'Unknown'))
df['FO_Name'] = df['Hgene'] + '::' + df['Tgene']
# Rename FO_Name to FusionGene
df = df.rename(columns={'FO_Name':'FusionGene'})
df['ID'] = [id]*len(df)
expected_cols = ['ID','Structure Link','Hgene','Hchr','Hbp','Hstrand','Tgene','Tchr','Tbp','Tstrand','Len(AA seq)','Structure Type','FusionGene','AA seq']
for col in expected_cols:
if not(col in list(df.columns)):
df[col] = ['']*len(df)
df = df[expected_cols]
#df['FusionGID'] = df['FusionGID'].apply(lambda x: x[0])
else:
df = pd.DataFrame()
return df
def get_structure_links(id, print_progress=False):
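    """Scrape the 'Fusion Protein Structures' table from a FusionPDB entry page.

    Returns a list of dictionaries keyed by the table headers; cells containing links are stored
    as lists of hrefs, and the '3D view using mol*' cells (plus the cell after them) are skipped.
    """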
# Define the URL
url = f"https://compbio.uth.edu/FusionPDB/gene_search_result.cgi?page=page&type=quick_search&quick_search={id}"
# Fetch the webpage content
response = requests.get(url)
html_content = response.content
# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')
#with open(f'FusionPDB_{id}_soup.txt','w') as f:
#f.write(soup.prettify())
# Find the table with the title "Fusion Protein Structures"
table_title = soup.find('a', {'name': 'FusionSTR'})
rows = []
# Check that the h2 text is "Fusion Protein Structures"
if table_title and table_title.find('h2').text.strip() == 'Fusion Protein Structures':
# Find the next table with class 'geneList' after the title
table = table_title.find_next('table', class_='geneList')
table = table.find_next('table')
if table:
if print_progress: log_update('table found')
# Extract the headers from the first row
header_row = table.find('tr')
headers = [header.get_text(strip=True) for header in header_row.find_all('strong')]
# Extract the rows
rows = []
for row in table.find_all('tr')[1:]: # Skip the header row
cells = row.find_all('td')
row_data = {}
skip_next = False # Flag to skip the next cell
for i, cell in enumerate(cells):
# Get the link text if a link is present, otherwise get the text
if skip_next:
skip_next = False
continue # Skip this cell
cell_text = cell.get_text(strip=True)
if "3D view using mol*" in cell_text:
skip_next = True # Set the flag to skip the next cell
continue # Skip this cell
links = cell.find_all('a')
if links:
row_data[headers[i]] = [link.get('href') for link in links]
else:
celltext = cell.get_text(strip=True)
if len(celltext)>0:
row_data[headers[i]] = celltext
if len(row_data)>0: rows.append(row_data)
else:
log_update('table not found')
return rows
def process_td_elements(soup_object, add_links=False):
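    """Collect the text of all <td class="content_left_gene_summary"> cells in a parsed page.

    Prefers the text inside a <strong> tag when present; if add_links is True, any hyperlink in
    the cell is appended to the text in parentheses.
    """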
# Find all td elements with class "content_left_gene_summary"
td_elements = soup_object.find_all('td', class_='content_left_gene_summary')
# Extract and print the information
data = []
for td in td_elements:
# Extract the text content
strong_tag = td.find('strong')
if strong_tag:
text_content = strong_tag.get_text(strip=True)
else:
text_content = td.get_text(strip=True)
# Extract the link if available
if add_links:
link_tag = td.find('a')
if link_tag:
link = link_tag.get('href')
text_content += f" ({link})"
data.append(text_content)
return data
def get_hgene_tgene_info(id, print_progress=False):
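    """Scrape the 'Fusion Protein Summary' table for one FusionPDB ID.

    Returns a dictionary with fusion-level fields (e.g. 'FusionPDB ID') and paired head/tail
    fields (e.g. 'Gene ID', 'UniProtAcc') stored as two-element lists [head, tail].
    """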
# Define the URL
url = f"https://compbio.uth.edu/FusionPDB/gene_search_result.cgi?page=page&type=quick_search&quick_search={id}"
# Fetch the webpage content
response = requests.get(url)
html_content = response.content
# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')
#with open(f'FusionPDB_{id}_soup.txt','w') as f:
#f.write(soup.prettify())
# Find the table with the title "Fusion Protein Summary"
title_table = soup.find('table', class_='title')
if title_table and title_table.find('h2') and title_table.find('h2').get_text(strip=True) == 'Fusion Protein Summary':
        # Next table is irrelevant; skip ahead to the second 'geneList' table
gene_list_table = title_table.find_next_sibling('table', class_='geneList').find_next_sibling('table', class_='geneList')
# Extract relevant data from this table
data = {
"Fusion gene name": [],
"FusionPDB ID": [],
"FusionGDB2.0 ID": [],
"Gene symbol": [],
"Gene ID": [],
"Gene name": [],
"Synonyms": [],
"Cytomap": [],
"Type of gene": [],
"Description": [],
"Modification date": [],
"UniProtAcc": []
}
td_data = process_td_elements(gene_list_table)
# need to split td_data into 2 parts: before 'Gene symbol' and after 'Gene symbol'
split_ind = td_data.index('Gene symbol')
fusion_info, ht_info = [td_data[0:split_ind], td_data[split_ind::]]
# first, process fusion info
for info in fusion_info:
if ':' in info:
#log_update(info)
key, value = info.split(':')[0:2]
if key in data:
data[key.strip()] = value.strip()
        # now, process ht_info three elements at a time: (field name, head gene value, tail gene value),
        # stopping once the UniProtAcc row has been captured
for i in range(0, len(ht_info), 3):
# get the elements
#log_update(len(ht_info[i:i+3]), ht_info[i:i+3])
key, value1, value2 = ht_info[i:i+3]
if key in data:
data[key.strip()] = [value1.strip(), value2.strip()]
if key=='UniProtAcc':
break
return data
def process_ids(ids, outdir='', level=2):
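    """Scrape structure download links for each FusionPDB ID and append them to a per-level CSV.

    IDs already present in the output CSV, or listed in the known structureless-ID file,
    are skipped so the scrape can be resumed.
    """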
csv_filename = f'{outdir}/FusionPDB_level{level}_fusion_structure_links.csv'
already_processed_ids = []
if os.path.isfile(csv_filename):
already_processed_ids = pd.read_csv(csv_filename)
already_processed_ids = already_processed_ids['ID'].tolist()
structureless_ids = pd.read_csv("raw_data/fusionpdb/fusionpdb_structureless_ids.txt",sep="\t",header=None)[0].tolist()
log_update(f'\nLevel {level}:\n\tDownloading structure links for FusionPDB IDs:')
for i, id in enumerate(ids):
        # only process IDs that haven't already been written to the CSV and that aren't known to be structureless
if (id in already_processed_ids) or (id in structureless_ids):
continue
df = get_structure_link_dataframe(id)
if os.path.isfile(csv_filename):
df.to_csv(csv_filename, mode='a', index=False,header=False)
else:
df.to_csv(csv_filename, mode='w', index=False)
log_update(f'\t\t{i+1}. {id}')
def process_ids_ht(ids, outdir='',level=2):
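    """Scrape head/tail gene info (gene IDs and UniProt accessions) for each FusionPDB ID.

    Results are appended one dictionary per line to a per-level text file; the whole step is
    skipped if that file already exists.
    """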
outfile = f'{outdir}/level{level}_head_tail_info.txt'
if not(os.path.isfile(outfile)):
log_update(f"\n\tAcquiring UniProt accessions of head and tail genes for Level {level}")
with open(outfile, 'a+') as f1:
for id in ids:
data = get_hgene_tgene_info(id)
data = {
'FusionGID': data['FusionPDB ID'],
'HGID': data['Gene ID'][0],
'TGID': data['Gene ID'][1],
'HGUniProtAcc': data['UniProtAcc'][0],
'TGUniProtAcc': data['UniProtAcc'][1]
}
f1.write(str(data))
f1.write('\n')
f1.flush()
else:
log_update(f"\nAlready acquired UniProt accessions of head and tail genes for Level {level} at: {outfile}")
def download_file(url, directory):
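    """Download a single file into the given directory, skipping it if it already exists there."""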
# Download file, IF its destination doesn't already have a file there
local_filename = os.path.join(directory, url.split('/')[-1])
if os.path.exists(local_filename):
return local_filename
response = requests.get(url)
response.raise_for_status()
with open(local_filename, 'wb') as file:
file.write(response.content)
return local_filename
def download_structures(download_links):
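    """Download all structure files into raw_data/fusionpdb/structures, logging any failures."""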
# Directory where you want to save the downloaded files
download_directory = "raw_data/fusionpdb/structures"
os.makedirs(download_directory, exist_ok=True)
# Download all files
for link in download_links:
try:
log_update(f"Downloading {link}...")
download_file(link, download_directory)
log_update(f"\tDownloaded {link} to {download_directory}")
except Exception as e:
log_update(f"\tFailed to download {link}. Reason: {e}")
log_update("All downloads completed.")
def combine_ht_info():
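    """Read the level 2 and level 3 head/tail info files and combine them into one DataFrame."""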
    # read the head and tail info that was collected for levels 2 and 3
outdir = 'raw_data/fusionpdb'
head_tail_data = []
with open(f'{outdir}/level2_head_tail_info.txt','r') as f:
for line in f:
# Parse the line as a dictionary using ast.literal_eval
record = ast.literal_eval(line.strip())
head_tail_data.append(record)
with open(f'{outdir}/level3_head_tail_info.txt','r') as f:
for line in f:
# Parse the line as a dictionary using ast.literal_eval
record = ast.literal_eval(line.strip())
head_tail_data.append(record)
ht_df = pd.DataFrame(head_tail_data)
ht_df['FusionGID'] = ht_df['FusionGID'].astype(str)
return ht_df
# Add a column for the source of UniProtAcc
def find_h_source(row):
if row['HGUniProtAcc'] is not None:
return 'FusionPDB'
elif row['Entry_Hgene'] is not None:
return 'UniProt ID Map'
else:
return None
def find_t_source(row):
if row['TGUniProtAcc'] is not None:
return 'FusionPDB'
elif row['Entry_Tgene'] is not None:
return 'UniProt ID Map'
else:
return None
def correct_huniprot(row):
if row['HGUniProtAcc'] is not None:
return row['HGUniProtAcc']
elif row['Entry_Hgene'] is not None:
return row['Entry_Hgene']
else:
return None
def correct_tuniprot(row):
if row['TGUniProtAcc'] is not None:
return row['TGUniProtAcc']
elif row['Entry_Tgene'] is not None:
return row['Entry_Tgene']
else:
return None
def combine_ht_info_with_structure_links(giant, ht_df):
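    """Merge head/tail UniProt info into the combined level 2/3 table and repair bad mappings.

    Accessions known to be wrong (or containing commas) are cleared and re-mapped using a
    pre-computed UniProt ID-mapping file, then the rows are partitioned by mapping quality and
    recombined into one row per FusionGID with the source of each UniProt accession recorded.
    """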
# Add in the head and tail data
giant_with_hts = pd.merge(giant, ht_df, on='FusionGID', how='left')
# make sure it's all strings here
giant_with_hts['HGID_x'] = giant_with_hts['HGID_x'].astype(str)
giant_with_hts['HGID_y'] = giant_with_hts['HGID_y'].astype(str)
giant_with_hts['TGID_x'] = giant_with_hts['TGID_x'].astype(str)
giant_with_hts['TGID_y'] = giant_with_hts['TGID_y'].astype(str)
giant_with_hts['HGID_match'] = giant_with_hts['HGID_x'] == giant_with_hts['HGID_y']
giant_with_hts['TGID_match'] = giant_with_hts['TGID_x'] == giant_with_hts['TGID_y']
# check if they're all true
assert giant_with_hts['HGID_match'].all() and giant_with_hts['TGID_match'].all()
# cool, all of them are true so now drop the extra columns
giant_with_hts = giant_with_hts.drop(['HGID_x','TGID_x','HGID_match','TGID_match'],axis=1).rename(columns={'HGID_y':'HGID','TGID_y':'TGID'})
giant_with_hts = giant_with_hts.replace('.',np.nan)
# Check - how many rows have uniprot IDs for both head and tail?
hgid_only = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].notna() & giant_with_hts['TGUniProtAcc'].isna()])
tgid_only = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna() & giant_with_hts['TGUniProtAcc'].notna()])
hgid_and_tgid = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].notna() & giant_with_hts['TGUniProtAcc'].notna()])
neither = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna() & giant_with_hts['TGUniProtAcc'].isna()])
log_update(f"\nFusions with HGID only: {hgid_only}")
log_update(f"Fusions with TGID only: {tgid_only}")
log_update(f"Fusions with HGID and TGID: {hgid_and_tgid}")
log_update(f"Fusions with neither: {neither}")
log_update(f"Sum = {hgid_only+tgid_only+hgid_and_tgid+neither} = {len(giant_with_hts)}")
# Collect all unmapped HGIDs and TGIDs
unmapped_h = set(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna()]['Hgene'].tolist())
unmapped_t = set(giant_with_hts[giant_with_hts['TGUniProtAcc'].isna()]['Tgene'].tolist())
unmapped_parts = unmapped_h.union(unmapped_t)
log_update(f"unmapped hgenes: {len(unmapped_h)}")
log_update(f"unmapped tgenes: {len(unmapped_t)}")
log_update(f"unmapped parts (hgids or tgids): {len(unmapped_parts)}")
# We need to remap some Hgenes and Tgenes. There are some cases where FusionPDB got the wrong UniProt accessions.
wrong_uniprot_ids =[
'PRY',
'TIAF1',
'DCAF8L2',
'UMAD1',
'TIPIN',
'GAB3',
'OTOA',
'PAGR1',
'PRY2',
'FAM178A',
'SPATS2L',
'VMAC',
'ZNFX1',
'TFPT',
'TRANK1',
'RRP15',
'PAXBP1',
'RB1CC1',
'PACRGL',
'TRMT1L',
'PPPDE2',
'YY1AP1',
'RGP1',
'SHKBP1',
'RINT1',
'PRAM1',
'PIR',
'TMBIM6',
'PICK1',
'PLEC',
'NUDCD3',
'CCBL1',
'S100PBP',
'RTL1',
'C10orf140',
'CD177',
'SLF2',
'STARD3NL',
'RELL2',
'AMIGO1',
'TRAF3IP1',
'PNOC',
'PERM1',
'UBE2F',
'TBKBP1',
'PAN3',
'NSFL1C',
'SPAST',
'TOX4',
'RGPD8',
'ZDHHC9',
'SLAMF9',
'TNNT1',
'TEKT5',
'TPI1',
'TAAR6',
'SKIDA1',
'PMS1'
]
# Add Hgene accessions with commas
wrong_uniprot_ids += giant_with_hts[
~(giant_with_hts['HGUniProtAcc'].isna()) &
(giant_with_hts['HGUniProtAcc'].str.contains(","))
]['HGUniProtAcc'].tolist()
# Add Tgene accessions with commas
wrong_uniprot_ids += giant_with_hts[
~(giant_with_hts['TGUniProtAcc'].isna()) &
(giant_with_hts['TGUniProtAcc'].str.contains(","))
]['TGUniProtAcc'].tolist()
# Get a list of the Hgenes and Tgenes that need to be ID mapped, AGAIN
hts_tomap_part2 = giant_with_hts[giant_with_hts['HGUniProtAcc'].isin(wrong_uniprot_ids)]['Hgene'].tolist()
hts_tomap_part2 += giant_with_hts[giant_with_hts['TGUniProtAcc'].isin(wrong_uniprot_ids)]['Tgene'].tolist()
hts_tomap_part2 = set(hts_tomap_part2)
log_update(f"Total head and tail genes that need to be mapped again: {len(hts_tomap_part2)}")
# Write parts that need remapping to a file for submission to UniProt
with open('processed_data/fusionpdb/intermediates/unmapped_parts.txt','w') as f:
for part in unmapped_parts:
f.write(f'{part}\n')
for part in hts_tomap_part2:
f.write(f'{part}\n')
# set the accession to nan if it's in wrong_uniprot_ids
giant_with_hts.loc[
giant_with_hts['HGUniProtAcc'].isin(wrong_uniprot_ids),
'HGUniProtAcc'
] = np.nan
giant_with_hts.loc[
giant_with_hts['TGUniProtAcc'].isin(wrong_uniprot_ids),
'TGUniProtAcc'
] = np.nan
# We did the ID Map ahead of time on UniProt. Use this file.
idmap = pd.read_csv(f'raw_data/fusionpdb/hgene_tgene_uniprot_idmap_07_10_2024.txt',sep='\t')
# are there multiple GeneIDs for anything?
idmap['n_GeneID'] = idmap['GeneID'].apply(lambda x: [y for y in str(x).strip().split(';') if len(y)>0])
idmap['n_GeneID'] = idmap['n_GeneID'].apply(lambda x: len(x))
# are they all length 1?
log_update(f"Genes may have the following total #s of gene IDs: {idmap['n_GeneID'].unique()}")
# no they're not
# do they all end in ;?
log_update(f"All GeneIDs end in ; {idmap['GeneID'].apply(lambda x: x[-1] == ';' if type(x)==str else True).all()}") # say true if it's Nan, we don't care about this
# yes they do
# Merge new IDMap data from UniProt re-mapping with previous data
# Merge twice: one time, we merge as if the mapped genes are Hgenes; the other time, as if the mapped genes are Tgenes
idmap_merge = pd.merge(giant_with_hts, idmap[['From','Entry','GeneID']].rename(columns={'From':'Hgene', 'Entry': 'Entry_Hgene', 'GeneID': 'GeneID_Hgene'}), on='Hgene',how='left')
idmap_merge = pd.merge(idmap_merge, idmap[['From','Entry','GeneID']].rename(columns={'From':'Tgene', 'Entry': 'Entry_Tgene', 'GeneID': 'GeneID_Tgene'}), on='Tgene',how='left')
# From the original data, we have HGIDs and from the UniProt result, we have HGIDs with ; on the end. So make a 'HGID;' column to see if these match
idmap_merge['HGID;'] = idmap_merge['HGID'].astype(str) + ';'
idmap_merge['TGID;'] = idmap_merge['TGID'].astype(str) + ';'
# "Found" is true if the HGID; from the FusionPDB mapping is one of the GeneIDs returned by UniProt
idmap_merge['HGID_Found'] = idmap_merge.apply(lambda row: row['HGID;'] in str(row['GeneID_Hgene']), axis=1)
idmap_merge['TGID_Found'] = idmap_merge.apply(lambda row: row['TGID;'] in str(row['GeneID_Tgene']), axis=1)
# what do we keep from idmap merge?
# we keep columns where: there's an HGID and a TGID, OR
# if one of them is nan, its given gene id is in the list returned by uniprot
idmap_merge_success = idmap_merge.loc[
# Both were there to begin with
((idmap_merge['HGUniProtAcc'].notna()) & (idmap_merge['TGUniProtAcc'].notna())) |
# Hgene was missing, correct HGID was found
((idmap_merge['HGUniProtAcc'].isna()) & (idmap_merge['HGID_Found']==True)) |
# Tgene was missing, correct TGID was found
((idmap_merge['TGUniProtAcc'].isna()) & (idmap_merge['TGID_Found']==True))
].reset_index(drop=True)
idmap_merge_success['FusionGID'] = idmap_merge_success['FusionGID'].astype(str)
log_update(f"rows: {len(idmap_merge_success)}")
log_update(f"unique successful fusion GIDs: {len(idmap_merge_success['FusionGID'].unique())}")
    # There are duplicate rows with different GIDs for Hgenes and Tgenes. Here's the scheme for which row to keep.
# Priority 1: HGUniProtAcc.notna() and TGUniProtAcc.notna()
# Priority 2: HGID_Found and TGID_Found, and FusionGID is NOT in partition 1
# Priority 3: HGID_Found or TGID_Found, and FusionGID is NOT in partition 1 or 2.
# ^ If we get here, only one (Hgene OR Tgene) was found, since all combinations were tested.
# For whichever one worked, combine the appropriate information and get rid of the rest
partition1 = idmap_merge_success.loc[
# Both were there to begin with
((idmap_merge_success['HGUniProtAcc'].notna()) & (idmap_merge_success['TGUniProtAcc'].notna()))
].reset_index(drop=True)
partition1_gids = set(partition1['FusionGID'].tolist())
log_update("Partition 1: HGUniProtAcc.notna() and TGUniProtAcc.notna() --> both UniProt accessions were found on FusionPDB")
log_update(f"\t# GIDs: {len(partition1_gids)}")
partition2 = idmap_merge_success.loc[
# Hgene was missing, correct HGID was found or Tgene was missing, correct TGID was found.
(idmap_merge_success['HGID_Found']==True) & (idmap_merge_success['TGID_Found']==True) &
~(idmap_merge_success['FusionGID'].isin(partition1_gids))
].reset_index(drop=True)
partition2_gids = set(partition2['FusionGID'].tolist())
log_update("Partition 2: HGID_Found & TGID_Found --> both UniProt accessions were mapped successfully; one or both was found by remapping on UniProt")
log_update(f"\t# GIDs: {len(partition2_gids)}")
partition3 = idmap_merge_success.loc[
# it didn't satisfy one of the criteria for the first two partitions
~(
((idmap_merge_success['HGUniProtAcc'].notna()) & (idmap_merge_success['TGUniProtAcc'].notna())) |
((idmap_merge_success['HGID_Found']==True) & (idmap_merge_success['TGID_Found']==True))
) &
# one of the mapping was a success
((idmap_merge_success['HGID_Found']==True) | (idmap_merge_success['TGID_Found']==True)) &
# the FusionGID is not in partition 1 or 2
~(idmap_merge_success['FusionGID'].isin(partition1_gids)) &
~(idmap_merge_success['FusionGID'].isin(partition2_gids))
].reset_index(drop=True)
partition3_gids = set(partition3['FusionGID'].tolist())
log_update("Partition 3: HGID_Found or TGID_Found --> only one was successful, this was the best we can do")
log_update(f"\t# GIDs: {len(partition3_gids)}")
# check for dups
partition1_dups = partition1[partition1.duplicated('FusionGID')]['FusionGID'].unique().tolist()
partition2_dups = partition2[partition2.duplicated('FusionGID')]['FusionGID'].unique().tolist()
partition3_dups = partition3[partition3.duplicated('FusionGID')]['FusionGID'].unique().tolist()
log_update(f"\nDuplicate IDs in partition 1: {len(partition1_dups)}")
log_update(f"Duplicate IDs in partition 2: {len(partition2_dups)}")
log_update(f"Duplicate IDs in partition 3: {len(partition3_dups)} \tDuplicate rows: {len(partition3[partition3['FusionGID'].isin(partition3_dups)])}")
log_update(f"\nRows in original dataset: {len(idmap_merge_success)}")
log_update(f"Rows in partitions: {len(partition1)+len(partition2)+len(partition3)}")
# Check that all GIDs are represented
all_starting_fusiongids = set(idmap_merge_success['FusionGID'].tolist())
all_fusiongids = set(partition1['FusionGID'].tolist()) | set(partition2['FusionGID'].tolist()) | set(partition3['FusionGID'].tolist())
log_update(f"\nFusion GIDs captured in original dataset: {len(all_fusiongids)} {len(partition1_gids)+len(partition2_gids)+len(partition3_gids)}")
log_update(f"Fusion GIDs captured in the 3 partitions: {len(all_starting_fusiongids)}")
log_update(f"Same set? {all_starting_fusiongids == all_fusiongids}")
# Deal with partition 3's duplicates - group by FusionGID and join all the UniProt IDs returned for HGene and TGene
partition3['Entry_Hgene'] = partition3['Entry_Hgene'].astype(str)
partition3['Entry_Tgene'] = partition3['Entry_Tgene'].astype(str)
partition3 = partition3.groupby('FusionGID').agg({
'FusionGID': 'first',
'FusionGene': 'first',
'Hgene': 'first',
'Tgene': 'first',
'URL': 'first',
'HGID': 'first',
'TGID': 'first',
'HGUniProtAcc': 'first',
'TGUniProtAcc': 'first',
'Entry_Hgene': lambda x: ','.join(set([y for y in x])),
'GeneID_Hgene': 'first',
'Entry_Tgene': lambda x: ','.join(set([y for y in x])),
'GeneID_Tgene': 'first',
'HGID;': 'first',
'TGID;': 'first',
        'HGID_Found': 'first',  # there should only be one
        'TGID_Found': 'first'   # there should only be one
}
).reset_index(drop=True)
# Finally, recombine
recombined = pd.concat(
[
partition1,
partition2,
partition3
]
).reset_index(drop=True)
# there should be no duplicate GIDs in this
log_update(f"Duplicate GID rows: {len(recombined[recombined.duplicated('FusionGID')])}")
recombined = recombined.replace({np.nan: None, 'nan': None})
# Add the UniProt source so it's clear where we got these IDs from
recombined['HGUniProtAcc_Source'] = recombined.apply(lambda row: find_h_source(row), axis=1)
recombined['TGUniProtAcc_Source'] = recombined.apply(lambda row: find_t_source(row), axis=1)
recombined['HGUniProtAcc'] = recombined.apply(lambda row: correct_huniprot(row), axis=1)
recombined['TGUniProtAcc'] = recombined.apply(lambda row: correct_tuniprot(row), axis=1)
# Check: every row that has "UniProt ID Map" for HGUniProtAcc_Source should have an Entry_Hgene
log_update(f"Every row with UniProt ID Map as HGUniProtAcc_Source has an Entry_Hgene: {recombined.loc[recombined['HGUniProtAcc_Source']=='UniProt ID Map']['Entry_Hgene'].apply(lambda x: x is not None).all()}")
log_update(f"Every row with UniProt ID Map as TGUniProtAcc_Source has an Entry_Tgene: {recombined.loc[recombined['TGUniProtAcc_Source']=='UniProt ID Map']['Entry_Tgene'].apply(lambda x: x is not None).all()}")
log_update(f"Every row with no HGUniProtAcc_Source has no Entry_Hgene: {recombined.loc[recombined['HGUniProtAcc_Source']==None]['Entry_Hgene'].apply(lambda x: x is None).all()}")
log_update(f"Every row with no TGUniProtAcc_Source has no Entry_Tgene: {recombined.loc[recombined['TGUniProtAcc_Source']==None]['Entry_Tgene'].apply(lambda x: x is None).all()}")
# keep only 'FusionGID', 'FusionGene', 'Hgene', 'Tgene', 'URL', 'HGID', 'TGID', 'HGUniProtAcc', 'TGUniProtAcc', 'HGUniProtAcc_Source', 'TGUniProtAcc_Source'
recombined = recombined[['FusionGID', 'FusionGene', 'Hgene', 'Tgene', 'URL', 'HGID', 'TGID', 'HGUniProtAcc', 'TGUniProtAcc', 'HGUniProtAcc_Source', 'TGUniProtAcc_Source']]
recombined = recombined.replace({None: np.nan})
# print how many have each id
# how many rows have uniprot IDs for both?
hgid_only = len(recombined[recombined['HGUniProtAcc'].notna() & recombined['TGUniProtAcc'].isna()])
tgid_only = len(recombined[recombined['HGUniProtAcc'].isna() & recombined['TGUniProtAcc'].notna()])
hgid_and_tgid = len(recombined[recombined['HGUniProtAcc'].notna() & recombined['TGUniProtAcc'].notna()])
neither = len(recombined[recombined['HGUniProtAcc'].isna() & recombined['TGUniProtAcc'].isna()])
log_update(f"Fusions with HGID only: {hgid_only}")
log_update(f"Fusions with TGID only: {tgid_only}")
log_update(f"Fusions with HGID and TGID: {hgid_and_tgid}")
log_update(f"Fusions with neither: {neither}")
log_update(f"Sum = {hgid_only+tgid_only+hgid_and_tgid+neither} = {len(recombined)}")
return recombined
def scrape_fusionpdb_level_2_3():
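    """Run the full FusionPDB level 2/3 scrape: gene lists, structure links, head/tail info,
    combined intermediate files, and structure downloads."""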
# Scrape level 2 and save it
os.makedirs("raw_data/fusionpdb",exist_ok=True)
os.makedirs("processed_data/fusionpdb",exist_ok=True)
os.makedirs("processed_data/fusionpdb/intermediates",exist_ok=True)
matching_file = glob.glob('raw_data/fusionpdb/FusionPDB_level2_curated_*.csv')
if len(matching_file)>0:
log_update(f"\nLevel 2 was already scraped for IDs. Loading data from: {matching_file[0]}")
level2_df = pd.read_csv(matching_file[0])
else:
log_update(f"\nScraping Level 2 IDs from FusionPDB")
dt_tag = get_local_date_yr()
level2_df = get_levels_dataframe(2, print_progress=True)
level2_df['FusionGID'] = level2_df['FusionGID'].astype(str)
level2_df.to_csv(f'raw_data/fusionpdb/FusionPDB_level2_curated_{dt_tag}.csv',index=False)
# Scrape level 3 and save it
matching_file = glob.glob('raw_data/fusionpdb/FusionPDB_level3_curated_*.csv')
    if len(matching_file)>0:
log_update(f"\nLevel 3 was already scraped for IDs. Loading data from: {matching_file[0]}")
level3_df = pd.read_csv(matching_file[0])
else:
log_update(f"\nScraping Level 3 IDs from FusionPDB")
dt_tag = get_local_date_yr()
level3_df = get_levels_dataframe(3, print_progress=True)
level3_df['FusionGID'] = level3_df['FusionGID'].astype(str)
level3_df.to_csv(f'raw_data/fusionpdb/FusionPDB_level3_curated_{dt_tag}.csv',index=False)
# Check for ID overlap
level2_ids = set(level2_df['FusionGID'].tolist())
level3_ids = set(level3_df['FusionGID'].tolist())
log_update(f"Total overlapping fusionGIDs between levels 2 and 3: {len(level2_ids.intersection(level3_ids))}")
##### LEVEL 2: Get links to FusionPDB's provided structure files for all level 2 proteins in FusionPDB. Then, combine them into one file.
links_save_dir = 'raw_data/fusionpdb'
os.makedirs(links_save_dir,exist_ok=True)
process_ids(level2_ids,outdir=links_save_dir,level=2) # only processes ids that haven't been processed yet
# Get head and tail gene info for level 2 proteins
process_ids_ht(level2_ids,outdir=links_save_dir,level=2)
##### LEVEL 3: Get links to FusionPDB's provided structure files for all level 3 proteins in FusionPDB. Then, combine them into one file.
links_save_dir = 'raw_data/fusionpdb'
process_ids(level3_ids,outdir=links_save_dir,level=3)
    # Get head and tail gene info for level 3 proteins
process_ids_ht(level3_ids,outdir=links_save_dir,level=3)
# Combine head and tail data
ht_df = combine_ht_info()
ht_df.to_csv("processed_data/fusionpdb/fusion_heads_and_tails.csv",index=False)
# Combine level 2 and level 3 data in two ways: (1) giant structure links file, (2) giant file with head and tail info
log_update("\nCombining level 2 and 3 data")
giant_level2 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level2_fusion_structure_links.csv').rename(columns={'ID':'FusionGID'})
giant_level2 = pd.merge(giant_level2, level2_df[['FusionGID','FusionGene','URL','HGID','TGID']],on=['FusionGID','FusionGene'],how='left')
log_update(f"\tSize of raw_data/fusionpdb/FusionPDB_level2_fusion_structure_links.csv: {len(giant_level2)}")
giant_level3 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level3_fusion_structure_links.csv').rename(columns={'ID':'FusionGID'})
giant_level3 = pd.merge(giant_level3, level3_df[['FusionGID','FusionGene','URL','HGID','TGID']],on=['FusionGID','FusionGene'],how='left')
log_update(f"\tSize of raw_data/fusionpdb/FusionPDB_level3_fusion_structure_links.csv: {len(giant_level3)}")
giant_level2['Level'] = [2]*len(giant_level2)
giant_level3['Level'] = [3]*len(giant_level3)
##### Create some intermediate "giant" files, combining bulk info for levels 2 and 3
# These files are intermediate for two reasons:
# - giant_level2-3_fusion_protein_structure_links: doesn't have head and tail structural info
# - giant_level2-3_fusion_protein_head_tail_info: only has fusion proteins that have at least one mappable head/tail
giant_sl = pd.concat([giant_level2,giant_level3]).drop_duplicates().reset_index(drop=True)
giant_sl.sort_values(by='FusionGID',ascending=True).reset_index(drop=True).to_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv',index=False)
    # Structure link file is complete - log its size and save location
log_update(f"\nSaving file with all Level 2 and 3 Structure links (size: {len(giant_sl)}) to: processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv")
    # Create and save a CSV of combined head and tail info
giant_ht = pd.concat([level2_df,level3_df]).reset_index(drop=True)
# ensure the type is string
giant_ht['FusionGID'] = giant_ht['FusionGID'].astype(str)
giant_with_ht = combine_ht_info_with_structure_links(giant_ht, ht_df)
giant_with_ht.sort_values(by='FusionGID',ascending=True).reset_index(drop=True).to_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv',index=False)
# Download structure links
download_structures(giant_sl['Structure Link'].tolist())
def main():
with open_logfile("fetch_fusionpdb_data_log.txt"):
scrape_fusionpdb_level_2_3()
if __name__ == "__main__":
main()