# FusOn-pLM/fuson_plm/benchmarking/caid/scrape_fusionpdb.py
# Script for fetching FusionPDB level 2 and 3 data
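# Usage (assumed): run from the benchmarking/caid directory so the relative raw_data/ and
# processed_data/ paths resolve, e.g. `python scrape_fusionpdb.py`.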
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import glob
import ast
import os
from pandas.errors import EmptyDataError
from fuson_plm.utils.logging import open_logfile, log_update, get_local_date_yr
def get_levels_dataframe(level, print_progress=False):
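    """Scrape the FusionPDB gene list for a given level (2 or 3) and return it as a DataFrame.

    The 'FusionGID' column is split into a plain ID column and a 'URL' column pointing to the
    gene's FusionPDB page. If print_progress is True, a few example rows are logged.
    """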
data, headers = scrape_level(level)
if print_progress:
# Output the extracted data - just a few rows
if level==2:
log_update(f'\nTable size {len(data)}; expected 2212')
if level==3:
log_update(f'\nTable size {len(data)}; expected 266')
        log_update('Example rows 1-5:')
        for i, row in enumerate(data):
            log_update(row)
            if i >= 4: break
df = pd.DataFrame(data, columns=headers)
df['URL'] = df['FusionGID'].apply(lambda x: x[1])
df['FusionGID'] = df['FusionGID'].apply(lambda x: x[0])
return df
def scrape_level(level):
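    """Request the FusionPDB level search page and parse the gene summary table.

    Returns (data, headers): a list of row values, where cells containing a link become
    (text, URL) tuples, plus the first six column headers of the 'geneList' table.
    """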
level = str(level)
# The URL of the website
url = f"https://compbio.uth.edu/FusionPDB/gene_search_result_0.cgi?type=chooseLevel&chooseLevel=level{level}"
# Sending a request to the website
response = requests.get(url)
# Parsing the HTML content of the website
soup = BeautifulSoup(response.content, 'html.parser')
# Find the specific <h1> tag
    if level == '2':
        specific_h1 = soup.find('h1', string='2212 Fusion gene(s) for your query: level2')
    if level == '3':
        specific_h1 = soup.find('h1', string='266 Fusion gene(s) for your query: level3')
# Find the specific table following the <h1> tag
table = specific_h1.find_next('table', class_='geneList')
# Extract headers (only first 6 fields)
headers = [header.get_text().strip() for header in table.find_all('td', class_='content_middle_gene_summary')][0:6]
# Extract rows
rows = table.find_all('tr')[1:] # Skip the header row
# Extract data from rows
data = []
for row in rows:
columns = row.find_all('td', class_='content_middle_gene_summary')
if not columns:
continue
row_data = []
for column in columns:
link = column.find('a')
if link:
href = link['href']
fusion_gid = link.get_text(strip=True)
full_url = f"https://compbio.uth.edu/FusionPDB/{href}"
row_data.append((fusion_gid, full_url))
else:
row_data.append(column.get_text(strip=True))
data.append(row_data)
return data, headers
def get_structure_link_dataframe(id, print_progress=False):
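    """Build a DataFrame of structure download links for one FusionPDB ID.

    Each structure link becomes its own row; a 'Structure Type' column marks whether the link
    points to a PDB or CIF file, and missing expected columns are filled with empty strings.
    Returns an empty DataFrame if the entry has no structures.
    """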
rows = get_structure_links(id)
# IF printing progress, output the extracted data - just a few rows
if print_progress:
log_update(f'\nTable size {len(rows)}')
        log_update('Example rows 1-5:')
        for i, row in enumerate(rows):
            log_update(row)
            if i >= 4: break
# Make the dataframe - new row for each link - ONLY if there's actually data
if len(rows)>0:
df = pd.DataFrame(rows)
df = df.rename(columns={
'Fusion protein PDB link (fusion AA seq ID in FusionPDB)': 'Structure Link'
})
# make a new row for each link
df = df.explode('Structure Link').reset_index(drop=True)
df['Structure Link'] = df['Structure Link'].apply(lambda x: 'https://compbio.uth.edu/FusionPDB/' + str(x))
df['Structure Type'] = df['Structure Link'].apply(lambda x: 'PDB' if 'pdb_files' in x else ('CIF' if 'cif_files' in x else 'Unknown'))
df['FO_Name'] = df['Hgene'] + '::' + df['Tgene']
# Rename FO_Name to FusionGene
df = df.rename(columns={'FO_Name':'FusionGene'})
df['ID'] = [id]*len(df)
expected_cols = ['ID','Structure Link','Hgene','Hchr','Hbp','Hstrand','Tgene','Tchr','Tbp','Tstrand','Len(AA seq)','Structure Type','FusionGene','AA seq']
for col in expected_cols:
if not(col in list(df.columns)):
df[col] = ['']*len(df)
df = df[expected_cols]
#df['FusionGID'] = df['FusionGID'].apply(lambda x: x[0])
else:
df = pd.DataFrame()
return df
def get_structure_links(id, print_progress=False):
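    """Scrape the 'Fusion Protein Structures' table from a FusionPDB entry page.

    Returns a list of dictionaries keyed by the table headers; cells containing links are stored
    as lists of hrefs, and the '3D view using mol*' cells (plus the cell after them) are skipped.
    """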
# Define the URL
url = f"https://compbio.uth.edu/FusionPDB/gene_search_result.cgi?page=page&type=quick_search&quick_search={id}"
# Fetch the webpage content
response = requests.get(url)
html_content = response.content
# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')
#with open(f'FusionPDB_{id}_soup.txt','w') as f:
#f.write(soup.prettify())
# Find the table with the title "Fusion Protein Structures"
table_title = soup.find('a', {'name': 'FusionSTR'})
rows = []
# Check that the h2 text is "Fusion Protein Structures"
if table_title and table_title.find('h2').text.strip() == 'Fusion Protein Structures':
# Find the next table with class 'geneList' after the title
table = table_title.find_next('table', class_='geneList')
table = table.find_next('table')
if table:
if print_progress: log_update('table found')
# Extract the headers from the first row
header_row = table.find('tr')
headers = [header.get_text(strip=True) for header in header_row.find_all('strong')]
# Extract the rows
rows = []
for row in table.find_all('tr')[1:]: # Skip the header row
cells = row.find_all('td')
row_data = {}
skip_next = False # Flag to skip the next cell
for i, cell in enumerate(cells):
# Get the link text if a link is present, otherwise get the text
if skip_next:
skip_next = False
continue # Skip this cell
cell_text = cell.get_text(strip=True)
if "3D view using mol*" in cell_text:
skip_next = True # Set the flag to skip the next cell
continue # Skip this cell
links = cell.find_all('a')
if links:
row_data[headers[i]] = [link.get('href') for link in links]
else:
celltext = cell.get_text(strip=True)
if len(celltext)>0:
row_data[headers[i]] = celltext
if len(row_data)>0: rows.append(row_data)
else:
log_update('table not found')
return rows
def process_td_elements(soup_object, add_links=False):
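    """Collect the text of all <td class="content_left_gene_summary"> cells in a parsed page.

    Prefers the text inside a <strong> tag when present; if add_links is True, any hyperlink in
    the cell is appended to the text in parentheses.
    """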
# Find all td elements with class "content_left_gene_summary"
td_elements = soup_object.find_all('td', class_='content_left_gene_summary')
# Extract and print the information
data = []
for td in td_elements:
# Extract the text content
strong_tag = td.find('strong')
if strong_tag:
text_content = strong_tag.get_text(strip=True)
else:
text_content = td.get_text(strip=True)
# Extract the link if available
if add_links:
link_tag = td.find('a')
if link_tag:
link = link_tag.get('href')
text_content += f" ({link})"
data.append(text_content)
return data
def get_hgene_tgene_info(id, print_progress=False):
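    """Scrape the 'Fusion Protein Summary' table for one FusionPDB ID.

    Returns a dictionary with fusion-level fields (e.g. 'FusionPDB ID') and paired head/tail
    fields (e.g. 'Gene ID', 'UniProtAcc') stored as two-element lists [head, tail].
    """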
# Define the URL
url = f"https://compbio.uth.edu/FusionPDB/gene_search_result.cgi?page=page&type=quick_search&quick_search={id}"
# Fetch the webpage content
response = requests.get(url)
html_content = response.content
# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')
#with open(f'FusionPDB_{id}_soup.txt','w') as f:
#f.write(soup.prettify())
# Find the table with the title "Fusion Protein Summary"
title_table = soup.find('table', class_='title')
if title_table and title_table.find('h2') and title_table.find('h2').get_text(strip=True) == 'Fusion Protein Summary':
        # Next table is irrelevant; skip ahead to the second 'geneList' table
gene_list_table = title_table.find_next_sibling('table', class_='geneList').find_next_sibling('table', class_='geneList')
# Extract relevant data from this table
data = {
"Fusion gene name": [],
"FusionPDB ID": [],
"FusionGDB2.0 ID": [],
"Gene symbol": [],
"Gene ID": [],
"Gene name": [],
"Synonyms": [],
"Cytomap": [],
"Type of gene": [],
"Description": [],
"Modification date": [],
"UniProtAcc": []
}
td_data = process_td_elements(gene_list_table)
# need to split td_data into 2 parts: before 'Gene symbol' and after 'Gene symbol'
split_ind = td_data.index('Gene symbol')
fusion_info, ht_info = [td_data[0:split_ind], td_data[split_ind::]]
# first, process fusion info
for info in fusion_info:
if ':' in info:
#log_update(info)
key, value = info.split(':')[0:2]
if key in data:
data[key.strip()] = value.strip()
        # now, process ht_info three elements at a time: (field name, head gene value, tail gene value),
        # stopping once the UniProtAcc row has been captured
for i in range(0, len(ht_info), 3):
# get the elements
#log_update(len(ht_info[i:i+3]), ht_info[i:i+3])
key, value1, value2 = ht_info[i:i+3]
if key in data:
data[key.strip()] = [value1.strip(), value2.strip()]
if key=='UniProtAcc':
break
return data
def process_ids(ids, outdir='', level=2):
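    """Scrape structure download links for each FusionPDB ID and append them to a per-level CSV.

    IDs already present in the output CSV, or listed in the known structureless-ID file,
    are skipped so the scrape can be resumed.
    """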
csv_filename = f'{outdir}/FusionPDB_level{level}_fusion_structure_links.csv'
already_processed_ids = []
if os.path.isfile(csv_filename):
already_processed_ids = pd.read_csv(csv_filename)
already_processed_ids = already_processed_ids['ID'].tolist()
structureless_ids = pd.read_csv("raw_data/fusionpdb/fusionpdb_structureless_ids.txt",sep="\t",header=None)[0].tolist()
log_update(f'\nLevel {level}:\n\tDownloading structure links for FusionPDB IDs:')
for i, id in enumerate(ids):
        # only process IDs that haven't already been written to the CSV and that aren't known to be structureless
if (id in already_processed_ids) or (id in structureless_ids):
continue
df = get_structure_link_dataframe(id)
if os.path.isfile(csv_filename):
df.to_csv(csv_filename, mode='a', index=False,header=False)
else:
df.to_csv(csv_filename, mode='w', index=False)
log_update(f'\t\t{i+1}. {id}')
def process_ids_ht(ids, outdir='',level=2):
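    """Scrape head/tail gene info (gene IDs and UniProt accessions) for each FusionPDB ID.

    Results are appended one dictionary per line to a per-level text file; the whole step is
    skipped if that file already exists.
    """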
outfile = f'{outdir}/level{level}_head_tail_info.txt'
if not(os.path.isfile(outfile)):
log_update(f"\n\tAcquiring UniProt accessions of head and tail genes for Level {level}")
with open(outfile, 'a+') as f1:
for id in ids:
data = get_hgene_tgene_info(id)
data = {
'FusionGID': data['FusionPDB ID'],
'HGID': data['Gene ID'][0],
'TGID': data['Gene ID'][1],
'HGUniProtAcc': data['UniProtAcc'][0],
'TGUniProtAcc': data['UniProtAcc'][1]
}
f1.write(str(data))
f1.write('\n')
f1.flush()
else:
log_update(f"\nAlready acquired UniProt accessions of head and tail genes for Level {level} at: {outfile}")
def download_file(url, directory):
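    """Download a single file into the given directory, skipping it if it already exists there."""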
# Download file, IF its destination doesn't already have a file there
local_filename = os.path.join(directory, url.split('/')[-1])
if os.path.exists(local_filename):
return local_filename
response = requests.get(url)
response.raise_for_status()
with open(local_filename, 'wb') as file:
file.write(response.content)
return local_filename
def download_structures(download_links):
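    """Download all structure files into raw_data/fusionpdb/structures, logging any failures."""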
# Directory where you want to save the downloaded files
download_directory = "raw_data/fusionpdb/structures"
os.makedirs(download_directory, exist_ok=True)
# Download all files
for link in download_links:
try:
log_update(f"Downloading {link}...")
download_file(link, download_directory)
log_update(f"\tDownloaded {link} to {download_directory}")
except Exception as e:
log_update(f"\tFailed to download {link}. Reason: {e}")
log_update("All downloads completed.")
def combine_ht_info():
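    """Read the level 2 and level 3 head/tail info files and combine them into one DataFrame."""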
    # read the head and tail info that was collected for levels 2 and 3
outdir = 'raw_data/fusionpdb'
head_tail_data = []
with open(f'{outdir}/level2_head_tail_info.txt','r') as f:
for line in f:
# Parse the line as a dictionary using ast.literal_eval
record = ast.literal_eval(line.strip())
head_tail_data.append(record)
with open(f'{outdir}/level3_head_tail_info.txt','r') as f:
for line in f:
# Parse the line as a dictionary using ast.literal_eval
record = ast.literal_eval(line.strip())
head_tail_data.append(record)
ht_df = pd.DataFrame(head_tail_data)
ht_df['FusionGID'] = ht_df['FusionGID'].astype(str)
return ht_df
# Add a column for the source of UniProtAcc
def find_h_source(row):
if row['HGUniProtAcc'] is not None:
return 'FusionPDB'
elif row['Entry_Hgene'] is not None:
return 'UniProt ID Map'
else:
return None
def find_t_source(row):
if row['TGUniProtAcc'] is not None:
return 'FusionPDB'
elif row['Entry_Tgene'] is not None:
return 'UniProt ID Map'
else:
return None
def correct_huniprot(row):
if row['HGUniProtAcc'] is not None:
return row['HGUniProtAcc']
elif row['Entry_Hgene'] is not None:
return row['Entry_Hgene']
else:
return None
def correct_tuniprot(row):
if row['TGUniProtAcc'] is not None:
return row['TGUniProtAcc']
elif row['Entry_Tgene'] is not None:
return row['Entry_Tgene']
else:
return None
def combine_ht_info_with_structure_links(giant, ht_df):
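    """Merge head/tail UniProt info into the combined level 2/3 table and repair bad mappings.

    Accessions known to be wrong (or containing commas) are cleared and re-mapped using a
    pre-computed UniProt ID-mapping file, then the rows are partitioned by mapping quality and
    recombined into one row per FusionGID with the source of each UniProt accession recorded.
    """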
# Add in the head and tail data
giant_with_hts = pd.merge(giant, ht_df, on='FusionGID', how='left')
# make sure it's all strings here
giant_with_hts['HGID_x'] = giant_with_hts['HGID_x'].astype(str)
giant_with_hts['HGID_y'] = giant_with_hts['HGID_y'].astype(str)
giant_with_hts['TGID_x'] = giant_with_hts['TGID_x'].astype(str)
giant_with_hts['TGID_y'] = giant_with_hts['TGID_y'].astype(str)
giant_with_hts['HGID_match'] = giant_with_hts['HGID_x'] == giant_with_hts['HGID_y']
giant_with_hts['TGID_match'] = giant_with_hts['TGID_x'] == giant_with_hts['TGID_y']
# check if they're all true
assert giant_with_hts['HGID_match'].all() and giant_with_hts['TGID_match'].all()
# cool, all of them are true so now drop the extra columns
giant_with_hts = giant_with_hts.drop(['HGID_x','TGID_x','HGID_match','TGID_match'],axis=1).rename(columns={'HGID_y':'HGID','TGID_y':'TGID'})
giant_with_hts = giant_with_hts.replace('.',np.nan)
# Check - how many rows have uniprot IDs for both head and tail?
hgid_only = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].notna() & giant_with_hts['TGUniProtAcc'].isna()])
tgid_only = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna() & giant_with_hts['TGUniProtAcc'].notna()])
hgid_and_tgid = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].notna() & giant_with_hts['TGUniProtAcc'].notna()])
neither = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna() & giant_with_hts['TGUniProtAcc'].isna()])
log_update(f"\nFusions with HGID only: {hgid_only}")
log_update(f"Fusions with TGID only: {tgid_only}")
log_update(f"Fusions with HGID and TGID: {hgid_and_tgid}")
log_update(f"Fusions with neither: {neither}")
log_update(f"Sum = {hgid_only+tgid_only+hgid_and_tgid+neither} = {len(giant_with_hts)}")
# Collect all unmapped HGIDs and TGIDs
unmapped_h = set(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna()]['Hgene'].tolist())
unmapped_t = set(giant_with_hts[giant_with_hts['TGUniProtAcc'].isna()]['Tgene'].tolist())
unmapped_parts = unmapped_h.union(unmapped_t)
log_update(f"unmapped hgenes: {len(unmapped_h)}")
log_update(f"unmapped tgenes: {len(unmapped_t)}")
log_update(f"unmapped parts (hgids or tgids): {len(unmapped_parts)}")
# We need to remap some Hgenes and Tgenes. There are some cases where FusionPDB got the wrong UniProt accessions.
wrong_uniprot_ids =[
'PRY',
'TIAF1',
'DCAF8L2',
'UMAD1',
'TIPIN',
'GAB3',
'OTOA',
'PAGR1',
'PRY2',
'FAM178A',
'SPATS2L',
'VMAC',
'ZNFX1',
'TFPT',
'TRANK1',
'RRP15',
'PAXBP1',
'RB1CC1',
'PACRGL',
'TRMT1L',
'PPPDE2',
'YY1AP1',
'RGP1',
'SHKBP1',
'RINT1',
'PRAM1',
'PIR',
'TMBIM6',
'PICK1',
'PLEC',
'NUDCD3',
'CCBL1',
'S100PBP',
'RTL1',
'C10orf140',
'CD177',
'SLF2',
'STARD3NL',
'RELL2',
'AMIGO1',
'TRAF3IP1',
'PNOC',
'PERM1',
'UBE2F',
'TBKBP1',
'PAN3',
'NSFL1C',
'SPAST',
'TOX4',
'RGPD8',
'ZDHHC9',
'SLAMF9',
'TNNT1',
'TEKT5',
'TPI1',
'TAAR6',
'SKIDA1',
'PMS1'
]
# Add Hgene accessions with commas
wrong_uniprot_ids += giant_with_hts[
~(giant_with_hts['HGUniProtAcc'].isna()) &
(giant_with_hts['HGUniProtAcc'].str.contains(","))
]['HGUniProtAcc'].tolist()
# Add Tgene accessions with commas
wrong_uniprot_ids += giant_with_hts[
~(giant_with_hts['TGUniProtAcc'].isna()) &
(giant_with_hts['TGUniProtAcc'].str.contains(","))
]['TGUniProtAcc'].tolist()
# Get a list of the Hgenes and Tgenes that need to be ID mapped, AGAIN
hts_tomap_part2 = giant_with_hts[giant_with_hts['HGUniProtAcc'].isin(wrong_uniprot_ids)]['Hgene'].tolist()
hts_tomap_part2 += giant_with_hts[giant_with_hts['TGUniProtAcc'].isin(wrong_uniprot_ids)]['Tgene'].tolist()
hts_tomap_part2 = set(hts_tomap_part2)
log_update(f"Total head and tail genes that need to be mapped again: {len(hts_tomap_part2)}")
# Write parts that need remapping to a file for submission to UniProt
with open('processed_data/fusionpdb/intermediates/unmapped_parts.txt','w') as f:
for part in unmapped_parts:
f.write(f'{part}\n')
for part in hts_tomap_part2:
f.write(f'{part}\n')
# set the accession to nan if it's in wrong_uniprot_ids
giant_with_hts.loc[
giant_with_hts['HGUniProtAcc'].isin(wrong_uniprot_ids),
'HGUniProtAcc'
] = np.nan
giant_with_hts.loc[
giant_with_hts['TGUniProtAcc'].isin(wrong_uniprot_ids),
'TGUniProtAcc'
] = np.nan
# We did the ID Map ahead of time on UniProt. Use this file.
idmap = pd.read_csv(f'raw_data/fusionpdb/hgene_tgene_uniprot_idmap_07_10_2024.txt',sep='\t')
# are there multiple GeneIDs for anything?
idmap['n_GeneID'] = idmap['GeneID'].apply(lambda x: [y for y in str(x).strip().split(';') if len(y)>0])
idmap['n_GeneID'] = idmap['n_GeneID'].apply(lambda x: len(x))
# are they all length 1?
log_update(f"Genes may have the following total #s of gene IDs: {idmap['n_GeneID'].unique()}")
# no they're not
# do they all end in ;?
log_update(f"All GeneIDs end in ; {idmap['GeneID'].apply(lambda x: x[-1] == ';' if type(x)==str else True).all()}") # say true if it's Nan, we don't care about this
# yes they do
# Merge new IDMap data from UniProt re-mapping with previous data
# Merge twice: one time, we merge as if the mapped genes are Hgenes; the other time, as if the mapped genes are Tgenes
idmap_merge = pd.merge(giant_with_hts, idmap[['From','Entry','GeneID']].rename(columns={'From':'Hgene', 'Entry': 'Entry_Hgene', 'GeneID': 'GeneID_Hgene'}), on='Hgene',how='left')
idmap_merge = pd.merge(idmap_merge, idmap[['From','Entry','GeneID']].rename(columns={'From':'Tgene', 'Entry': 'Entry_Tgene', 'GeneID': 'GeneID_Tgene'}), on='Tgene',how='left')
# From the original data, we have HGIDs and from the UniProt result, we have HGIDs with ; on the end. So make a 'HGID;' column to see if these match
idmap_merge['HGID;'] = idmap_merge['HGID'].astype(str) + ';'
idmap_merge['TGID;'] = idmap_merge['TGID'].astype(str) + ';'
# "Found" is true if the HGID; from the FusionPDB mapping is one of the GeneIDs returned by UniProt
idmap_merge['HGID_Found'] = idmap_merge.apply(lambda row: row['HGID;'] in str(row['GeneID_Hgene']), axis=1)
idmap_merge['TGID_Found'] = idmap_merge.apply(lambda row: row['TGID;'] in str(row['GeneID_Tgene']), axis=1)
# what do we keep from idmap merge?
# we keep columns where: there's an HGID and a TGID, OR
# if one of them is nan, its given gene id is in the list returned by uniprot
idmap_merge_success = idmap_merge.loc[
# Both were there to begin with
((idmap_merge['HGUniProtAcc'].notna()) & (idmap_merge['TGUniProtAcc'].notna())) |
# Hgene was missing, correct HGID was found
((idmap_merge['HGUniProtAcc'].isna()) & (idmap_merge['HGID_Found']==True)) |
# Tgene was missing, correct TGID was found
((idmap_merge['TGUniProtAcc'].isna()) & (idmap_merge['TGID_Found']==True))
].reset_index(drop=True)
idmap_merge_success['FusionGID'] = idmap_merge_success['FusionGID'].astype(str)
log_update(f"rows: {len(idmap_merge_success)}")
log_update(f"unique successful fusion GIDs: {len(idmap_merge_success['FusionGID'].unique())}")
    # There are duplicate rows with different GIDs for Hgenes and Tgenes. Here's the scheme for which row to keep.
# Priority 1: HGUniProtAcc.notna() and TGUniProtAcc.notna()
# Priority 2: HGID_Found and TGID_Found, and FusionGID is NOT in partition 1
# Priority 3: HGID_Found or TGID_Found, and FusionGID is NOT in partition 1 or 2.
# ^ If we get here, only one (Hgene OR Tgene) was found, since all combinations were tested.
# For whichever one worked, combine the appropriate information and get rid of the rest
partition1 = idmap_merge_success.loc[
# Both were there to begin with
((idmap_merge_success['HGUniProtAcc'].notna()) & (idmap_merge_success['TGUniProtAcc'].notna()))
].reset_index(drop=True)
partition1_gids = set(partition1['FusionGID'].tolist())
log_update("Partition 1: HGUniProtAcc.notna() and TGUniProtAcc.notna() --> both UniProt accessions were found on FusionPDB")
log_update(f"\t# GIDs: {len(partition1_gids)}")
partition2 = idmap_merge_success.loc[
# Hgene was missing, correct HGID was found or Tgene was missing, correct TGID was found.
(idmap_merge_success['HGID_Found']==True) & (idmap_merge_success['TGID_Found']==True) &
~(idmap_merge_success['FusionGID'].isin(partition1_gids))
].reset_index(drop=True)
partition2_gids = set(partition2['FusionGID'].tolist())
log_update("Partition 2: HGID_Found & TGID_Found --> both UniProt accessions were mapped successfully; one or both was found by remapping on UniProt")
log_update(f"\t# GIDs: {len(partition2_gids)}")
partition3 = idmap_merge_success.loc[
# it didn't satisfy one of the criteria for the first two partitions
~(
((idmap_merge_success['HGUniProtAcc'].notna()) & (idmap_merge_success['TGUniProtAcc'].notna())) |
((idmap_merge_success['HGID_Found']==True) & (idmap_merge_success['TGID_Found']==True))
) &
# one of the mapping was a success
((idmap_merge_success['HGID_Found']==True) | (idmap_merge_success['TGID_Found']==True)) &
# the FusionGID is not in partition 1 or 2
~(idmap_merge_success['FusionGID'].isin(partition1_gids)) &
~(idmap_merge_success['FusionGID'].isin(partition2_gids))
].reset_index(drop=True)
partition3_gids = set(partition3['FusionGID'].tolist())
log_update("Partition 3: HGID_Found or TGID_Found --> only one was successful, this was the best we can do")
log_update(f"\t# GIDs: {len(partition3_gids)}")
# check for dups
partition1_dups = partition1[partition1.duplicated('FusionGID')]['FusionGID'].unique().tolist()
partition2_dups = partition2[partition2.duplicated('FusionGID')]['FusionGID'].unique().tolist()
partition3_dups = partition3[partition3.duplicated('FusionGID')]['FusionGID'].unique().tolist()
log_update(f"\nDuplicate IDs in partition 1: {len(partition1_dups)}")
log_update(f"Duplicate IDs in partition 2: {len(partition2_dups)}")
log_update(f"Duplicate IDs in partition 3: {len(partition3_dups)} \tDuplicate rows: {len(partition3[partition3['FusionGID'].isin(partition3_dups)])}")
log_update(f"\nRows in original dataset: {len(idmap_merge_success)}")
log_update(f"Rows in partitions: {len(partition1)+len(partition2)+len(partition3)}")
# Check that all GIDs are represented
all_starting_fusiongids = set(idmap_merge_success['FusionGID'].tolist())
all_fusiongids = set(partition1['FusionGID'].tolist()) | set(partition2['FusionGID'].tolist()) | set(partition3['FusionGID'].tolist())
log_update(f"\nFusion GIDs captured in original dataset: {len(all_fusiongids)} {len(partition1_gids)+len(partition2_gids)+len(partition3_gids)}")
log_update(f"Fusion GIDs captured in the 3 partitions: {len(all_starting_fusiongids)}")
log_update(f"Same set? {all_starting_fusiongids == all_fusiongids}")
# Deal with partition 3's duplicates - group by FusionGID and join all the UniProt IDs returned for HGene and TGene
partition3['Entry_Hgene'] = partition3['Entry_Hgene'].astype(str)
partition3['Entry_Tgene'] = partition3['Entry_Tgene'].astype(str)
partition3 = partition3.groupby('FusionGID').agg({
'FusionGID': 'first',
'FusionGene': 'first',
'Hgene': 'first',
'Tgene': 'first',
'URL': 'first',
'HGID': 'first',
'TGID': 'first',
'HGUniProtAcc': 'first',
'TGUniProtAcc': 'first',
'Entry_Hgene': lambda x: ','.join(set([y for y in x])),
'GeneID_Hgene': 'first',
'Entry_Tgene': lambda x: ','.join(set([y for y in x])),
'GeneID_Tgene': 'first',
'HGID;': 'first',
'TGID;': 'first',
        'HGID_Found': 'first',  # there should only be one
        'TGID_Found': 'first'   # there should only be one
}
).reset_index(drop=True)
# Finally, recombine
recombined = pd.concat(
[
partition1,
partition2,
partition3
]
).reset_index(drop=True)
# there should be no duplicate GIDs in this
log_update(f"Duplicate GID rows: {len(recombined[recombined.duplicated('FusionGID')])}")
recombined = recombined.replace({np.nan: None, 'nan': None})
# Add the UniProt source so it's clear where we got these IDs from
recombined['HGUniProtAcc_Source'] = recombined.apply(lambda row: find_h_source(row), axis=1)
recombined['TGUniProtAcc_Source'] = recombined.apply(lambda row: find_t_source(row), axis=1)
recombined['HGUniProtAcc'] = recombined.apply(lambda row: correct_huniprot(row), axis=1)
recombined['TGUniProtAcc'] = recombined.apply(lambda row: correct_tuniprot(row), axis=1)
# Check: every row that has "UniProt ID Map" for HGUniProtAcc_Source should have an Entry_Hgene
log_update(f"Every row with UniProt ID Map as HGUniProtAcc_Source has an Entry_Hgene: {recombined.loc[recombined['HGUniProtAcc_Source']=='UniProt ID Map']['Entry_Hgene'].apply(lambda x: x is not None).all()}")
log_update(f"Every row with UniProt ID Map as TGUniProtAcc_Source has an Entry_Tgene: {recombined.loc[recombined['TGUniProtAcc_Source']=='UniProt ID Map']['Entry_Tgene'].apply(lambda x: x is not None).all()}")
log_update(f"Every row with no HGUniProtAcc_Source has no Entry_Hgene: {recombined.loc[recombined['HGUniProtAcc_Source']==None]['Entry_Hgene'].apply(lambda x: x is None).all()}")
log_update(f"Every row with no TGUniProtAcc_Source has no Entry_Tgene: {recombined.loc[recombined['TGUniProtAcc_Source']==None]['Entry_Tgene'].apply(lambda x: x is None).all()}")
# keep only 'FusionGID', 'FusionGene', 'Hgene', 'Tgene', 'URL', 'HGID', 'TGID', 'HGUniProtAcc', 'TGUniProtAcc', 'HGUniProtAcc_Source', 'TGUniProtAcc_Source'
recombined = recombined[['FusionGID', 'FusionGene', 'Hgene', 'Tgene', 'URL', 'HGID', 'TGID', 'HGUniProtAcc', 'TGUniProtAcc', 'HGUniProtAcc_Source', 'TGUniProtAcc_Source']]
recombined = recombined.replace({None: np.nan})
# print how many have each id
# how many rows have uniprot IDs for both?
hgid_only = len(recombined[recombined['HGUniProtAcc'].notna() & recombined['TGUniProtAcc'].isna()])
tgid_only = len(recombined[recombined['HGUniProtAcc'].isna() & recombined['TGUniProtAcc'].notna()])
hgid_and_tgid = len(recombined[recombined['HGUniProtAcc'].notna() & recombined['TGUniProtAcc'].notna()])
neither = len(recombined[recombined['HGUniProtAcc'].isna() & recombined['TGUniProtAcc'].isna()])
log_update(f"Fusions with HGID only: {hgid_only}")
log_update(f"Fusions with TGID only: {tgid_only}")
log_update(f"Fusions with HGID and TGID: {hgid_and_tgid}")
log_update(f"Fusions with neither: {neither}")
log_update(f"Sum = {hgid_only+tgid_only+hgid_and_tgid+neither} = {len(recombined)}")
return recombined
def scrape_fusionpdb_level_2_3():
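    """Run the full FusionPDB level 2/3 scrape: gene lists, structure links, head/tail info,
    combined intermediate files, and structure downloads."""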
# Scrape level 2 and save it
os.makedirs("raw_data/fusionpdb",exist_ok=True)
os.makedirs("processed_data/fusionpdb",exist_ok=True)
os.makedirs("processed_data/fusionpdb/intermediates",exist_ok=True)
matching_file = glob.glob('raw_data/fusionpdb/FusionPDB_level2_curated_*.csv')
if len(matching_file)>0:
log_update(f"\nLevel 2 was already scraped for IDs. Loading data from: {matching_file[0]}")
level2_df = pd.read_csv(matching_file[0])
else:
log_update(f"\nScraping Level 2 IDs from FusionPDB")
dt_tag = get_local_date_yr()
level2_df = get_levels_dataframe(2, print_progress=True)
level2_df['FusionGID'] = level2_df['FusionGID'].astype(str)
level2_df.to_csv(f'raw_data/fusionpdb/FusionPDB_level2_curated_{dt_tag}.csv',index=False)
# Scrape level 3 and save it
matching_file = glob.glob('raw_data/fusionpdb/FusionPDB_level3_curated_*.csv')
    if len(matching_file)>0:
log_update(f"\nLevel 3 was already scraped for IDs. Loading data from: {matching_file[0]}")
level3_df = pd.read_csv(matching_file[0])
else:
log_update(f"\nScraping Level 3 IDs from FusionPDB")
dt_tag = get_local_date_yr()
level3_df = get_levels_dataframe(3, print_progress=True)
level3_df['FusionGID'] = level3_df['FusionGID'].astype(str)
level3_df.to_csv(f'raw_data/fusionpdb/FusionPDB_level3_curated_{dt_tag}.csv',index=False)
# Check for ID overlap
level2_ids = set(level2_df['FusionGID'].tolist())
level3_ids = set(level3_df['FusionGID'].tolist())
log_update(f"Total overlapping fusionGIDs between levels 2 and 3: {len(level2_ids.intersection(level3_ids))}")
##### LEVEL 2: Get links to FusionPDB's provided structure files for all level 2 proteins in FusionPDB. Then, combine them into one file.
links_save_dir = 'raw_data/fusionpdb'
os.makedirs(links_save_dir,exist_ok=True)
process_ids(level2_ids,outdir=links_save_dir,level=2) # only processes ids that haven't been processed yet
# Get head and tail gene info for level 2 proteins
process_ids_ht(level2_ids,outdir=links_save_dir,level=2)
##### LEVEL 3: Get links to FusionPDB's provided structure files for all level 3 proteins in FusionPDB. Then, combine them into one file.
links_save_dir = 'raw_data/fusionpdb'
process_ids(level3_ids,outdir=links_save_dir,level=3)
    # Get head and tail gene info for level 3 proteins
process_ids_ht(level3_ids,outdir=links_save_dir,level=3)
# Combine head and tail data
ht_df = combine_ht_info()
ht_df.to_csv("processed_data/fusionpdb/fusion_heads_and_tails.csv",index=False)
# Combine level 2 and level 3 data in two ways: (1) giant structure links file, (2) giant file with head and tail info
log_update("\nCombining level 2 and 3 data")
giant_level2 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level2_fusion_structure_links.csv').rename(columns={'ID':'FusionGID'})
giant_level2 = pd.merge(giant_level2, level2_df[['FusionGID','FusionGene','URL','HGID','TGID']],on=['FusionGID','FusionGene'],how='left')
log_update(f"\tSize of raw_data/fusionpdb/FusionPDB_level2_fusion_structure_links.csv: {len(giant_level2)}")
giant_level3 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level3_fusion_structure_links.csv').rename(columns={'ID':'FusionGID'})
giant_level3 = pd.merge(giant_level3, level3_df[['FusionGID','FusionGene','URL','HGID','TGID']],on=['FusionGID','FusionGene'],how='left')
log_update(f"\tSize of raw_data/fusionpdb/FusionPDB_level3_fusion_structure_links.csv: {len(giant_level3)}")
giant_level2['Level'] = [2]*len(giant_level2)
giant_level3['Level'] = [3]*len(giant_level3)
##### Create some intermediate "giant" files, combining bulk info for levels 2 and 3
# These files are intermediate for two reasons:
# - giant_level2-3_fusion_protein_structure_links: doesn't have head and tail structural info
# - giant_level2-3_fusion_protein_head_tail_info: only has fusion proteins that have at least one mappable head/tail
giant_sl = pd.concat([giant_level2,giant_level3]).drop_duplicates().reset_index(drop=True)
giant_sl.sort_values(by='FusionGID',ascending=True).reset_index(drop=True).to_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv',index=False)
    # Structure link file is complete - log its size and save location
log_update(f"\nSaving file with all Level 2 and 3 Structure links (size: {len(giant_sl)}) to: processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv")
    # Create and save a CSV of combined head and tail info
giant_ht = pd.concat([level2_df,level3_df]).reset_index(drop=True)
# ensure the type is string
giant_ht['FusionGID'] = giant_ht['FusionGID'].astype(str)
giant_with_ht = combine_ht_info_with_structure_links(giant_ht, ht_df)
giant_with_ht.sort_values(by='FusionGID',ascending=True).reset_index(drop=True).to_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv',index=False)
# Download structure links
download_structures(giant_sl['Structure Link'].tolist())
def main():
with open_logfile("fetch_fusionpdb_data_log.txt"):
scrape_fusionpdb_level_2_3()
if __name__ == "__main__":
main()