# Script for fetching FusionPDB level 2 and 3 data
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import glob
import ast
import os
from pandas.errors import EmptyDataError

from fuson_plm.utils.logging import open_logfile, log_update, get_local_date_yr

def get_levels_dataframe(level, print_progress=False):
    data, headers = scrape_level(level)

    if print_progress:
        # Output the extracted data - just a few rows
        if level==2:
            log_update(f'\nTable size {len(data)}; expected 2212')
        if level==3:
            log_update(f'\nTable size {len(data)}; expected 266')
        log_update('Example rows 1-5:')
        for i, row in enumerate(data):
            log_update(row)
            if i>5:
                break

    df = pd.DataFrame(data, columns=headers)
    df['URL'] = df['FusionGID'].apply(lambda x: x[1])
    df['FusionGID'] = df['FusionGID'].apply(lambda x: x[0])
    return df

def scrape_level(level):
    level = str(level)
    # The URL of the website
    url = f"https://compbio.uth.edu/FusionPDB/gene_search_result_0.cgi?type=chooseLevel&chooseLevel=level{level}"

    # Sending a request to the website
    response = requests.get(url)

    # Parsing the HTML content of the website
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the specific <h1> tag
    if level == '2':
        specific_h1 = soup.find('h1', text='2212 Fusion gene(s) for your query: level2')
    if level == '3':
        specific_h1 = soup.find('h1', text='266 Fusion gene(s) for your query: level3')

    # Find the specific table following the <h1> tag
    table = specific_h1.find_next('table', class_='geneList')

    # Extract headers (only first 6 fields)
    headers = [header.get_text().strip() for header in table.find_all('td', class_='content_middle_gene_summary')][0:6]

    # Extract rows
    rows = table.find_all('tr')[1:]  # Skip the header row

    # Extract data from rows
    data = []
    for row in rows:
        columns = row.find_all('td', class_='content_middle_gene_summary')
        if not columns:
            continue
        row_data = []
        for column in columns:
            link = column.find('a')
            if link:
                href = link['href']
                fusion_gid = link.get_text(strip=True)
                full_url = f"https://compbio.uth.edu/FusionPDB/{href}"
                row_data.append((fusion_gid, full_url))
            else:
                row_data.append(column.get_text(strip=True))
        data.append(row_data)

    return data, headers

def get_structure_link_dataframe(id, print_progress=False):
    rows = get_structure_links(id)

    # If printing progress, output the extracted data - just a few rows
    if print_progress:
        log_update(f'\nTable size {len(rows)}')
        log_update('Example rows 1-5:')
        for i, row in enumerate(rows):
            log_update(row)
            if i>5:
                break

    # Make the dataframe - new row for each link - ONLY if there's actually data
    if len(rows)>0:
        df = pd.DataFrame(rows)
        df = df.rename(columns={
            'Fusion protein PDB link (fusion AA seq ID in FusionPDB)': 'Structure Link'
        })
        # make a new row for each link
        df = df.explode('Structure Link').reset_index(drop=True)
        df['Structure Link'] = df['Structure Link'].apply(lambda x: 'https://compbio.uth.edu/FusionPDB/' + str(x))
        df['Structure Type'] = df['Structure Link'].apply(lambda x: 'PDB' if 'pdb_files' in x else ('CIF' if 'cif_files' in x else 'Unknown'))
        df['FO_Name'] = df['Hgene'] + '::' + df['Tgene']
        # Rename FO_Name to FusionGene
        df = df.rename(columns={'FO_Name':'FusionGene'})
        df['ID'] = [id]*len(df)

        expected_cols = ['ID','Structure Link','Hgene','Hchr','Hbp','Hstrand','Tgene','Tchr','Tbp','Tstrand','Len(AA seq)','Structure Type','FusionGene','AA seq']
        for col in expected_cols:
            if not(col in list(df.columns)):
                df[col] = ['']*len(df)
        df = df[expected_cols]
        #df['FusionGID'] = df['FusionGID'].apply(lambda x: x[0])
    else:
        df = pd.DataFrame()

    return df

def get_structure_links(id, print_progress=False):
    # Define the URL
    url = f"https://compbio.uth.edu/FusionPDB/gene_search_result.cgi?page=page&type=quick_search&quick_search={id}"

    # Fetch the webpage content
    response = requests.get(url)
    html_content = response.content

    # Create a BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')
    #with open(f'FusionPDB_{id}_soup.txt','w') as f:
    #    f.write(soup.prettify())

    # Find the table with the title "Fusion Protein Structures"
    table_title = soup.find('a', {'name': 'FusionSTR'})
    rows = []

    # Check that the h2 text is "Fusion Protein Structures"
    if table_title and table_title.find('h2').text.strip() == 'Fusion Protein Structures':
        # Find the next table with class 'geneList' after the title
        table = table_title.find_next('table', class_='geneList')
        table = table.find_next('table')
        if table:
            if print_progress:
                log_update('table found')
            # Extract the headers from the first row
            header_row = table.find('tr')
            headers = [header.get_text(strip=True) for header in header_row.find_all('strong')]

            # Extract the rows
            rows = []
            for row in table.find_all('tr')[1:]:  # Skip the header row
                cells = row.find_all('td')
                row_data = {}
                skip_next = False  # Flag to skip the next cell
                for i, cell in enumerate(cells):
                    # Get the link text if a link is present, otherwise get the text
                    if skip_next:
                        skip_next = False
                        continue  # Skip this cell
                    cell_text = cell.get_text(strip=True)
                    if "3D view using mol*" in cell_text:
using mol*" in cell_text: skip_next = True # Set the flag to skip the next cell continue # Skip this cell links = cell.find_all('a') if links: row_data[headers[i]] = [link.get('href') for link in links] else: celltext = cell.get_text(strip=True) if len(celltext)>0: row_data[headers[i]] = celltext if len(row_data)>0: rows.append(row_data) else: log_update('table not found') return rows def process_td_elements(soup_object, add_links=False): # Find all td elements with class "content_left_gene_summary" td_elements = soup_object.find_all('td', class_='content_left_gene_summary') # Extract and print the information data = [] for td in td_elements: # Extract the text content strong_tag = td.find('strong') if strong_tag: text_content = strong_tag.get_text(strip=True) else: text_content = td.get_text(strip=True) # Extract the link if available if add_links: link_tag = td.find('a') if link_tag: link = link_tag.get('href') text_content += f" ({link})" data.append(text_content) return data def get_hgene_tgene_info(id, print_progress=False): # Define the URL url = f"https://compbio.uth.edu/FusionPDB/gene_search_result.cgi?page=page&type=quick_search&quick_search={id}" # Fetch the webpage content response = requests.get(url) html_content = response.content # Create a BeautifulSoup object soup = BeautifulSoup(html_content, 'html.parser') #with open(f'FusionPDB_{id}_soup.txt','w') as f: #f.write(soup.prettify()) # Find the table with the title "Fusion Protein Summary" title_table = soup.find('table', class_='title') if title_table and title_table.find('h2') and title_table.find('h2').get_text(strip=True) == 'Fusion Protein Summary': # Next table is irrelevent; skip to 'geneList' table 2 tables from now gene_list_table = title_table.find_next_sibling('table', class_='geneList').find_next_sibling('table', class_='geneList') # Extract relevant data from this table data = { "Fusion gene name": [], "FusionPDB ID": [], "FusionGDB2.0 ID": [], "Gene symbol": [], "Gene ID": [], "Gene name": [], "Synonyms": [], "Cytomap": [], "Type of gene": [], "Description": [], "Modification date": [], "UniProtAcc": [] } td_data = process_td_elements(gene_list_table) # need to split td_data into 2 parts: before 'Gene symbol' and after 'Gene symbol' split_ind = td_data.index('Gene symbol') fusion_info, ht_info = [td_data[0:split_ind], td_data[split_ind::]] # first, process fusion info for info in fusion_info: if ':' in info: #log_update(info) key, value = info.split(':')[0:2] if key in data: data[key.strip()] = value.strip() # now, process ht_info up to # iterate 3 at a time j_start=0 for i in range(0, len(ht_info), 3): # get the elements #log_update(len(ht_info[i:i+3]), ht_info[i:i+3]) key, value1, value2 = ht_info[i:i+3] if key in data: data[key.strip()] = [value1.strip(), value2.strip()] if key=='UniProtAcc': break return data def process_ids(ids, outdir='', level=2): csv_filename = f'{outdir}/FusionPDB_level{level}_fusion_structure_links.csv' already_processed_ids = [] if os.path.isfile(csv_filename): already_processed_ids = pd.read_csv(csv_filename) already_processed_ids = already_processed_ids['ID'].tolist() structureless_ids = pd.read_csv("raw_data/fusionpdb/fusionpdb_structureless_ids.txt",sep="\t",header=None)[0].tolist() log_update(f'\nLevel {level}:\n\tDownloading structure links for FusionPDB IDs:') for i, id in enumerate(ids): # only process if the csv_filename isn't already a file and if it's not one of the ones with no structure if (id in already_processed_ids) or (id in structureless_ids): continue df = 
        if os.path.isfile(csv_filename):
            df.to_csv(csv_filename, mode='a', index=False, header=False)
        else:
            df.to_csv(csv_filename, mode='w', index=False)
        log_update(f'\t\t{i+1}. {id}')

def process_ids_ht(ids, outdir='', level=2):
    outfile = f'{outdir}/level{level}_head_tail_info.txt'
    if not(os.path.isfile(outfile)):
        log_update(f"\n\tAcquiring UniProt accessions of head and tail genes for Level {level}")
        with open(outfile, 'a+') as f1:
            for id in ids:
                data = get_hgene_tgene_info(id)
                data = {
                    'FusionGID': data['FusionPDB ID'],
                    'HGID': data['Gene ID'][0],
                    'TGID': data['Gene ID'][1],
                    'HGUniProtAcc': data['UniProtAcc'][0],
                    'TGUniProtAcc': data['UniProtAcc'][1]
                }
                f1.write(str(data))
                f1.write('\n')
                f1.flush()
    else:
        log_update(f"\nAlready acquired UniProt accessions of head and tail genes for Level {level} at: {outfile}")

def download_file(url, directory):
    # Download file, IF its destination doesn't already have a file there
    local_filename = os.path.join(directory, url.split('/')[-1])
    if os.path.exists(local_filename):
        return local_filename
    response = requests.get(url)
    response.raise_for_status()
    with open(local_filename, 'wb') as file:
        file.write(response.content)
    return local_filename

def download_structures(download_links):
    # Directory where you want to save the downloaded files
    download_directory = "raw_data/fusionpdb/structures"
    os.makedirs(download_directory, exist_ok=True)

    # Download all files
    for link in download_links:
        try:
            log_update(f"Downloading {link}...")
            download_file(link, download_directory)
            log_update(f"\tDownloaded {link} to {download_directory}")
        except Exception as e:
            log_update(f"\tFailed to download {link}. Reason: {e}")

    log_update("All downloads completed.")

def combine_ht_info():
    # read the head and tail info that was collected for levels 2 and 3
    outdir = 'raw_data/fusionpdb'
    head_tail_data = []
    with open(f'{outdir}/level2_head_tail_info.txt','r') as f:
        for line in f:
            # Parse the line as a dictionary using ast.literal_eval
            record = ast.literal_eval(line.strip())
            head_tail_data.append(record)
    with open(f'{outdir}/level3_head_tail_info.txt','r') as f:
        for line in f:
            # Parse the line as a dictionary using ast.literal_eval
            record = ast.literal_eval(line.strip())
            head_tail_data.append(record)

    ht_df = pd.DataFrame(head_tail_data)
    ht_df['FusionGID'] = ht_df['FusionGID'].astype(str)
    return ht_df

# Add a column for the source of UniProtAcc
def find_h_source(row):
    if row['HGUniProtAcc'] is not None:
        return 'FusionPDB'
    elif row['Entry_Hgene'] is not None:
        return 'UniProt ID Map'
    else:
        return None

def find_t_source(row):
    if row['TGUniProtAcc'] is not None:
        return 'FusionPDB'
    elif row['Entry_Tgene'] is not None:
        return 'UniProt ID Map'
    else:
        return None

def correct_huniprot(row):
    if row['HGUniProtAcc'] is not None:
        return row['HGUniProtAcc']
    elif row['Entry_Hgene'] is not None:
        return row['Entry_Hgene']
    else:
        return None

def correct_tuniprot(row):
    if row['TGUniProtAcc'] is not None:
        return row['TGUniProtAcc']
    elif row['Entry_Tgene'] is not None:
        return row['Entry_Tgene']
    else:
        return None

def combine_ht_info_with_structure_links(giant, ht_df):
    # Add in the head and tail data
    giant_with_hts = pd.merge(giant, ht_df, on='FusionGID', how='left')

    # make sure it's all strings here
    giant_with_hts['HGID_x'] = giant_with_hts['HGID_x'].astype(str)
    giant_with_hts['HGID_y'] = giant_with_hts['HGID_y'].astype(str)
    giant_with_hts['TGID_x'] = giant_with_hts['TGID_x'].astype(str)
    giant_with_hts['TGID_y'] = giant_with_hts['TGID_y'].astype(str)
    giant_with_hts['HGID_match'] = giant_with_hts['HGID_x'] == giant_with_hts['HGID_y']
    giant_with_hts['TGID_match'] = giant_with_hts['TGID_x'] == giant_with_hts['TGID_y']
    # check if they're all true
    assert giant_with_hts['HGID_match'].all() and giant_with_hts['TGID_match'].all()
    # cool, all of them are true so now drop the extra columns
    giant_with_hts = giant_with_hts.drop(['HGID_x','TGID_x','HGID_match','TGID_match'], axis=1).rename(columns={'HGID_y':'HGID','TGID_y':'TGID'})
    giant_with_hts = giant_with_hts.replace('.', np.nan)

    # Check - how many rows have uniprot IDs for both head and tail?
    hgid_only = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].notna() & giant_with_hts['TGUniProtAcc'].isna()])
    tgid_only = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna() & giant_with_hts['TGUniProtAcc'].notna()])
    hgid_and_tgid = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].notna() & giant_with_hts['TGUniProtAcc'].notna()])
    neither = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna() & giant_with_hts['TGUniProtAcc'].isna()])
    log_update(f"\nFusions with HGID only: {hgid_only}")
    log_update(f"Fusions with TGID only: {tgid_only}")
    log_update(f"Fusions with HGID and TGID: {hgid_and_tgid}")
    log_update(f"Fusions with neither: {neither}")
    log_update(f"Sum = {hgid_only+tgid_only+hgid_and_tgid+neither} = {len(giant_with_hts)}")

    # Collect all unmapped HGIDs and TGIDs
    unmapped_h = set(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna()]['Hgene'].tolist())
    unmapped_t = set(giant_with_hts[giant_with_hts['TGUniProtAcc'].isna()]['Tgene'].tolist())
    unmapped_parts = unmapped_h.union(unmapped_t)
    log_update(f"unmapped hgenes: {len(unmapped_h)}")
    log_update(f"unmapped tgenes: {len(unmapped_t)}")
    log_update(f"unmapped parts (hgids or tgids): {len(unmapped_parts)}")

    # We need to remap some Hgenes and Tgenes. There are some cases where FusionPDB got the wrong UniProt accessions.
    wrong_uniprot_ids = [
        'PRY', 'TIAF1', 'DCAF8L2', 'UMAD1', 'TIPIN', 'GAB3', 'OTOA', 'PAGR1', 'PRY2', 'FAM178A', 'SPATS2L', 'VMAC',
        'ZNFX1', 'TFPT', 'TRANK1', 'RRP15', 'PAXBP1', 'RB1CC1', 'PACRGL', 'TRMT1L', 'PPPDE2', 'YY1AP1', 'RGP1',
        'SHKBP1', 'RINT1', 'PRAM1', 'PIR', 'TMBIM6', 'PICK1', 'PLEC', 'NUDCD3', 'CCBL1', 'S100PBP', 'RTL1',
        'C10orf140', 'CD177', 'SLF2', 'STARD3NL', 'RELL2', 'AMIGO1', 'TRAF3IP1', 'PNOC', 'PERM1', 'UBE2F', 'TBKBP1',
        'PAN3', 'NSFL1C', 'SPAST', 'TOX4', 'RGPD8', 'ZDHHC9', 'SLAMF9', 'TNNT1', 'TEKT5', 'TPI1', 'TAAR6', 'SKIDA1',
        'PMS1'
    ]
    # Add Hgene accessions with commas
    wrong_uniprot_ids += giant_with_hts[
        ~(giant_with_hts['HGUniProtAcc'].isna()) &
        (giant_with_hts['HGUniProtAcc'].str.contains(","))
    ]['HGUniProtAcc'].tolist()
    # Add Tgene accessions with commas
    wrong_uniprot_ids += giant_with_hts[
        ~(giant_with_hts['TGUniProtAcc'].isna()) &
        (giant_with_hts['TGUniProtAcc'].str.contains(","))
    ]['TGUniProtAcc'].tolist()

    # Get a list of the Hgenes and Tgenes that need to be ID mapped, AGAIN
    hts_tomap_part2 = giant_with_hts[giant_with_hts['HGUniProtAcc'].isin(wrong_uniprot_ids)]['Hgene'].tolist()
    hts_tomap_part2 += giant_with_hts[giant_with_hts['TGUniProtAcc'].isin(wrong_uniprot_ids)]['Tgene'].tolist()
    hts_tomap_part2 = set(hts_tomap_part2)
    log_update(f"Total head and tail genes that need to be mapped again: {len(hts_tomap_part2)}")

    # Write parts that need remapping to a file for submission to UniProt
    with open('processed_data/fusionpdb/intermediates/unmapped_parts.txt','w') as f:
        for part in unmapped_parts:
            f.write(f'{part}\n')
        for part in hts_tomap_part2:
            f.write(f'{part}\n')

    # set the accession to nan if it's in wrong_uniprot_ids
    giant_with_hts.loc[
        giant_with_hts['HGUniProtAcc'].isin(wrong_uniprot_ids), 'HGUniProtAcc'
    ] = np.nan
    giant_with_hts.loc[
        giant_with_hts['TGUniProtAcc'].isin(wrong_uniprot_ids), 'TGUniProtAcc'
    ] = np.nan

    # We did the ID Map ahead of time on UniProt. Use this file.
    idmap = pd.read_csv(f'raw_data/fusionpdb/hgene_tgene_uniprot_idmap_07_10_2024.txt', sep='\t')
    # are there multiple GeneIDs for anything?
    idmap['n_GeneID'] = idmap['GeneID'].apply(lambda x: [y for y in str(x).strip().split(';') if len(y)>0])
    idmap['n_GeneID'] = idmap['n_GeneID'].apply(lambda x: len(x))
    # are they all length 1?
    log_update(f"Genes may have the following total #s of gene IDs: {idmap['n_GeneID'].unique()}")
    # no they're not
    # do they all end in ;?
    log_update(f"All GeneIDs end in ; {idmap['GeneID'].apply(lambda x: x[-1] == ';' if type(x)==str else True).all()}")  # say true if it's NaN, we don't care about this
    # yes they do

    # Merge new IDMap data from UniProt re-mapping with previous data
    # Merge twice: one time, we merge as if the mapped genes are Hgenes; the other time, as if the mapped genes are Tgenes
    idmap_merge = pd.merge(giant_with_hts,
                           idmap[['From','Entry','GeneID']].rename(columns={'From':'Hgene', 'Entry': 'Entry_Hgene', 'GeneID': 'GeneID_Hgene'}),
                           on='Hgene', how='left')
    idmap_merge = pd.merge(idmap_merge,
                           idmap[['From','Entry','GeneID']].rename(columns={'From':'Tgene', 'Entry': 'Entry_Tgene', 'GeneID': 'GeneID_Tgene'}),
                           on='Tgene', how='left')

    # From the original data, we have HGIDs and, from the UniProt result, we have HGIDs with ; on the end.
    # So make a 'HGID;' column to see if these match
    idmap_merge['HGID;'] = idmap_merge['HGID'].astype(str) + ';'
    idmap_merge['TGID;'] = idmap_merge['TGID'].astype(str) + ';'
    # "Found" is true if the HGID; from the FusionPDB mapping is one of the GeneIDs returned by UniProt
    idmap_merge['HGID_Found'] = idmap_merge.apply(lambda row: row['HGID;'] in str(row['GeneID_Hgene']), axis=1)
    idmap_merge['TGID_Found'] = idmap_merge.apply(lambda row: row['TGID;'] in str(row['GeneID_Tgene']), axis=1)

    # what do we keep from idmap merge?
    # we keep columns where: there's an HGID and a TGID, OR
    # if one of them is nan, its given gene id is in the list returned by uniprot
    idmap_merge_success = idmap_merge.loc[
        # Both were there to begin with
        ((idmap_merge['HGUniProtAcc'].notna()) & (idmap_merge['TGUniProtAcc'].notna())) |
        # Hgene was missing, correct HGID was found
        ((idmap_merge['HGUniProtAcc'].isna()) & (idmap_merge['HGID_Found']==True)) |
        # Tgene was missing, correct TGID was found
        ((idmap_merge['TGUniProtAcc'].isna()) & (idmap_merge['TGID_Found']==True))
    ].reset_index(drop=True)
    idmap_merge_success['FusionGID'] = idmap_merge_success['FusionGID'].astype(str)
    log_update(f"rows: {len(idmap_merge_success)}")
    log_update(f"unique successful fusion GIDs: {len(idmap_merge_success['FusionGID'].unique())}")

    # There are duplicate rows with different GIDs for Hgenes and Tgenes. Here's the scheme for which row to keep.
    # Priority 1: HGUniProtAcc.notna() and TGUniProtAcc.notna()
    # Priority 2: HGID_Found and TGID_Found, and FusionGID is NOT in partition 1
    # Priority 3: HGID_Found or TGID_Found, and FusionGID is NOT in partition 1 or 2.
    # ^ If we get here, only one (Hgene OR Tgene) was found, since all combinations were tested.
    #   For whichever one worked, combine the appropriate information and get rid of the rest
    partition1 = idmap_merge_success.loc[
        # Both were there to begin with
        ((idmap_merge_success['HGUniProtAcc'].notna()) & (idmap_merge_success['TGUniProtAcc'].notna()))
    ].reset_index(drop=True)
    partition1_gids = set(partition1['FusionGID'].tolist())
    log_update("Partition 1: HGUniProtAcc.notna() and TGUniProtAcc.notna() --> both UniProt accessions were found on FusionPDB")
    log_update(f"\t# GIDs: {len(partition1_gids)}")

    partition2 = idmap_merge_success.loc[
        # Hgene was missing, correct HGID was found, or Tgene was missing, correct TGID was found.
        (idmap_merge_success['HGID_Found']==True) &
        (idmap_merge_success['TGID_Found']==True) &
        ~(idmap_merge_success['FusionGID'].isin(partition1_gids))
    ].reset_index(drop=True)
    partition2_gids = set(partition2['FusionGID'].tolist())
    log_update("Partition 2: HGID_Found & TGID_Found --> both UniProt accessions were mapped successfully; one or both was found by remapping on UniProt")
    log_update(f"\t# GIDs: {len(partition2_gids)}")

    partition3 = idmap_merge_success.loc[
        # it didn't satisfy one of the criteria for the first two partitions
        ~(
            ((idmap_merge_success['HGUniProtAcc'].notna()) & (idmap_merge_success['TGUniProtAcc'].notna())) |
            ((idmap_merge_success['HGID_Found']==True) & (idmap_merge_success['TGID_Found']==True))
        ) &
        # one of the mappings was a success
        ((idmap_merge_success['HGID_Found']==True) | (idmap_merge_success['TGID_Found']==True)) &
        # the FusionGID is not in partition 1 or 2
        ~(idmap_merge_success['FusionGID'].isin(partition1_gids)) &
        ~(idmap_merge_success['FusionGID'].isin(partition2_gids))
    ].reset_index(drop=True)
    partition3_gids = set(partition3['FusionGID'].tolist())
    log_update("Partition 3: HGID_Found or TGID_Found --> only one was successful; this was the best we could do")
    log_update(f"\t# GIDs: {len(partition3_gids)}")

    # check for dups
    partition1_dups = partition1[partition1.duplicated('FusionGID')]['FusionGID'].unique().tolist()
    partition2_dups = partition2[partition2.duplicated('FusionGID')]['FusionGID'].unique().tolist()
    partition3_dups = partition3[partition3.duplicated('FusionGID')]['FusionGID'].unique().tolist()
    log_update(f"\nDuplicate IDs in partition 1: {len(partition1_dups)}")
    log_update(f"Duplicate IDs in partition 2: {len(partition2_dups)}")
    log_update(f"Duplicate IDs in partition 3: {len(partition3_dups)} \tDuplicate rows: {len(partition3[partition3['FusionGID'].isin(partition3_dups)])}")
    log_update(f"\nRows in original dataset: {len(idmap_merge_success)}")
    log_update(f"Rows in partitions: {len(partition1)+len(partition2)+len(partition3)}")

    # Check that all GIDs are represented
    all_starting_fusiongids = set(idmap_merge_success['FusionGID'].tolist())
    all_fusiongids = set(partition1['FusionGID'].tolist()) | set(partition2['FusionGID'].tolist()) | set(partition3['FusionGID'].tolist())
    log_update(f"\nFusion GIDs captured in original dataset: {len(all_starting_fusiongids)}")
    log_update(f"Fusion GIDs captured in the 3 partitions: {len(all_fusiongids)} = {len(partition1_gids)+len(partition2_gids)+len(partition3_gids)}")
    log_update(f"Same set? {all_starting_fusiongids == all_fusiongids}")
{all_starting_fusiongids == all_fusiongids}") # Deal with partition 3's duplicates - group by FusionGID and join all the UniProt IDs returned for HGene and TGene partition3['Entry_Hgene'] = partition3['Entry_Hgene'].astype(str) partition3['Entry_Tgene'] = partition3['Entry_Tgene'].astype(str) partition3 = partition3.groupby('FusionGID').agg({ 'FusionGID': 'first', 'FusionGene': 'first', 'Hgene': 'first', 'Tgene': 'first', 'URL': 'first', 'HGID': 'first', 'TGID': 'first', 'HGUniProtAcc': 'first', 'TGUniProtAcc': 'first', 'Entry_Hgene': lambda x: ','.join(set([y for y in x])), 'GeneID_Hgene': 'first', 'Entry_Tgene': lambda x: ','.join(set([y for y in x])), 'GeneID_Tgene': 'first', 'HGID;': 'first', 'TGID;': 'first', 'HGID_Found': 'first',# there should only be one 'TGID_Found': 'first'# there should only be one } ).reset_index(drop=True) # Finally, recombine recombined = pd.concat( [ partition1, partition2, partition3 ] ).reset_index(drop=True) # there should be no duplicate GIDs in this log_update(f"Duplicate GID rows: {len(recombined[recombined.duplicated('FusionGID')])}") recombined = recombined.replace({np.nan: None, 'nan': None}) # Add the UniProt source so it's clear where we got these IDs from recombined['HGUniProtAcc_Source'] = recombined.apply(lambda row: find_h_source(row), axis=1) recombined['TGUniProtAcc_Source'] = recombined.apply(lambda row: find_t_source(row), axis=1) recombined['HGUniProtAcc'] = recombined.apply(lambda row: correct_huniprot(row), axis=1) recombined['TGUniProtAcc'] = recombined.apply(lambda row: correct_tuniprot(row), axis=1) # Check: every row that has "UniProt ID Map" for HGUniProtAcc_Source should have an Entry_Hgene log_update(f"Every row with UniProt ID Map as HGUniProtAcc_Source has an Entry_Hgene: {recombined.loc[recombined['HGUniProtAcc_Source']=='UniProt ID Map']['Entry_Hgene'].apply(lambda x: x is not None).all()}") log_update(f"Every row with UniProt ID Map as TGUniProtAcc_Source has an Entry_Tgene: {recombined.loc[recombined['TGUniProtAcc_Source']=='UniProt ID Map']['Entry_Tgene'].apply(lambda x: x is not None).all()}") log_update(f"Every row with no HGUniProtAcc_Source has no Entry_Hgene: {recombined.loc[recombined['HGUniProtAcc_Source']==None]['Entry_Hgene'].apply(lambda x: x is None).all()}") log_update(f"Every row with no TGUniProtAcc_Source has no Entry_Tgene: {recombined.loc[recombined['TGUniProtAcc_Source']==None]['Entry_Tgene'].apply(lambda x: x is None).all()}") # keep only 'FusionGID', 'FusionGene', 'Hgene', 'Tgene', 'URL', 'HGID', 'TGID', 'HGUniProtAcc', 'TGUniProtAcc', 'HGUniProtAcc_Source', 'TGUniProtAcc_Source' recombined = recombined[['FusionGID', 'FusionGene', 'Hgene', 'Tgene', 'URL', 'HGID', 'TGID', 'HGUniProtAcc', 'TGUniProtAcc', 'HGUniProtAcc_Source', 'TGUniProtAcc_Source']] recombined = recombined.replace({None: np.nan}) # print how many have each id # how many rows have uniprot IDs for both? 
    hgid_only = len(recombined[recombined['HGUniProtAcc'].notna() & recombined['TGUniProtAcc'].isna()])
    tgid_only = len(recombined[recombined['HGUniProtAcc'].isna() & recombined['TGUniProtAcc'].notna()])
    hgid_and_tgid = len(recombined[recombined['HGUniProtAcc'].notna() & recombined['TGUniProtAcc'].notna()])
    neither = len(recombined[recombined['HGUniProtAcc'].isna() & recombined['TGUniProtAcc'].isna()])
    log_update(f"Fusions with HGID only: {hgid_only}")
    log_update(f"Fusions with TGID only: {tgid_only}")
    log_update(f"Fusions with HGID and TGID: {hgid_and_tgid}")
    log_update(f"Fusions with neither: {neither}")
    log_update(f"Sum = {hgid_only+tgid_only+hgid_and_tgid+neither} = {len(recombined)}")

    return recombined

def scrape_fusionpdb_level_2_3():
    # Scrape level 2 and save it
    os.makedirs("raw_data/fusionpdb", exist_ok=True)
    os.makedirs("processed_data/fusionpdb", exist_ok=True)
    os.makedirs("processed_data/fusionpdb/intermediates", exist_ok=True)

    matching_file = glob.glob('raw_data/fusionpdb/FusionPDB_level2_curated_*.csv')
    if len(matching_file)>0:
        log_update(f"\nLevel 2 was already scraped for IDs. Loading data from: {matching_file[0]}")
        level2_df = pd.read_csv(matching_file[0])
    else:
        log_update(f"\nScraping Level 2 IDs from FusionPDB")
        dt_tag = get_local_date_yr()
        level2_df = get_levels_dataframe(2, print_progress=True)
        level2_df['FusionGID'] = level2_df['FusionGID'].astype(str)
        level2_df.to_csv(f'raw_data/fusionpdb/FusionPDB_level2_curated_{dt_tag}.csv', index=False)

    # Scrape level 3 and save it
    matching_file = glob.glob('raw_data/fusionpdb/FusionPDB_level3_curated_*.csv')
    if len(matching_file)>0:
        log_update(f"\nLevel 3 was already scraped for IDs. Loading data from: {matching_file[0]}")
        level3_df = pd.read_csv(matching_file[0])
    else:
        log_update(f"\nScraping Level 3 IDs from FusionPDB")
        dt_tag = get_local_date_yr()
        level3_df = get_levels_dataframe(3, print_progress=True)
        level3_df['FusionGID'] = level3_df['FusionGID'].astype(str)
        level3_df.to_csv(f'raw_data/fusionpdb/FusionPDB_level3_curated_{dt_tag}.csv', index=False)

    # Check for ID overlap
    level2_ids = set(level2_df['FusionGID'].tolist())
    level3_ids = set(level3_df['FusionGID'].tolist())
    log_update(f"Total overlapping fusionGIDs between levels 2 and 3: {len(level2_ids.intersection(level3_ids))}")

    ##### LEVEL 2: Get links to FusionPDB's provided structure files for all level 2 proteins in FusionPDB. Then, combine them into one file.
    links_save_dir = 'raw_data/fusionpdb'
    os.makedirs(links_save_dir, exist_ok=True)
    process_ids(level2_ids, outdir=links_save_dir, level=2)  # only processes ids that haven't been processed yet
    # Get head and tail gene info for level 2 proteins
    process_ids_ht(level2_ids, outdir=links_save_dir, level=2)

    ##### LEVEL 3: Get links to FusionPDB's provided structure files for all level 3 proteins in FusionPDB. Then, combine them into one file.
    links_save_dir = 'raw_data/fusionpdb'
    process_ids(level3_ids, outdir=links_save_dir, level=3)
    # Get head and tail gene info for level 3 proteins
    process_ids_ht(level3_ids, outdir=links_save_dir, level=3)

    # Combine head and tail data
    ht_df = combine_ht_info()
    ht_df.to_csv("processed_data/fusionpdb/fusion_heads_and_tails.csv", index=False)

    # Combine level 2 and level 3 data in two ways: (1) giant structure links file, (2) giant file with head and tail info
    log_update("\nCombining level 2 and 3 data")
    giant_level2 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level2_fusion_structure_links.csv').rename(columns={'ID':'FusionGID'})
    giant_level2 = pd.merge(giant_level2, level2_df[['FusionGID','FusionGene','URL','HGID','TGID']], on=['FusionGID','FusionGene'], how='left')
    log_update(f"\tSize of raw_data/fusionpdb/FusionPDB_level2_fusion_structure_links.csv: {len(giant_level2)}")
    giant_level3 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level3_fusion_structure_links.csv').rename(columns={'ID':'FusionGID'})
    giant_level3 = pd.merge(giant_level3, level3_df[['FusionGID','FusionGene','URL','HGID','TGID']], on=['FusionGID','FusionGene'], how='left')
    log_update(f"\tSize of raw_data/fusionpdb/FusionPDB_level3_fusion_structure_links.csv: {len(giant_level3)}")
    giant_level2['Level'] = [2]*len(giant_level2)
    giant_level3['Level'] = [3]*len(giant_level3)

    ##### Create some intermediate "giant" files, combining bulk info for levels 2 and 3
    # These files are intermediate for two reasons:
    # - giant_level2-3_fusion_protein_structure_links: doesn't have head and tail structural info
    # - giant_level2-3_fusion_protein_head_tail_info: only has fusion proteins that have at least one mappable head/tail
    giant_sl = pd.concat([giant_level2, giant_level3]).drop_duplicates().reset_index(drop=True)
    giant_sl.sort_values(by='FusionGID', ascending=True).reset_index(drop=True).to_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv', index=False)
    # Structure link file should be done - make sure there are no duplicate GIDs
    log_update(f"\nSaving file with all Level 2 and 3 Structure links (size: {len(giant_sl)}) to: processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv")

    # Create and save a CSV of combined head and tail info
    giant_ht = pd.concat([level2_df, level3_df]).reset_index(drop=True)
    # ensure the type is string
    giant_ht['FusionGID'] = giant_ht['FusionGID'].astype(str)
    giant_with_ht = combine_ht_info_with_structure_links(giant_ht, ht_df)
    giant_with_ht.sort_values(by='FusionGID', ascending=True).reset_index(drop=True).to_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv', index=False)

    # Download structure links
    download_structures(giant_sl['Structure Link'].tolist())

def main():
    with open_logfile("fetch_fusionpdb_data_log.txt"):
        scrape_fusionpdb_level_2_3()

if __name__ == "__main__":
    main()
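
# Example usage (a minimal sketch, assuming the fuson_plm package is importable and the
# raw_data/ and processed_data/ directories referenced above exist relative to the
# working directory). Running the full pipeline:
#
#   python fetch_fusionpdb_data.py
#
# To sanity-check the scraper on a single entry before a full run, one option is to call
# get_structure_link_dataframe directly; the FusionGID below is a hypothetical example,
# not one confirmed to exist in FusionPDB:
#
#   df = get_structure_link_dataframe(1212, print_progress=True)
#   print(df[['ID', 'FusionGene', 'Structure Type', 'Structure Link']].head())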