File size: 33,833 Bytes

bae913a

# Script for fetching FusionPDB level 2 and 3 data
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import glob
import ast
import os
from pandas.errors import EmptyDataError

from fuson_plm.utils.logging import open_logfile, log_update, get_local_date_yr

def get_levels_dataframe(level, print_progress=False):
  """Scrape one FusionPDB level table and return it as a DataFrame.

  Args:
    level: Level to scrape; forwarded unchanged to ``scrape_level``.
    print_progress: If True, log the table size (with the expected size for
      levels 2 and 3) and the first five scraped rows.

  Returns:
    A DataFrame whose columns are the scraped headers plus ``URL``. Each
    scraped ``FusionGID`` cell arrives as a ``(text, url)`` pair and is split
    into the ``FusionGID`` and ``URL`` columns.
  """
  data, headers = scrape_level(level)

  if print_progress:
    # Output the extracted data - just a few rows
    if level==2:
      log_update(f'\nTable size {len(data)}; expected 2212')
    if level==3:
      log_update(f'\nTable size {len(data)}; expected 266')
    log_update('Example rows 1-5:')
    # Slice instead of breaking on an index so exactly five rows are logged
    # (the previous `i>5` check let seven rows through).
    for row in data[:5]:
      log_update(row)

  df = pd.DataFrame(data, columns=headers)
  # Split the (text, url) pair stored in each FusionGID cell.
  df['URL'] = df['FusionGID'].apply(lambda x: x[1])
  df['FusionGID'] = df['FusionGID'].apply(lambda x: x[0])
  return df

def scrape_level(level):
  """Scrape the FusionPDB gene table for curation level 2 or 3.

  Args:
    level: 2 or 3 (int or str).

  Returns:
    ``(data, headers)``. ``headers`` holds the first six header texts of the
    table. ``data`` is a list of rows; each cell is its stripped text, or a
    ``(link_text, absolute_url)`` tuple when the cell contains a link.

  Raises:
    ValueError: If ``level`` is not 2 or 3.
    requests.HTTPError: If the server answers with an error status.
    RuntimeError: If the expected heading or table is not on the page.
  """
  level = str(level)
  if level not in ('2', '3'):
    # Previously any other level fell through both branches below and failed
    # later with an unbound-variable error.
    raise ValueError(f"Unsupported FusionPDB level {level!r}; expected 2 or 3")

  # The URL of the website
  url = f"https://compbio.uth.edu/FusionPDB/gene_search_result_0.cgi?type=chooseLevel&chooseLevel=level{level}"

  # Sending a request to the website
  response = requests.get(url)
  response.raise_for_status()

  # Parsing the HTML content of the website
  soup = BeautifulSoup(response.content, 'html.parser')

  # Find the results heading. Match on the part after the record count so the
  # lookup keeps working if the number of fusion genes in the database changes.
  heading_suffix = f'Fusion gene(s) for your query: level{level}'
  specific_h1 = soup.find(
      lambda tag: tag.name == 'h1' and tag.get_text(strip=True).endswith(heading_suffix)
  )
  if specific_h1 is None:
    raise RuntimeError(f"Could not find the results heading for level {level} at {url}")

  # Find the specific table following the <h1> tag
  table = specific_h1.find_next('table', class_='geneList')
  if table is None:
    raise RuntimeError(f"Could not find the gene table for level {level} at {url}")

  # Extract headers (only first 6 fields)
  headers = [header.get_text().strip() for header in table.find_all('td', class_='content_middle_gene_summary')][0:6]

  # Extract data from rows, skipping the header row
  data = []
  for row in table.find_all('tr')[1:]:
      columns = row.find_all('td', class_='content_middle_gene_summary')
      if not columns:
          continue

      row_data = []
      for column in columns:
          link = column.find('a')
          if link:
              # Keep both the visible text and the absolute URL of the link
              href = link['href']
              fusion_gid = link.get_text(strip=True)
              full_url = f"https://compbio.uth.edu/FusionPDB/{href}"
              row_data.append((fusion_gid, full_url))
          else:
              row_data.append(column.get_text(strip=True))
      data.append(row_data)

  return data, headers

def get_structure_link_dataframe(id, print_progress=False):
  """Build a one-row-per-structure-link DataFrame for a FusionPDB ID.

  Args:
    id: FusionPDB ID forwarded to ``get_structure_links``.
    print_progress: If True, log the number of scraped rows and the first five.

  Returns:
    A DataFrame with the columns listed in ``expected_cols`` below (columns
    missing from the scraped table are filled with empty strings), or an
    empty DataFrame when no rows were scraped.
  """
  rows = get_structure_links(id)

  # IF printing progress, output the extracted data - just a few rows
  if print_progress:
    log_update(f'\nTable size {len(rows)}')
    log_update('Example rows 1-5:')
    # Slice so exactly five rows are logged (the old `i>5` check logged seven).
    for row in rows[:5]:
      log_update(row)

  # Make the dataframe - new row for each link - ONLY if there's actually data
  if len(rows)>0:
    df = pd.DataFrame(rows)
    df = df.rename(columns={
        'Fusion protein PDB link (fusion AA seq ID in FusionPDB)': 'Structure Link'
    })
    # make a new row for each link
    df = df.explode('Structure Link').reset_index(drop=True)

    # Scraped hrefs are relative; turn them into absolute URLs.
    df['Structure Link'] = df['Structure Link'].apply(lambda x: 'https://compbio.uth.edu/FusionPDB/' + str(x))
    # The file type is inferred from the directory named in the link.
    df['Structure Type'] = df['Structure Link'].apply(lambda x: 'PDB' if 'pdb_files' in x else ('CIF' if 'cif_files' in x else 'Unknown'))
    df['FusionGene'] = df['Hgene'] + '::' + df['Tgene']
    df['ID'] = [id]*len(df)

    expected_cols = ['ID','Structure Link','Hgene','Hchr','Hbp','Hstrand','Tgene','Tchr','Tbp','Tstrand','Len(AA seq)','Structure Type','FusionGene','AA seq']
    for col in expected_cols:
      if col not in df.columns:
        df[col] = ['']*len(df)
    df = df[expected_cols]
  else:
    df = pd.DataFrame()

  return df

def get_structure_links(id, print_progress=False):
  """Scrape the "Fusion Protein Structures" table for a FusionPDB ID.

  Args:
    id: Value placed in the ``quick_search`` query parameter.
    print_progress: If True, log when the structures table is found.

  Returns:
    A list with one dict per non-empty table row, keyed by the header text at
    the cell's position. A cell containing links maps to the list of their
    ``href`` values; any other non-empty cell maps to its stripped text.
    Returns an empty list when the section or its table is missing.
  """
  # Define the URL
  url = f"https://compbio.uth.edu/FusionPDB/gene_search_result.cgi?page=page&type=quick_search&quick_search={id}"

  # Fetch the webpage content and parse it
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'html.parser')

  rows = []

  # The section is introduced by <a name="FusionSTR"> wrapping an <h2> title.
  table_title = soup.find('a', {'name': 'FusionSTR'})
  if table_title is None:
      return rows
  title_h2 = table_title.find('h2')
  if title_h2 is None or title_h2.text.strip() != 'Fusion Protein Structures':
      return rows

  # The data lives in the table that follows the first 'geneList' table after
  # the title. Either lookup can come back empty, so guard both.
  gene_list_table = table_title.find_next('table', class_='geneList')
  table = gene_list_table.find_next('table') if gene_list_table is not None else None
  if table is None:
      log_update('table not found')
      return rows

  if print_progress: log_update('table found')

  # Extract the headers from the first row
  header_row = table.find('tr')
  if header_row is None:
      return rows
  headers = [header.get_text(strip=True) for header in header_row.find_all('strong')]

  # Extract the rows
  for row in table.find_all('tr')[1:]:  # Skip the header row
      row_data = {}
      skip_next = False  # Flag to skip the next cell
      for i, cell in enumerate(row.find_all('td')):
          if skip_next:
              skip_next = False
              continue  # Skip this cell

          cell_text = cell.get_text(strip=True)
          if "3D view using mol*" in cell_text:
              # The viewer cell and the cell right after it carry no data.
              skip_next = True
              continue

          # Store the link targets if links are present, otherwise the text
          links = cell.find_all('a')
          if links:
              row_data[headers[i]] = [link.get('href') for link in links]
          elif cell_text:
              row_data[headers[i]] = cell_text
      if row_data:
          rows.append(row_data)

  return rows

def process_td_elements(soup_object, add_links=False):
  """Collect the text of every ``content_left_gene_summary`` cell.

  Args:
    soup_object: Parsed HTML node to search with ``find_all``.
    add_links: If True, append `` (href)`` of the cell's first link to its text.

  Returns:
    A list with one string per matching ``<td>``: the stripped text of the
    cell's ``<strong>`` child when there is one, otherwise of the whole cell.
  """
  def describe(cell):
    # Prefer the bold label when the cell has one.
    bold = cell.find('strong')
    text = (bold if bold else cell).get_text(strip=True)
    if not add_links:
      return text
    anchor = cell.find('a')
    if anchor:
      return text + f" ({anchor.get('href')})"
    return text

  cells = soup_object.find_all('td', class_='content_left_gene_summary')
  return [describe(cell) for cell in cells]

def get_hgene_tgene_info(id, print_progress=False):
  """Scrape the "Fusion Protein Summary" section for a FusionPDB ID.

  Args:
    id: Value placed in the ``quick_search`` query parameter.
    print_progress: Accepted for signature compatibility; not used.

  Returns:
    A dict with the keys listed in ``data`` below. Fusion-level keys found on
    the page map to a string; head/tail keys found on the page map to a
    two-item list ``[value1, value2]`` (the caller reads index 0 as the head
    gene and index 1 as the tail gene). Keys not found keep their initial
    empty list, which is also what every key holds when the summary section
    is missing from the page.
  """
  # Built up front so the function always has something to return; it used
  # to be created only inside the "section found" branch.
  data = {
    "Fusion gene name": [],
    "FusionPDB ID": [],
    "FusionGDB2.0 ID": [],
    "Gene symbol": [],
    "Gene ID": [],
    "Gene name": [],
    "Synonyms": [],
    "Cytomap": [],
    "Type of gene": [],
    "Description": [],
    "Modification date": [],
    "UniProtAcc": []
  }

  # Define the URL
  url = f"https://compbio.uth.edu/FusionPDB/gene_search_result.cgi?page=page&type=quick_search&quick_search={id}"

  # Fetch the webpage content and parse it
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'html.parser')

  # Find the table with the title "Fusion Protein Summary"
  title_table = soup.find('table', class_='title')
  title_h2 = title_table.find('h2') if title_table is not None else None
  if title_h2 is None or title_h2.get_text(strip=True) != 'Fusion Protein Summary':
    return data

  # Next table is irrelevant; skip to the 'geneList' table 2 tables from now
  first_gene_list = title_table.find_next_sibling('table', class_='geneList')
  gene_list_table = (
      first_gene_list.find_next_sibling('table', class_='geneList')
      if first_gene_list is not None else None
  )
  if gene_list_table is None:
    return data

  td_data = process_td_elements(gene_list_table)
  if 'Gene symbol' not in td_data:
    return data

  # Split td_data into 2 parts: before 'Gene symbol' and from 'Gene symbol' on
  split_ind = td_data.index('Gene symbol')
  fusion_info, ht_info = td_data[:split_ind], td_data[split_ind:]

  # First, process fusion info: "key: value" strings
  for info in fusion_info:
    if ':' in info:
      key, value = info.split(':')[0:2]
      key = key.strip()  # normalize before the lookup, not after
      if key in data:
        data[key] = value.strip()

  # Then the head/tail info, three cells at a time: (key, value1, value2)
  for i in range(0, len(ht_info), 3):
    chunk = ht_info[i:i+3]
    if len(chunk) < 3:
      break  # incomplete trailing group; nothing more to unpack
    key, value1, value2 = chunk
    key = key.strip()
    if key in data:
      data[key] = [value1.strip(), value2.strip()]
    if key == 'UniProtAcc':
      break  # last field of interest

  return data

def process_ids(ids, outdir='', level=2):
    """Fetch structure links for FusionPDB IDs and append them to a CSV.

    IDs already present in the output CSV, or listed in
    ``raw_data/fusionpdb/fusionpdb_structureless_ids.txt``, are skipped, so
    the function can be re-run to resume an interrupted download.

    Args:
        ids: Iterable of FusionPDB IDs (str or int).
        outdir: Directory holding
            ``FusionPDB_level{level}_fusion_structure_links.csv``.
        level: Level number, used in the file name and in log messages.
    """
    csv_filename = f'{outdir}/FusionPDB_level{level}_fusion_structure_links.csv'

    # IDs are compared as strings: freshly scraped IDs are str, while IDs read
    # back from CSV/TSV would otherwise be parsed as integers and never match.
    skip_ids = set()
    if os.path.isfile(csv_filename):
        try:
            processed = pd.read_csv(csv_filename, dtype={'ID': str})
        except EmptyDataError:
            processed = None  # file exists but has no content yet
        if processed is not None:
            skip_ids.update(processed['ID'].dropna().str.strip())

    structureless = pd.read_csv(
        "raw_data/fusionpdb/fusionpdb_structureless_ids.txt",
        sep="\t", header=None, dtype=str
    )[0]
    skip_ids.update(structureless.dropna().str.strip())

    log_update(f'\nLevel {level}:\n\tDownloading structure links for FusionPDB IDs:')
    for i, fusion_id in enumerate(ids):
        # only process IDs that are not in the CSV yet and are not known to lack a structure
        if str(fusion_id) in skip_ids:
            continue
        df = get_structure_link_dataframe(fusion_id)
        if df.empty:
            # Writing an empty frame would add no data and, on a new file,
            # would create a CSV without the header row.
            log_update(f'\t\t{i+1}. {fusion_id} (no structure links found)')
            continue

        # Emit the header only when starting a new (or still empty) file.
        write_header = (not os.path.isfile(csv_filename)) or os.path.getsize(csv_filename) == 0
        df.to_csv(csv_filename, mode='a', index=False, header=write_header)

        log_update(f'\t\t{i+1}. {fusion_id}')
          
def process_ids_ht(ids, outdir='',level=2):
  """Write head/tail gene IDs and UniProt accessions for each FusionPDB ID.

  One dict literal per ID is written, one per line, to
  ``{outdir}/level{level}_head_tail_info.txt``. If that file already exists,
  nothing is fetched and only a log message is emitted.

  Args:
    ids: Iterable of FusionPDB IDs passed to ``get_hgene_tgene_info``.
    outdir: Directory of the output file.
    level: Level number used in the file name and log messages.
  """
  outfile = f'{outdir}/level{level}_head_tail_info.txt'

  # Guard clause: an existing file is treated as a finished run.
  if os.path.isfile(outfile):
    log_update(f"\nAlready acquired UniProt accessions of head and tail genes for Level {level} at: {outfile}")
    return

  log_update(f"\n\tAcquiring UniProt accessions of head and tail genes for Level {level}")
  with open(outfile, 'a+') as handle:
    for fusion_id in ids:
      info = get_hgene_tgene_info(fusion_id)
      # Index 0 is the head gene, index 1 the tail gene.
      record = {
          'FusionGID': info['FusionPDB ID'],
          'HGID': info['Gene ID'][0],
          'TGID': info['Gene ID'][1],
          'HGUniProtAcc': info['UniProtAcc'][0],
          'TGUniProtAcc': info['UniProtAcc'][1]
      }
      handle.write(str(record) + '\n')
      # Flush after every record so progress is on disk as soon as it is made.
      handle.flush()
    
def download_file(url, directory):
    """Download ``url`` into ``directory`` unless the file is already there.

    Args:
        url: Address of the file; the text after its last ``/`` is used as the
            local file name.
        directory: Existing directory to save into.

    Returns:
        Path of the local file.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
    """
    local_filename = os.path.join(directory, url.split('/')[-1])
    if os.path.exists(local_filename):
        return local_filename

    # Without a timeout a stalled server would block the whole run.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Write to a temporary name and rename afterwards: an interrupted write
    # must not leave a truncated file that the existence check above would
    # later treat as a finished download.
    partial_filename = local_filename + '.part'
    try:
        with open(partial_filename, 'wb') as file:
            file.write(response.content)
        os.replace(partial_filename, local_filename)
    finally:
        # Only still present if something failed before the rename.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
    return local_filename
  
def download_structures(download_links):
  """Download every structure file into ``raw_data/fusionpdb/structures``.

  A failure on one link is logged and does not stop the remaining downloads.

  Args:
    download_links: Iterable of file URLs handed to ``download_file``.
  """
  # Destination for all downloaded structure files
  target_dir = "raw_data/fusionpdb/structures"
  os.makedirs(target_dir, exist_ok=True)

  for structure_url in download_links:
    try:
      log_update(f"Downloading {structure_url}...")
      download_file(structure_url, target_dir)
      log_update(f"\tDownloaded {structure_url} to {target_dir}")
    except Exception as err:
      # Best effort: record the problem and move on to the next link.
      log_update(f"\tFailed to download {structure_url}. Reason: {err}")

  log_update("All downloads completed.")
  
def combine_ht_info():
  """Load the level 2 and level 3 head/tail records into one DataFrame.

  Reads ``raw_data/fusionpdb/level2_head_tail_info.txt`` and
  ``raw_data/fusionpdb/level3_head_tail_info.txt``, each holding one Python
  dict literal per line. Blank lines are ignored.

  Returns:
    A DataFrame with one row per record (level 2 records first) and
    ``FusionGID`` cast to ``str``.
  """
  outdir = 'raw_data/fusionpdb'
  head_tail_data = []
  for level in (2, 3):
    with open(f'{outdir}/level{level}_head_tail_info.txt','r') as f:
      for line in f:
        line = line.strip()
        if not line:
          continue  # ast.literal_eval('') would raise SyntaxError
        # Parse the line as a dictionary; literal_eval only accepts literals
        head_tail_data.append(ast.literal_eval(line))

  ht_df = pd.DataFrame(head_tail_data)
  ht_df['FusionGID'] = ht_df['FusionGID'].astype(str)
  return ht_df

# Add a column for the source of UniProtAcc
def find_h_source(row):
    """Name where the head gene's UniProt accession comes from.

    Returns 'FusionPDB' when ``row['HGUniProtAcc']`` is set, otherwise
    'UniProt ID Map' when ``row['Entry_Hgene']`` is set, otherwise None.
    """
    candidates = (
        ('HGUniProtAcc', 'FusionPDB'),
        ('Entry_Hgene', 'UniProt ID Map'),
    )
    # First column holding a value decides the source.
    for column, source in candidates:
        if row[column] is not None:
            return source
    return None

def find_t_source(row):
    """Name where the tail gene's UniProt accession comes from.

    Returns 'FusionPDB' when ``row['TGUniProtAcc']`` is set, otherwise
    'UniProt ID Map' when ``row['Entry_Tgene']`` is set, otherwise None.
    """
    candidates = (
        ('TGUniProtAcc', 'FusionPDB'),
        ('Entry_Tgene', 'UniProt ID Map'),
    )
    # First column holding a value decides the source.
    for column, source in candidates:
        if row[column] is not None:
            return source
    return None

def correct_huniprot(row):
  """Pick the head gene's accession: FusionPDB's value, else the ID-map entry.

  Returns ``row['HGUniProtAcc']`` when it is not None; otherwise
  ``row['Entry_Hgene']``, which may itself be None.
  """
  fusionpdb_acc = row['HGUniProtAcc']
  return fusionpdb_acc if fusionpdb_acc is not None else row['Entry_Hgene']

def correct_tuniprot(row):
  """Pick the tail gene's accession: FusionPDB's value, else the ID-map entry.

  Returns ``row['TGUniProtAcc']`` when it is not None; otherwise
  ``row['Entry_Tgene']``, which may itself be None.
  """
  fusionpdb_acc = row['TGUniProtAcc']
  return fusionpdb_acc if fusionpdb_acc is not None else row['Entry_Tgene']
  
def combine_ht_info_with_structure_links(giant, ht_df):
  """Attach head/tail UniProt accessions to the combined level table.

  Merges the scraped head/tail records into ``giant``, blanks accessions
  flagged as unreliable, fills gaps from a pre-computed UniProt ID-mapping
  file, and collapses the result to one row per ``FusionGID``.

  Side effects:
    * writes ``processed_data/fusionpdb/intermediates/unmapped_parts.txt``
      (gene names that need UniProt ID mapping);
    * reads ``raw_data/fusionpdb/hgene_tgene_uniprot_idmap_07_10_2024.txt``;
    * logs counts and sanity checks through ``log_update``.

  Args:
    giant: DataFrame with at least ``FusionGID``, ``FusionGene``, ``Hgene``,
      ``Tgene``, ``URL``, ``HGID`` and ``TGID``.
    ht_df: DataFrame with ``FusionGID``, ``HGID``, ``TGID``, ``HGUniProtAcc``
      and ``TGUniProtAcc``.

  Returns:
    DataFrame with the columns ``FusionGID``, ``FusionGene``, ``Hgene``,
    ``Tgene``, ``URL``, ``HGID``, ``TGID``, ``HGUniProtAcc``,
    ``TGUniProtAcc``, ``HGUniProtAcc_Source`` and ``TGUniProtAcc_Source``.

  Raises:
    AssertionError: If the gene IDs in ``giant`` and ``ht_df`` disagree.
  """
  # Add in the head and tail data
  giant_with_hts = pd.merge(giant, ht_df, on='FusionGID', how='left')
  # make sure it's all strings here
  for col in ('HGID_x', 'HGID_y', 'TGID_x', 'TGID_y'):
    giant_with_hts[col] = giant_with_hts[col].astype(str)

  giant_with_hts['HGID_match'] = giant_with_hts['HGID_x'] == giant_with_hts['HGID_y']
  giant_with_hts['TGID_match'] = giant_with_hts['TGID_x'] == giant_with_hts['TGID_y']

  # check if they're all true
  assert giant_with_hts['HGID_match'].all() and giant_with_hts['TGID_match'].all(), \
      "Head/tail gene IDs from the level tables and the head/tail records disagree"
  # all of them are true so now drop the extra columns
  giant_with_hts = giant_with_hts.drop(['HGID_x','TGID_x','HGID_match','TGID_match'],axis=1).rename(columns={'HGID_y':'HGID','TGID_y':'TGID'})
  # '.' is treated as the placeholder for a missing value
  giant_with_hts = giant_with_hts.replace('.',np.nan)

  # Check - how many rows have uniprot IDs for both head and tail?
  hgid_only = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].notna() & giant_with_hts['TGUniProtAcc'].isna()])
  tgid_only = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna() & giant_with_hts['TGUniProtAcc'].notna()])
  hgid_and_tgid = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].notna() & giant_with_hts['TGUniProtAcc'].notna()])
  neither = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna() & giant_with_hts['TGUniProtAcc'].isna()])

  log_update(f"\nFusions with HGID only: {hgid_only}")
  log_update(f"Fusions with TGID only: {tgid_only}")
  log_update(f"Fusions with HGID and TGID: {hgid_and_tgid}")
  log_update(f"Fusions with neither: {neither}")
  log_update(f"Sum = {hgid_only+tgid_only+hgid_and_tgid+neither} = {len(giant_with_hts)}")

  # Collect all unmapped HGIDs and TGIDs
  unmapped_h = set(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna()]['Hgene'].tolist())
  unmapped_t = set(giant_with_hts[giant_with_hts['TGUniProtAcc'].isna()]['Tgene'].tolist())

  unmapped_parts = unmapped_h.union(unmapped_t)
  log_update(f"unmapped hgenes: {len(unmapped_h)}")
  log_update(f"unmapped tgenes: {len(unmapped_t)}")
  log_update(f"unmapped parts (hgids or tgids): {len(unmapped_parts)}")

  # We need to remap some Hgenes and Tgenes. There are some cases where FusionPDB got the wrong UniProt accessions.
  # NOTE(review): these values are matched against the HGUniProtAcc/TGUniProtAcc
  # columns below although they look like gene symbols - confirm this is intended.
  wrong_uniprot_ids = [
    'PRY', 'TIAF1', 'DCAF8L2', 'UMAD1', 'TIPIN', 'GAB3', 'OTOA', 'PAGR1',
    'PRY2', 'FAM178A', 'SPATS2L', 'VMAC', 'ZNFX1', 'TFPT', 'TRANK1', 'RRP15',
    'PAXBP1', 'RB1CC1', 'PACRGL', 'TRMT1L', 'PPPDE2', 'YY1AP1', 'RGP1',
    'SHKBP1', 'RINT1', 'PRAM1', 'PIR', 'TMBIM6', 'PICK1', 'PLEC', 'NUDCD3',
    'CCBL1', 'S100PBP', 'RTL1', 'C10orf140', 'CD177', 'SLF2', 'STARD3NL',
    'RELL2', 'AMIGO1', 'TRAF3IP1', 'PNOC', 'PERM1', 'UBE2F', 'TBKBP1', 'PAN3',
    'NSFL1C', 'SPAST', 'TOX4', 'RGPD8', 'ZDHHC9', 'SLAMF9', 'TNNT1', 'TEKT5',
    'TPI1', 'TAAR6', 'SKIDA1', 'PMS1',
  ]
  # Add Hgene accessions with commas
  wrong_uniprot_ids += giant_with_hts[
      ~(giant_with_hts['HGUniProtAcc'].isna()) &
      (giant_with_hts['HGUniProtAcc'].str.contains(","))
      ]['HGUniProtAcc'].tolist()
  # Add Tgene accessions with commas
  wrong_uniprot_ids += giant_with_hts[
      ~(giant_with_hts['TGUniProtAcc'].isna()) &
      (giant_with_hts['TGUniProtAcc'].str.contains(","))
      ]['TGUniProtAcc'].tolist()

  # Get a list of the Hgenes and Tgenes that need to be ID mapped, AGAIN
  hts_tomap_part2 = giant_with_hts[giant_with_hts['HGUniProtAcc'].isin(wrong_uniprot_ids)]['Hgene'].tolist()
  hts_tomap_part2 += giant_with_hts[giant_with_hts['TGUniProtAcc'].isin(wrong_uniprot_ids)]['Tgene'].tolist()
  hts_tomap_part2 = set(hts_tomap_part2)
  log_update(f"Total head and tail genes that need to be mapped again: {len(hts_tomap_part2)}")

  # Write parts that need remapping to a file for submission to UniProt
  with open('processed_data/fusionpdb/intermediates/unmapped_parts.txt','w') as f:
    for part in unmapped_parts:
      f.write(f'{part}\n')
    for part in hts_tomap_part2:
      f.write(f'{part}\n')

  # set the accession to nan if it's in wrong_uniprot_ids
  giant_with_hts.loc[
      giant_with_hts['HGUniProtAcc'].isin(wrong_uniprot_ids),
      'HGUniProtAcc'
  ] = np.nan
  giant_with_hts.loc[
      giant_with_hts['TGUniProtAcc'].isin(wrong_uniprot_ids),
      'TGUniProtAcc'
  ] = np.nan

  # We did the ID Map ahead of time on UniProt. Use this file.
  idmap = pd.read_csv('raw_data/fusionpdb/hgene_tgene_uniprot_idmap_07_10_2024.txt',sep='\t')
  # are there multiple GeneIDs for anything? Count the ';'-separated entries.
  idmap['n_GeneID'] = idmap['GeneID'].apply(lambda x: [y for y in str(x).strip().split(';') if len(y)>0])
  idmap['n_GeneID'] = idmap['n_GeneID'].apply(lambda x: len(x))
  log_update(f"Genes may have the following total #s of gene IDs: {idmap['n_GeneID'].unique()}")

  # do they all end in ;? (NaN counts as True, we don't care about those)
  log_update(f"All GeneIDs end in ; {idmap['GeneID'].apply(lambda x: x[-1] == ';' if type(x)==str else True).all()}")

  # Merge new IDMap data from UniProt re-mapping with previous data
  # Merge twice: one time, we merge as if the mapped genes are Hgenes; the other time, as if the mapped genes are Tgenes
  idmap_merge = pd.merge(giant_with_hts, idmap[['From','Entry','GeneID']].rename(columns={'From':'Hgene', 'Entry': 'Entry_Hgene', 'GeneID': 'GeneID_Hgene'}), on='Hgene',how='left')
  idmap_merge = pd.merge(idmap_merge, idmap[['From','Entry','GeneID']].rename(columns={'From':'Tgene', 'Entry': 'Entry_Tgene', 'GeneID': 'GeneID_Tgene'}), on='Tgene',how='left')
  # From the original data, we have HGIDs and from the UniProt result, we have HGIDs with ; on the end. So make a 'HGID;' column to see if these match
  idmap_merge['HGID;'] = idmap_merge['HGID'].astype(str) + ';'
  idmap_merge['TGID;'] = idmap_merge['TGID'].astype(str) + ';'

  # "Found" is true if the HGID; from the FusionPDB mapping is one of the GeneIDs returned by UniProt
  idmap_merge['HGID_Found'] = idmap_merge.apply(lambda row: row['HGID;'] in str(row['GeneID_Hgene']), axis=1)
  idmap_merge['TGID_Found'] = idmap_merge.apply(lambda row: row['TGID;'] in str(row['GeneID_Tgene']), axis=1)

  # what do we keep from idmap merge?
  # we keep rows where: there's an HGID and a TGID, OR
  # if one of them is nan, its given gene id is in the list returned by uniprot
  idmap_merge_success = idmap_merge.loc[
      # Both were there to begin with
      ((idmap_merge['HGUniProtAcc'].notna()) & (idmap_merge['TGUniProtAcc'].notna())) |
      # Hgene was missing, correct HGID was found
      ((idmap_merge['HGUniProtAcc'].isna()) & (idmap_merge['HGID_Found']==True)) |
      # Tgene was missing, correct TGID was found
      ((idmap_merge['TGUniProtAcc'].isna()) & (idmap_merge['TGID_Found']==True))
  ].reset_index(drop=True)
  idmap_merge_success['FusionGID'] = idmap_merge_success['FusionGID'].astype(str)
  log_update(f"rows: {len(idmap_merge_success)}")
  log_update(f"unique successful fusion GIDs: {len(idmap_merge_success['FusionGID'].unique())}")

  # There are duplicate rows with different GIDs for Hgenes and Tgenes. Here's the scheme for which row to keep.
  # Priority 1: HGUniProtAcc.notna() and TGUniProtAcc.notna()
  # Priority 2: HGID_Found and TGID_Found, and FusionGID is NOT in partition 1
  # Priority 3: HGID_Found or TGID_Found, and FusionGID is NOT in partition 1 or 2.
  #     ^ If we get here, only one (Hgene OR Tgene) was found, since all combinations were tested.
  #       For whichever one worked, combine the appropriate information and get rid of the rest

  partition1 = idmap_merge_success.loc[
      # Both were there to begin with
      ((idmap_merge_success['HGUniProtAcc'].notna()) & (idmap_merge_success['TGUniProtAcc'].notna()))
  ].reset_index(drop=True)
  partition1_gids = set(partition1['FusionGID'].tolist())
  log_update("Partition 1: HGUniProtAcc.notna() and TGUniProtAcc.notna() --> both UniProt accessions were found on FusionPDB")
  log_update(f"\t# GIDs: {len(partition1_gids)}")

  partition2 = idmap_merge_success.loc[
      # Both gene IDs were confirmed by the UniProt mapping, and the fusion is not already in partition 1
      (idmap_merge_success['HGID_Found']==True) & (idmap_merge_success['TGID_Found']==True) &
      ~(idmap_merge_success['FusionGID'].isin(partition1_gids))
  ].reset_index(drop=True)
  partition2_gids = set(partition2['FusionGID'].tolist())
  log_update("Partition 2: HGID_Found & TGID_Found --> both UniProt accessions were mapped successfully; one or both was found by remapping on UniProt")
  log_update(f"\t# GIDs: {len(partition2_gids)}")

  partition3 = idmap_merge_success.loc[
      # it didn't satisfy one of the criteria for the first two partitions
      ~(
          ((idmap_merge_success['HGUniProtAcc'].notna()) & (idmap_merge_success['TGUniProtAcc'].notna())) |
          ((idmap_merge_success['HGID_Found']==True) & (idmap_merge_success['TGID_Found']==True))
      ) &
      # one of the mapping was a success
      ((idmap_merge_success['HGID_Found']==True) | (idmap_merge_success['TGID_Found']==True)) &
      # the FusionGID is not in partition 1 or 2
      ~(idmap_merge_success['FusionGID'].isin(partition1_gids)) &
      ~(idmap_merge_success['FusionGID'].isin(partition2_gids))
  ].reset_index(drop=True)
  partition3_gids = set(partition3['FusionGID'].tolist())
  log_update("Partition 3: HGID_Found or TGID_Found --> only one was successful, this was the best we can do")
  log_update(f"\t# GIDs: {len(partition3_gids)}")

  # check for dups
  partition1_dups = partition1[partition1.duplicated('FusionGID')]['FusionGID'].unique().tolist()
  partition2_dups = partition2[partition2.duplicated('FusionGID')]['FusionGID'].unique().tolist()
  partition3_dups = partition3[partition3.duplicated('FusionGID')]['FusionGID'].unique().tolist()

  log_update(f"\nDuplicate IDs in partition 1: {len(partition1_dups)}")
  log_update(f"Duplicate IDs in partition 2: {len(partition2_dups)}")
  log_update(f"Duplicate IDs in partition 3: {len(partition3_dups)} \tDuplicate rows: {len(partition3[partition3['FusionGID'].isin(partition3_dups)])}")

  log_update(f"\nRows in original dataset: {len(idmap_merge_success)}")
  log_update(f"Rows in partitions: {len(partition1)+len(partition2)+len(partition3)}")

  # Check that all GIDs are represented
  all_starting_fusiongids = set(idmap_merge_success['FusionGID'].tolist())
  all_fusiongids = set(partition1['FusionGID'].tolist()) | set(partition2['FusionGID'].tolist()) | set(partition3['FusionGID'].tolist())
  # Each label now reports its own set (the two counts used to be swapped).
  log_update(f"\nFusion GIDs captured in original dataset: {len(all_starting_fusiongids)}")
  log_update(f"Fusion GIDs captured in the 3 partitions: {len(all_fusiongids)} {len(partition1_gids)+len(partition2_gids)+len(partition3_gids)}")
  log_update(f"Same set? {all_starting_fusiongids == all_fusiongids}")

  # Deal with partition 3's duplicates  - group by FusionGID and join all the UniProt IDs returned for HGene and TGene
  partition3['Entry_Hgene'] = partition3['Entry_Hgene'].astype(str)
  partition3['Entry_Tgene'] = partition3['Entry_Tgene'].astype(str)

  partition3 = partition3.groupby('FusionGID').agg({
      'FusionGID': 'first',
      'FusionGene': 'first',
      'Hgene': 'first',
      'Tgene': 'first',
      'URL': 'first',
      'HGID': 'first',
      'TGID': 'first',
      'HGUniProtAcc': 'first',
      'TGUniProtAcc': 'first',
      # sorted() keeps the joined accession string stable between runs
      'Entry_Hgene': lambda x: ','.join(sorted(set(x))),
      'GeneID_Hgene': 'first',
      'Entry_Tgene': lambda x: ','.join(sorted(set(x))),
      'GeneID_Tgene': 'first',
      'HGID;': 'first',
      'TGID;': 'first',
      'HGID_Found': 'first',# there should only be one
      'TGID_Found': 'first'# there should only be one
  }
  ).reset_index(drop=True)

  # Finally, recombine
  recombined = pd.concat(
      [
          partition1,
          partition2,
          partition3
      ]
  ).reset_index(drop=True)
  # there should be no duplicate GIDs in this
  log_update(f"Duplicate GID rows: {len(recombined[recombined.duplicated('FusionGID')])}")
  # The helper functions below test with `is None`, so normalize NaN and the
  # 'nan' strings produced by astype(str) to None.
  recombined = recombined.replace({np.nan: None, 'nan': None})

  # Add the UniProt source so it's clear where we got these IDs from
  recombined['HGUniProtAcc_Source'] = recombined.apply(lambda row: find_h_source(row), axis=1)
  recombined['TGUniProtAcc_Source'] = recombined.apply(lambda row: find_t_source(row), axis=1)
  recombined['HGUniProtAcc'] = recombined.apply(lambda row: correct_huniprot(row), axis=1)
  recombined['TGUniProtAcc'] = recombined.apply(lambda row: correct_tuniprot(row), axis=1)

  # Check: every row that has "UniProt ID Map" for HGUniProtAcc_Source should have an Entry_Hgene
  log_update(f"Every row with UniProt ID Map as HGUniProtAcc_Source has an Entry_Hgene: {recombined.loc[recombined['HGUniProtAcc_Source']=='UniProt ID Map']['Entry_Hgene'].apply(lambda x: x is not None).all()}")
  log_update(f"Every row with UniProt ID Map as TGUniProtAcc_Source has an Entry_Tgene: {recombined.loc[recombined['TGUniProtAcc_Source']=='UniProt ID Map']['Entry_Tgene'].apply(lambda x: x is not None).all()}")
  # Select source-less rows with isna(): an elementwise `== None` comparison is
  # False for every row, which made these two checks pass vacuously.
  log_update(f"Every row with no HGUniProtAcc_Source has no Entry_Hgene: {recombined.loc[recombined['HGUniProtAcc_Source'].isna()]['Entry_Hgene'].apply(lambda x: x is None).all()}")
  log_update(f"Every row with no TGUniProtAcc_Source has no Entry_Tgene: {recombined.loc[recombined['TGUniProtAcc_Source'].isna()]['Entry_Tgene'].apply(lambda x: x is None).all()}")

  # keep only the identifying columns plus the final accessions and their sources
  recombined = recombined[['FusionGID', 'FusionGene', 'Hgene', 'Tgene', 'URL', 'HGID', 'TGID', 'HGUniProtAcc', 'TGUniProtAcc', 'HGUniProtAcc_Source', 'TGUniProtAcc_Source']]
  recombined = recombined.replace({None: np.nan})
  # how many rows have uniprot IDs for head, tail, both, neither?
  hgid_only = len(recombined[recombined['HGUniProtAcc'].notna() & recombined['TGUniProtAcc'].isna()])
  tgid_only = len(recombined[recombined['HGUniProtAcc'].isna() & recombined['TGUniProtAcc'].notna()])
  hgid_and_tgid = len(recombined[recombined['HGUniProtAcc'].notna() & recombined['TGUniProtAcc'].notna()])
  neither = len(recombined[recombined['HGUniProtAcc'].isna() & recombined['TGUniProtAcc'].isna()])

  log_update(f"Fusions with HGID only: {hgid_only}")
  log_update(f"Fusions with TGID only: {tgid_only}")
  log_update(f"Fusions with HGID and TGID: {hgid_and_tgid}")
  log_update(f"Fusions with neither: {neither}")
  log_update(f"Sum = {hgid_only+tgid_only+hgid_and_tgid+neither} = {len(recombined)}")

  return recombined

def _load_or_scrape_level(level):
  """Return the FusionPDB table for ``level``, scraping only if no cached CSV exists."""
  matching_file = glob.glob(f'raw_data/fusionpdb/FusionPDB_level{level}_curated_*.csv')
  if len(matching_file) > 0:
    log_update(f"\nLevel {level} was already scraped for IDs. Loading data from: {matching_file[0]}")
    return pd.read_csv(matching_file[0])

  log_update(f"\nScraping Level {level} IDs from FusionPDB")
  dt_tag = get_local_date_yr()
  level_df = get_levels_dataframe(level, print_progress=True)
  level_df['FusionGID'] = level_df['FusionGID'].astype(str)
  level_df.to_csv(f'raw_data/fusionpdb/FusionPDB_level{level}_curated_{dt_tag}.csv',index=False)
  return level_df


def _load_structure_links(level, level_df):
  """Read the structure-link CSV for ``level`` and add URL/HGID/TGID from ``level_df``."""
  links_path = f'raw_data/fusionpdb/FusionPDB_level{level}_fusion_structure_links.csv'
  links = pd.read_csv(links_path).rename(columns={'ID':'FusionGID'})
  # A freshly scraped level table holds FusionGID as str while the CSV read
  # above usually parses it as an integer; pandas refuses to merge such keys,
  # so align the level table's key dtype with the links table first.
  level_info = level_df[['FusionGID','FusionGene','URL','HGID','TGID']].astype(
      {'FusionGID': links['FusionGID'].dtype}
  )
  links = pd.merge(links, level_info, on=['FusionGID','FusionGene'], how='left')
  log_update(f"\tSize of {links_path}: {len(links)}")
  links['Level'] = [level]*len(links)
  return links


def scrape_fusionpdb_level_2_3():
  """Run the whole FusionPDB level 2 and level 3 acquisition pipeline.

  Steps: load or scrape both level tables, fetch structure links and head/tail
  gene info per ID, combine everything into intermediate CSVs under
  ``processed_data/fusionpdb``, and download the structure files.
  """
  os.makedirs("raw_data/fusionpdb",exist_ok=True)
  os.makedirs("processed_data/fusionpdb",exist_ok=True)
  os.makedirs("processed_data/fusionpdb/intermediates",exist_ok=True)

  # Load (or scrape and save) the level 2 and level 3 tables
  level2_df = _load_or_scrape_level(2)
  level3_df = _load_or_scrape_level(3)

  # Check for ID overlap
  level2_ids = set(level2_df['FusionGID'].tolist())
  level3_ids = set(level3_df['FusionGID'].tolist())
  log_update(f"Total overlapping fusionGIDs between levels 2 and 3: {len(level2_ids.intersection(level3_ids))}")

  links_save_dir = 'raw_data/fusionpdb'
  os.makedirs(links_save_dir,exist_ok=True)

  ##### LEVEL 2: Get links to FusionPDB's provided structure files for all level 2 proteins in FusionPDB.
  process_ids(level2_ids,outdir=links_save_dir,level=2)    # only processes ids that haven't been processed yet
  # Get head and tail gene info for level 2 proteins
  process_ids_ht(level2_ids,outdir=links_save_dir,level=2)

  ##### LEVEL 3: Get links to FusionPDB's provided structure files for all level 3 proteins in FusionPDB.
  process_ids(level3_ids,outdir=links_save_dir,level=3)
  # Get head and tail gene info for level 3 proteins
  process_ids_ht(level3_ids,outdir=links_save_dir,level=3)

  # Combine head and tail data
  ht_df = combine_ht_info()
  ht_df.to_csv("processed_data/fusionpdb/fusion_heads_and_tails.csv",index=False)

  # Combine level 2 and level 3 data in two ways: (1) giant structure links file, (2) giant file with head and tail info
  log_update("\nCombining level 2 and 3 data")
  giant_level2 = _load_structure_links(2, level2_df)
  giant_level3 = _load_structure_links(3, level3_df)

  ##### Create some intermediate "giant" files, combining bulk info for levels 2 and 3
  # These files are intermediate for two reasons:
  #     - giant_level2-3_fusion_protein_structure_links: doesn't have head and tail structural info
  #     - giant_level2-3_fusion_protein_head_tail_info: only has fusion proteins that have at least one mappable head/tail

  giant_sl = pd.concat([giant_level2,giant_level3]).drop_duplicates().reset_index(drop=True)
  giant_sl.sort_values(by='FusionGID',ascending=True).reset_index(drop=True).to_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv',index=False)
  log_update(f"\nSaving file with all Level 2 and 3 Structure links (size: {len(giant_sl)}) to: processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv")

  # Create and save CSV of combined head and tail info
  giant_ht = pd.concat([level2_df,level3_df]).reset_index(drop=True)
  # ensure the type is string
  giant_ht['FusionGID'] = giant_ht['FusionGID'].astype(str)
  giant_with_ht = combine_ht_info_with_structure_links(giant_ht, ht_df)
  giant_with_ht.sort_values(by='FusionGID',ascending=True).reset_index(drop=True).to_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv',index=False)

  # Download structure links
  download_structures(giant_sl['Structure Link'].tolist())
  
def main():
  """Run the FusionPDB level 2/3 scrape inside the ``open_logfile`` context."""
  log_path = "fetch_fusionpdb_data_log.txt"
  with open_logfile(log_path):
    scrape_fusionpdb_level_2_3()


# Only run the pipeline when executed as a script, not on import.
if __name__ == "__main__":
    main()