import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import glob
import ast
import os
from pandas.errors import EmptyDataError

from fuson_plm.utils.logging import open_logfile, log_update, get_local_date_yr

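# Scrapes FusionPDB (https://compbio.uth.edu/FusionPDB) for Level 2 and Level 3 fusion
# oncoproteins: the level-wise gene lists, per-fusion structure links, and head/tail gene
# UniProt accessions. Results are written under raw_data/fusionpdb and
# processed_data/fusionpdb, and the linked structure files are downloaded locally.
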
def get_levels_dataframe(level, print_progress=False):
    data, headers = scrape_level(level)

    if print_progress:
        # FusionPDB reports 2212 Level 2 and 266 Level 3 fusion genes
        if level == 2:
            log_update(f'\nTable size {len(data)}; expected 2212')
        if level == 3:
            log_update(f'\nTable size {len(data)}; expected 266')
        log_update('Example rows 1-5:')
        for i, row in enumerate(data):
            log_update(row)
            if i >= 4:
                break

    df = pd.DataFrame(data, columns=headers)
    # The FusionGID cell is scraped as a (gid, url) tuple; split it into two columns
    df['URL'] = df['FusionGID'].apply(lambda x: x[1])
    df['FusionGID'] = df['FusionGID'].apply(lambda x: x[0])
    return df


def scrape_level(level):
    level = str(level)

    # Level-specific results page listing all fusion genes at that curation level
    url = f"https://compbio.uth.edu/FusionPDB/gene_search_result_0.cgi?type=chooseLevel&chooseLevel=level{level}"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # The results table sits directly after a level-specific <h1> header
    if level == '2':
        specific_h1 = soup.find('h1', text='2212 Fusion gene(s) for your query: level2')
    elif level == '3':
        specific_h1 = soup.find('h1', text='266 Fusion gene(s) for your query: level3')
    else:
        raise ValueError(f"Unsupported FusionPDB level: {level}")

    table = specific_h1.find_next('table', class_='geneList')

    # The first six summary cells are the column headers
    headers = [header.get_text().strip() for header in table.find_all('td', class_='content_middle_gene_summary')][0:6]

    # Remaining rows hold one fusion gene each
    rows = table.find_all('tr')[1:]

    data = []
    for row in rows:
        columns = row.find_all('td', class_='content_middle_gene_summary')
        if not columns:
            continue

        row_data = []
        for column in columns:
            link = column.find('a')
            if link:
                # Keep both the FusionGID text and its absolute URL
                href = link['href']
                fusion_gid = link.get_text(strip=True)
                full_url = f"https://compbio.uth.edu/FusionPDB/{href}"
                row_data.append((fusion_gid, full_url))
            else:
                row_data.append(column.get_text(strip=True))
        data.append(row_data)

    return data, headers


def get_structure_link_dataframe(id, print_progress=False):
    rows = get_structure_links(id)

    if print_progress:
        log_update(f'\nTable size {len(rows)}')
        log_update('Example rows 1-5:')
        for i, row in enumerate(rows):
            log_update(row)
            if i >= 4:
                break

    if len(rows) > 0:
        df = pd.DataFrame(rows)
        df = df.rename(columns={
            'Fusion protein PDB link (fusion AA seq ID in FusionPDB)': 'Structure Link'
        })

        # Each row may hold several structure links; give each its own row
        df = df.explode('Structure Link').reset_index(drop=True)

        df['Structure Link'] = df['Structure Link'].apply(lambda x: 'https://compbio.uth.edu/FusionPDB/' + str(x))
        df['Structure Type'] = df['Structure Link'].apply(lambda x: 'PDB' if 'pdb_files' in x else ('CIF' if 'cif_files' in x else 'Unknown'))
        df['FO_Name'] = df['Hgene'] + '::' + df['Tgene']

        df = df.rename(columns={'FO_Name': 'FusionGene'})
        df['ID'] = [id]*len(df)

        # Make sure every expected column exists so appended CSV rows stay aligned
        expected_cols = ['ID','Structure Link','Hgene','Hchr','Hbp','Hstrand','Tgene','Tchr','Tbp','Tstrand','Len(AA seq)','Structure Type','FusionGene','AA seq']
        for col in expected_cols:
            if col not in df.columns:
                df[col] = ['']*len(df)
        df = df[expected_cols]

    else:
        df = pd.DataFrame()

    return df


def get_structure_links(id, print_progress=False):
    # Quick-search page for a single FusionPDB ID
    url = f"https://compbio.uth.edu/FusionPDB/gene_search_result.cgi?page=page&type=quick_search&quick_search={id}"
    response = requests.get(url)
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')

    # The structure table is anchored by <a name="FusionSTR"> with an h2 title
    table_title = soup.find('a', {'name': 'FusionSTR'})
    rows = []

    if table_title and table_title.find('h2') and table_title.find('h2').text.strip() == 'Fusion Protein Structures':
        table = table_title.find_next('table', class_='geneList')
        table = table.find_next('table')

        if table:
            if print_progress: log_update('table found')

            header_row = table.find('tr')
            headers = [header.get_text(strip=True) for header in header_row.find_all('strong')]

            for row in table.find_all('tr')[1:]:
                cells = row.find_all('td')
                row_data = {}
                skip_next = False
                for i, cell in enumerate(cells):
                    if skip_next:
                        skip_next = False
                        continue

                    cell_text = cell.get_text(strip=True)
                    # The mol* viewer cell and the cell after it are page layout, not data
                    if "3D view using mol*" in cell_text:
                        skip_next = True
                        continue

                    links = cell.find_all('a')
                    if links:
                        row_data[headers[i]] = [link.get('href') for link in links]
                    else:
                        if len(cell_text) > 0:
                            row_data[headers[i]] = cell_text
                if len(row_data) > 0: rows.append(row_data)
        else:
            log_update('table not found')

    return rows


def process_td_elements(soup_object, add_links=False):
    # Collect the text (and optionally the link) from each summary cell
    td_elements = soup_object.find_all('td', class_='content_left_gene_summary')

    data = []
    for td in td_elements:
        strong_tag = td.find('strong')
        if strong_tag:
            text_content = strong_tag.get_text(strip=True)
        else:
            text_content = td.get_text(strip=True)

        if add_links:
            link_tag = td.find('a')
            if link_tag:
                link = link_tag.get('href')
                text_content += f" ({link})"

        data.append(text_content)

    return data


def get_hgene_tgene_info(id, print_progress=False):
    # Quick-search page for a single FusionPDB ID
    url = f"https://compbio.uth.edu/FusionPDB/gene_search_result.cgi?page=page&type=quick_search&quick_search={id}"
    response = requests.get(url)
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')

    data = {
        "Fusion gene name": [],
        "FusionPDB ID": [],
        "FusionGDB2.0 ID": [],
        "Gene symbol": [],
        "Gene ID": [],
        "Gene name": [],
        "Synonyms": [],
        "Cytomap": [],
        "Type of gene": [],
        "Description": [],
        "Modification date": [],
        "UniProtAcc": []
    }

    # The summary table follows the 'Fusion Protein Summary' title table
    title_table = soup.find('table', class_='title')
    if title_table and title_table.find('h2') and title_table.find('h2').get_text(strip=True) == 'Fusion Protein Summary':
        gene_list_table = title_table.find_next_sibling('table', class_='geneList').find_next_sibling('table', class_='geneList')

        td_data = process_td_elements(gene_list_table)

        # Everything before 'Gene symbol' describes the fusion; the rest describes the head and tail genes
        split_ind = td_data.index('Gene symbol')
        fusion_info, ht_info = [td_data[0:split_ind], td_data[split_ind::]]

        # Fusion-level fields are 'key: value' strings
        for info in fusion_info:
            if ':' in info:
                key, value = info.split(':')[0:2]
                key = key.strip()
                if key in data:
                    data[key] = value.strip()

        # Head/tail fields come in triples: field name, head-gene value, tail-gene value
        for i in range(0, len(ht_info), 3):
            key, value1, value2 = ht_info[i:i+3]
            key = key.strip()
            if key in data:
                data[key] = [value1.strip(), value2.strip()]
            if key == 'UniProtAcc':
                break

    return data


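# process_ids is resumable: FusionGIDs already present in the output CSV, or known to have no
# structures (fusionpdb_structureless_ids.txt), are skipped, and new rows are appended as they
# are fetched. process_ids_ht writes one dict per line and is skipped entirely once its output
# file exists.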
def process_ids(ids, outdir='', level=2):
    csv_filename = f'{outdir}/FusionPDB_level{level}_fusion_structure_links.csv'
    already_processed_ids = []
    if os.path.isfile(csv_filename):
        already_processed_ids = pd.read_csv(csv_filename)
        already_processed_ids = already_processed_ids['ID'].tolist()

    # IDs known to have no structures on FusionPDB; skip them to avoid pointless requests
    structureless_ids = pd.read_csv("raw_data/fusionpdb/fusionpdb_structureless_ids.txt", sep="\t", header=None)[0].tolist()

    log_update(f'\nLevel {level}:\n\tDownloading structure links for FusionPDB IDs:')
    for i, id in enumerate(ids):
        if (id in already_processed_ids) or (id in structureless_ids):
            continue
        df = get_structure_link_dataframe(id)
        # Append to the CSV if it exists; otherwise create it with a header row
        if os.path.isfile(csv_filename):
            df.to_csv(csv_filename, mode='a', index=False, header=False)
        else:
            df.to_csv(csv_filename, mode='w', index=False)

        log_update(f'\t\t{i+1}. {id}')


def process_ids_ht(ids, outdir='', level=2):
    outfile = f'{outdir}/level{level}_head_tail_info.txt'
    if not os.path.isfile(outfile):
        log_update(f"\n\tAcquiring UniProt accessions of head and tail genes for Level {level}")
        with open(outfile, 'a+') as f1:
            for id in ids:
                data = get_hgene_tgene_info(id)
                data = {
                    'FusionGID': data['FusionPDB ID'],
                    'HGID': data['Gene ID'][0],
                    'TGID': data['Gene ID'][1],
                    'HGUniProtAcc': data['UniProtAcc'][0],
                    'TGUniProtAcc': data['UniProtAcc'][1]
                }
                # Write one Python-literal dict per line; combine_ht_info() parses these back
                f1.write(str(data))
                f1.write('\n')
                f1.flush()
    else:
        log_update(f"\nAlready acquired UniProt accessions of head and tail genes for Level {level} at: {outfile}")


def download_file(url, directory):
    # Skip the download if the file already exists locally
    local_filename = os.path.join(directory, url.split('/')[-1])
    if os.path.exists(local_filename):
        return local_filename
    response = requests.get(url)
    response.raise_for_status()
    with open(local_filename, 'wb') as file:
        file.write(response.content)
    return local_filename


def download_structures(download_links):
    download_directory = "raw_data/fusionpdb/structures"
    os.makedirs(download_directory, exist_ok=True)

    for link in download_links:
        try:
            log_update(f"Downloading {link}...")
            download_file(link, download_directory)
            log_update(f"\tDownloaded {link} to {download_directory}")
        except Exception as e:
            log_update(f"\tFailed to download {link}. Reason: {e}")

    log_update("All downloads completed.")


def combine_ht_info():
    # Read the per-ID head/tail records written by process_ids_ht for levels 2 and 3
    outdir = 'raw_data/fusionpdb'
    head_tail_data = []
    with open(f'{outdir}/level2_head_tail_info.txt', 'r') as f:
        for line in f:
            record = ast.literal_eval(line.strip())
            head_tail_data.append(record)

    with open(f'{outdir}/level3_head_tail_info.txt', 'r') as f:
        for line in f:
            record = ast.literal_eval(line.strip())
            head_tail_data.append(record)

    ht_df = pd.DataFrame(head_tail_data)
    ht_df['FusionGID'] = ht_df['FusionGID'].astype(str)
    return ht_df


def find_h_source(row):
    # Prefer the accession scraped from FusionPDB; fall back to the UniProt ID map
    if row['HGUniProtAcc'] is not None:
        return 'FusionPDB'
    elif row['Entry_Hgene'] is not None:
        return 'UniProt ID Map'
    else:
        return None


def find_t_source(row):
    if row['TGUniProtAcc'] is not None:
        return 'FusionPDB'
    elif row['Entry_Tgene'] is not None:
        return 'UniProt ID Map'
    else:
        return None


def correct_huniprot(row):
    if row['HGUniProtAcc'] is not None:
        return row['HGUniProtAcc']
    elif row['Entry_Hgene'] is not None:
        return row['Entry_Hgene']
    else:
        return None


def correct_tuniprot(row):
    if row['TGUniProtAcc'] is not None:
        return row['TGUniProtAcc']
    elif row['Entry_Tgene'] is not None:
        return row['Entry_Tgene']
    else:
        return None


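# combine_ht_info_with_structure_links merges the structure-link table with the head/tail
# UniProt info, discards accessions known (or formatted) to be wrong, remaps those genes
# through the UniProt ID mapping file, and then splits the successfully mapped fusions into
# three partitions: (1) both accessions came from FusionPDB, (2) both were resolved after
# remapping on UniProt, (3) only one accession could be resolved.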
def combine_ht_info_with_structure_links(giant, ht_df):
    giant_with_hts = pd.merge(giant, ht_df, on='FusionGID', how='left')

    giant_with_hts['HGID_x'] = giant_with_hts['HGID_x'].astype(str)
    giant_with_hts['HGID_y'] = giant_with_hts['HGID_y'].astype(str)
    giant_with_hts['TGID_x'] = giant_with_hts['TGID_x'].astype(str)
    giant_with_hts['TGID_y'] = giant_with_hts['TGID_y'].astype(str)

    # The gene IDs from the level tables and the head/tail scrape should agree exactly
    giant_with_hts['HGID_match'] = giant_with_hts['HGID_x'] == giant_with_hts['HGID_y']
    giant_with_hts['TGID_match'] = giant_with_hts['TGID_x'] == giant_with_hts['TGID_y']
    assert giant_with_hts['HGID_match'].all() and giant_with_hts['TGID_match'].all()

    giant_with_hts = giant_with_hts.drop(['HGID_x','TGID_x','HGID_match','TGID_match'], axis=1).rename(columns={'HGID_y':'HGID','TGID_y':'TGID'})
    giant_with_hts = giant_with_hts.replace('.', np.nan)

    # Tally how many fusions have UniProt accessions for the head gene, tail gene, both, or neither
    hgid_only = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].notna() & giant_with_hts['TGUniProtAcc'].isna()])
    tgid_only = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna() & giant_with_hts['TGUniProtAcc'].notna()])
    hgid_and_tgid = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].notna() & giant_with_hts['TGUniProtAcc'].notna()])
    neither = len(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna() & giant_with_hts['TGUniProtAcc'].isna()])

    log_update(f"\nFusions with HGID only: {hgid_only}")
    log_update(f"Fusions with TGID only: {tgid_only}")
    log_update(f"Fusions with HGID and TGID: {hgid_and_tgid}")
    log_update(f"Fusions with neither: {neither}")
    log_update(f"Sum = {hgid_only+tgid_only+hgid_and_tgid+neither} = {len(giant_with_hts)}")

    # Collect head/tail gene symbols with no UniProt accession; these need remapping
    unmapped_h = set(giant_with_hts[giant_with_hts['HGUniProtAcc'].isna()]['Hgene'].tolist())
    unmapped_t = set(giant_with_hts[giant_with_hts['TGUniProtAcc'].isna()]['Tgene'].tolist())

    unmapped_parts = unmapped_h.union(unmapped_t)
    log_update(f"unmapped hgenes: {len(unmapped_h)}")
    log_update(f"unmapped tgenes: {len(unmapped_t)}")
    log_update(f"unmapped parts (hgids or tgids): {len(unmapped_parts)}")

    # Values that appear in the HGUniProtAcc/TGUniProtAcc columns but are not valid UniProt
    # accessions (they look like gene symbols); matching entries are cleared below and the
    # corresponding genes are re-mapped through UniProt
    wrong_uniprot_ids = [
        'PRY', 'TIAF1', 'DCAF8L2', 'UMAD1', 'TIPIN', 'GAB3', 'OTOA', 'PAGR1',
        'PRY2', 'FAM178A', 'SPATS2L', 'VMAC', 'ZNFX1', 'TFPT', 'TRANK1', 'RRP15',
        'PAXBP1', 'RB1CC1', 'PACRGL', 'TRMT1L', 'PPPDE2', 'YY1AP1', 'RGP1', 'SHKBP1',
        'RINT1', 'PRAM1', 'PIR', 'TMBIM6', 'PICK1', 'PLEC', 'NUDCD3', 'CCBL1',
        'S100PBP', 'RTL1', 'C10orf140', 'CD177', 'SLF2', 'STARD3NL', 'RELL2', 'AMIGO1',
        'TRAF3IP1', 'PNOC', 'PERM1', 'UBE2F', 'TBKBP1', 'PAN3', 'NSFL1C', 'SPAST',
        'TOX4', 'RGPD8', 'ZDHHC9', 'SLAMF9', 'TNNT1', 'TEKT5', 'TPI1', 'TAAR6',
        'SKIDA1', 'PMS1'
    ]

    # Accessions containing commas (multiple IDs in one cell) are also treated as unusable
    wrong_uniprot_ids += giant_with_hts[
        ~(giant_with_hts['HGUniProtAcc'].isna()) &
        (giant_with_hts['HGUniProtAcc'].str.contains(","))
    ]['HGUniProtAcc'].tolist()

    wrong_uniprot_ids += giant_with_hts[
        ~(giant_with_hts['TGUniProtAcc'].isna()) &
        (giant_with_hts['TGUniProtAcc'].str.contains(","))
    ]['TGUniProtAcc'].tolist()

    # Gene symbols whose accessions were flagged above; these also need to be mapped again
    hts_tomap_part2 = giant_with_hts[giant_with_hts['HGUniProtAcc'].isin(wrong_uniprot_ids)]['Hgene'].tolist()
    hts_tomap_part2 += giant_with_hts[giant_with_hts['TGUniProtAcc'].isin(wrong_uniprot_ids)]['Tgene'].tolist()
    hts_tomap_part2 = set(hts_tomap_part2)
    log_update(f"Total head and tail genes that need to be mapped again: {len(hts_tomap_part2)}")

    # Write every gene symbol that still needs a UniProt mapping
    with open('processed_data/fusionpdb/intermediates/unmapped_parts.txt', 'w') as f:
        for part in unmapped_parts:
            f.write(f'{part}\n')
        for part in hts_tomap_part2:
            f.write(f'{part}\n')

    # Clear the bad accessions so the remapped values can take their place
    giant_with_hts.loc[
        giant_with_hts['HGUniProtAcc'].isin(wrong_uniprot_ids),
        'HGUniProtAcc'
    ] = np.nan
    giant_with_hts.loc[
        giant_with_hts['TGUniProtAcc'].isin(wrong_uniprot_ids),
        'TGUniProtAcc'
    ] = np.nan

    # UniProt ID mapping results for the head/tail gene symbols (gene symbol -> Entry, GeneID)
    idmap = pd.read_csv('raw_data/fusionpdb/hgene_tgene_uniprot_idmap_07_10_2024.txt', sep='\t')

    idmap['n_GeneID'] = idmap['GeneID'].apply(lambda x: [y for y in str(x).strip().split(';') if len(y)>0])
    idmap['n_GeneID'] = idmap['n_GeneID'].apply(lambda x: len(x))

    log_update(f"Genes may have the following total #s of gene IDs: {idmap['n_GeneID'].unique()}")

    # Every GeneID string from UniProt should end in ';', which the substring checks below rely on
    log_update(f"All GeneIDs end in ; {idmap['GeneID'].apply(lambda x: x[-1] == ';' if type(x)==str else True).all()}")

    # Attach the remapped entries for the head gene, then the tail gene
    idmap_merge = pd.merge(giant_with_hts, idmap[['From','Entry','GeneID']].rename(columns={'From':'Hgene', 'Entry': 'Entry_Hgene', 'GeneID': 'GeneID_Hgene'}), on='Hgene', how='left')
    idmap_merge = pd.merge(idmap_merge, idmap[['From','Entry','GeneID']].rename(columns={'From':'Tgene', 'Entry': 'Entry_Tgene', 'GeneID': 'GeneID_Tgene'}), on='Tgene', how='left')

    idmap_merge['HGID;'] = idmap_merge['HGID'].astype(str) + ';'
    idmap_merge['TGID;'] = idmap_merge['TGID'].astype(str) + ';'

    # A remapping is trusted only if the FusionPDB gene ID appears in the UniProt GeneID list
    idmap_merge['HGID_Found'] = idmap_merge.apply(lambda row: row['HGID;'] in str(row['GeneID_Hgene']), axis=1)
    idmap_merge['TGID_Found'] = idmap_merge.apply(lambda row: row['TGID;'] in str(row['GeneID_Tgene']), axis=1)

    # Keep fusions where both accessions were already known, or where a missing one was remapped
    idmap_merge_success = idmap_merge.loc[
        ((idmap_merge['HGUniProtAcc'].notna()) & (idmap_merge['TGUniProtAcc'].notna())) |
        ((idmap_merge['HGUniProtAcc'].isna()) & (idmap_merge['HGID_Found']==True)) |
        ((idmap_merge['TGUniProtAcc'].isna()) & (idmap_merge['TGID_Found']==True))
    ].reset_index(drop=True)
    idmap_merge_success['FusionGID'] = idmap_merge_success['FusionGID'].astype(str)
    log_update(f"rows: {len(idmap_merge_success)}")
    log_update(f"unique successful fusion GIDs: {len(idmap_merge_success['FusionGID'].unique())}")

    # Partition the successfully mapped fusions by where their accessions came from
    partition1 = idmap_merge_success.loc[
        ((idmap_merge_success['HGUniProtAcc'].notna()) & (idmap_merge_success['TGUniProtAcc'].notna()))
    ].reset_index(drop=True)
    partition1_gids = set(partition1['FusionGID'].tolist())
    log_update("Partition 1: HGUniProtAcc.notna() and TGUniProtAcc.notna() --> both UniProt accessions were found on FusionPDB")
    log_update(f"\t# GIDs: {len(partition1_gids)}")

    partition2 = idmap_merge_success.loc[
        (idmap_merge_success['HGID_Found']==True) & (idmap_merge_success['TGID_Found']==True) &
        ~(idmap_merge_success['FusionGID'].isin(partition1_gids))
    ].reset_index(drop=True)
    partition2_gids = set(partition2['FusionGID'].tolist())
    log_update("Partition 2: HGID_Found & TGID_Found --> both UniProt accessions were mapped successfully; one or both was found by remapping on UniProt")
    log_update(f"\t# GIDs: {len(partition2_gids)}")

    partition3 = idmap_merge_success.loc[
        ~(
            ((idmap_merge_success['HGUniProtAcc'].notna()) & (idmap_merge_success['TGUniProtAcc'].notna())) |
            ((idmap_merge_success['HGID_Found']==True) & (idmap_merge_success['TGID_Found']==True))
        ) &
        ((idmap_merge_success['HGID_Found']==True) | (idmap_merge_success['TGID_Found']==True)) &
        ~(idmap_merge_success['FusionGID'].isin(partition1_gids)) &
        ~(idmap_merge_success['FusionGID'].isin(partition2_gids))
    ].reset_index(drop=True)
    partition3_gids = set(partition3['FusionGID'].tolist())
    log_update("Partition 3: HGID_Found or TGID_Found --> only one was successful; this was the best we could do")
    log_update(f"\t# GIDs: {len(partition3_gids)}")

    # Check for duplicate FusionGIDs within each partition
    partition1_dups = partition1[partition1.duplicated('FusionGID')]['FusionGID'].unique().tolist()
    partition2_dups = partition2[partition2.duplicated('FusionGID')]['FusionGID'].unique().tolist()
    partition3_dups = partition3[partition3.duplicated('FusionGID')]['FusionGID'].unique().tolist()

    log_update(f"\nDuplicate IDs in partition 1: {len(partition1_dups)}")
    log_update(f"Duplicate IDs in partition 2: {len(partition2_dups)}")
    log_update(f"Duplicate IDs in partition 3: {len(partition3_dups)} \tDuplicate rows: {len(partition3[partition3['FusionGID'].isin(partition3_dups)])}")

    log_update(f"\nRows in original dataset: {len(idmap_merge_success)}")
    log_update(f"Rows in partitions: {len(partition1)+len(partition2)+len(partition3)}")

    # Confirm that the three partitions cover exactly the same set of FusionGIDs
    all_starting_fusiongids = set(idmap_merge_success['FusionGID'].tolist())
    all_fusiongids = set(partition1['FusionGID'].tolist()) | set(partition2['FusionGID'].tolist()) | set(partition3['FusionGID'].tolist())
    log_update(f"\nFusion GIDs captured in original dataset: {len(all_starting_fusiongids)}")
    log_update(f"Fusion GIDs captured in the 3 partitions: {len(all_fusiongids)} = {len(partition1_gids)+len(partition2_gids)+len(partition3_gids)}")
    log_update(f"Same set? {all_starting_fusiongids == all_fusiongids}")

    # Partition 3 can have one row per candidate mapping; collapse to one row per FusionGID,
    # joining any alternative UniProt entries with commas
    partition3['Entry_Hgene'] = partition3['Entry_Hgene'].astype(str)
    partition3['Entry_Tgene'] = partition3['Entry_Tgene'].astype(str)

    partition3 = partition3.groupby('FusionGID').agg({
        'FusionGID': 'first',
        'FusionGene': 'first',
        'Hgene': 'first',
        'Tgene': 'first',
        'URL': 'first',
        'HGID': 'first',
        'TGID': 'first',
        'HGUniProtAcc': 'first',
        'TGUniProtAcc': 'first',
        'Entry_Hgene': lambda x: ','.join(set([y for y in x])),
        'GeneID_Hgene': 'first',
        'Entry_Tgene': lambda x: ','.join(set([y for y in x])),
        'GeneID_Tgene': 'first',
        'HGID;': 'first',
        'TGID;': 'first',
        'HGID_Found': 'first',
        'TGID_Found': 'first'
    }).reset_index(drop=True)

    # Put the three partitions back together
    recombined = pd.concat(
        [
            partition1,
            partition2,
            partition3
        ]
    ).reset_index(drop=True)

    log_update(f"Duplicate GID rows: {len(recombined[recombined.duplicated('FusionGID')])}")
    recombined = recombined.replace({np.nan: None, 'nan': None})

    # Record where each accession came from, then fill missing accessions from the ID map
    recombined['HGUniProtAcc_Source'] = recombined.apply(lambda row: find_h_source(row), axis=1)
    recombined['TGUniProtAcc_Source'] = recombined.apply(lambda row: find_t_source(row), axis=1)
    recombined['HGUniProtAcc'] = recombined.apply(lambda row: correct_huniprot(row), axis=1)
    recombined['TGUniProtAcc'] = recombined.apply(lambda row: correct_tuniprot(row), axis=1)

log_update(f"Every row with UniProt ID Map as HGUniProtAcc_Source has an Entry_Hgene: {recombined.loc[recombined['HGUniProtAcc_Source']=='UniProt ID Map']['Entry_Hgene'].apply(lambda x: x is not None).all()}") |
|
log_update(f"Every row with UniProt ID Map as TGUniProtAcc_Source has an Entry_Tgene: {recombined.loc[recombined['TGUniProtAcc_Source']=='UniProt ID Map']['Entry_Tgene'].apply(lambda x: x is not None).all()}") |
|
log_update(f"Every row with no HGUniProtAcc_Source has no Entry_Hgene: {recombined.loc[recombined['HGUniProtAcc_Source']==None]['Entry_Hgene'].apply(lambda x: x is None).all()}") |
|
log_update(f"Every row with no TGUniProtAcc_Source has no Entry_Tgene: {recombined.loc[recombined['TGUniProtAcc_Source']==None]['Entry_Tgene'].apply(lambda x: x is None).all()}") |
|
|
|
|
|
recombined = recombined[['FusionGID', 'FusionGene', 'Hgene', 'Tgene', 'URL', 'HGID', 'TGID', 'HGUniProtAcc', 'TGUniProtAcc', 'HGUniProtAcc_Source', 'TGUniProtAcc_Source']] |
|
recombined = recombined.replace({None: np.nan}) |
|
|
|
|
|
hgid_only = len(recombined[recombined['HGUniProtAcc'].notna() & recombined['TGUniProtAcc'].isna()]) |
|
tgid_only = len(recombined[recombined['HGUniProtAcc'].isna() & recombined['TGUniProtAcc'].notna()]) |
|
hgid_and_tgid = len(recombined[recombined['HGUniProtAcc'].notna() & recombined['TGUniProtAcc'].notna()]) |
|
neither = len(recombined[recombined['HGUniProtAcc'].isna() & recombined['TGUniProtAcc'].isna()]) |
|
|
|
log_update(f"Fusions with HGID only: {hgid_only}") |
|
log_update(f"Fusions with TGID only: {tgid_only}") |
|
log_update(f"Fusions with HGID and TGID: {hgid_and_tgid}") |
|
log_update(f"Fusions with neither: {neither}") |
|
log_update(f"Sum = {hgid_only+tgid_only+hgid_and_tgid+neither} = {len(recombined)}") |
|
|
|
return recombined |
|
|
|
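# Top-level driver: scrape (or reload) the Level 2 and Level 3 gene lists, fetch structure
# links and head/tail info per FusionGID, merge everything with the UniProt ID map, write the
# intermediate CSVs under processed_data/fusionpdb, and download all linked structure files.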
def scrape_fusionpdb_level_2_3():
    os.makedirs("raw_data/fusionpdb", exist_ok=True)
    os.makedirs("processed_data/fusionpdb", exist_ok=True)
    os.makedirs("processed_data/fusionpdb/intermediates", exist_ok=True)

    # Level 2: reuse a previously scraped CSV if one exists, otherwise scrape FusionPDB
    matching_file = glob.glob('raw_data/fusionpdb/FusionPDB_level2_curated_*.csv')
    if len(matching_file) > 0:
        log_update(f"\nLevel 2 was already scraped for IDs. Loading data from: {matching_file[0]}")
        level2_df = pd.read_csv(matching_file[0])
    else:
        log_update("\nScraping Level 2 IDs from FusionPDB")
        dt_tag = get_local_date_yr()
        level2_df = get_levels_dataframe(2, print_progress=True)
        level2_df['FusionGID'] = level2_df['FusionGID'].astype(str)
        level2_df.to_csv(f'raw_data/fusionpdb/FusionPDB_level2_curated_{dt_tag}.csv', index=False)

    # Level 3: same load-or-scrape logic
    matching_file = glob.glob('raw_data/fusionpdb/FusionPDB_level3_curated_*.csv')
    if len(matching_file) > 0:
        log_update(f"\nLevel 3 was already scraped for IDs. Loading data from: {matching_file[0]}")
        level3_df = pd.read_csv(matching_file[0])
    else:
        log_update("\nScraping Level 3 IDs from FusionPDB")
        dt_tag = get_local_date_yr()
        level3_df = get_levels_dataframe(3, print_progress=True)
        level3_df['FusionGID'] = level3_df['FusionGID'].astype(str)
        level3_df.to_csv(f'raw_data/fusionpdb/FusionPDB_level3_curated_{dt_tag}.csv', index=False)

    level2_ids = set(level2_df['FusionGID'].tolist())
    level3_ids = set(level3_df['FusionGID'].tolist())
    log_update(f"Total overlapping fusionGIDs between levels 2 and 3: {len(level2_ids.intersection(level3_ids))}")

    # Scrape structure links and head/tail info for every Level 2 ID, then every Level 3 ID
    links_save_dir = 'raw_data/fusionpdb'
    os.makedirs(links_save_dir, exist_ok=True)
    process_ids(level2_ids, outdir=links_save_dir, level=2)
    process_ids_ht(level2_ids, outdir=links_save_dir, level=2)

    process_ids(level3_ids, outdir=links_save_dir, level=3)
    process_ids_ht(level3_ids, outdir=links_save_dir, level=3)

    # Combine head/tail info from both levels and save it
    ht_df = combine_ht_info()
    ht_df.to_csv("processed_data/fusionpdb/fusion_heads_and_tails.csv", index=False)

    log_update("\nCombining level 2 and 3 data")
    giant_level2 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level2_fusion_structure_links.csv').rename(columns={'ID':'FusionGID'})
    giant_level2 = pd.merge(giant_level2, level2_df[['FusionGID','FusionGene','URL','HGID','TGID']], on=['FusionGID','FusionGene'], how='left')
    log_update(f"\tSize of raw_data/fusionpdb/FusionPDB_level2_fusion_structure_links.csv: {len(giant_level2)}")

    giant_level3 = pd.read_csv('raw_data/fusionpdb/FusionPDB_level3_fusion_structure_links.csv').rename(columns={'ID':'FusionGID'})
    giant_level3 = pd.merge(giant_level3, level3_df[['FusionGID','FusionGene','URL','HGID','TGID']], on=['FusionGID','FusionGene'], how='left')
    log_update(f"\tSize of raw_data/fusionpdb/FusionPDB_level3_fusion_structure_links.csv: {len(giant_level3)}")

    giant_level2['Level'] = [2]*len(giant_level2)
    giant_level3['Level'] = [3]*len(giant_level3)

    # All Level 2 and 3 structure links in one table
    giant_sl = pd.concat([giant_level2, giant_level3]).drop_duplicates().reset_index(drop=True)
    giant_sl.sort_values(by='FusionGID', ascending=True).reset_index(drop=True).to_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv', index=False)

    log_update(f"\nSaving file with all Level 2 and 3 Structure links (size: {len(giant_sl)}) to: processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_structure_links.csv")

    # All Level 2 and 3 head/tail info, merged with the structure-link table
    giant_ht = pd.concat([level2_df, level3_df]).reset_index(drop=True)
    giant_ht['FusionGID'] = giant_ht['FusionGID'].astype(str)
    giant_with_ht = combine_ht_info_with_structure_links(giant_ht, ht_df)
    giant_with_ht.sort_values(by='FusionGID', ascending=True).reset_index(drop=True).to_csv('processed_data/fusionpdb/intermediates/giant_level2-3_fusion_protein_head_tail_info.csv', index=False)

    # Finally, download every linked structure file
    download_structures(giant_sl['Structure Link'].tolist())


def main():
    with open_logfile("fetch_fusionpdb_data_log.txt"):
        scrape_fusionpdb_level_2_3()


if __name__ == "__main__":
    main()