OncoDigger / app.py
jfataphd's picture
Update app.py
d4a2975
raw
history blame
12.3 kB
import streamlit as st
import time
import json
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import squarify
import numpy as np
import re
import urllib.request
import random
import plotly.express as px
# Page-level Streamlit settings: browser tab title and icon, wide layout,
# automatic sidebar state, and the text shown under the "About" menu entry.
_ABOUT_TEXT = "FATA4 Science is a Natural Language Processing (NLP) that ...."
_PAGE_SETTINGS = dict(
    page_title="FATA4 Science",
    page_icon=":microscope:",
    layout="wide",  # the alternative layout is "centered"
    initial_sidebar_state="auto",
    menu_items={'About': _ABOUT_TEXT},
)
st.set_page_config(**_PAGE_SETTINGS)
# Custom styling, injected as raw <style> blocks through st.markdown.
# The first sheet colours the sidebar, the second colours the page body and
# the Streamlit app container.
# NOTE(review): the "# ..." lines inside the second sheet are not CSS comment
# syntax (CSS uses /* ... */); they are kept verbatim here.
_SIDEBAR_STYLE = (
    "\n"
    "<style>\n"
    "[data-testid=stSidebar] {\n"
    "background-color: #99CCFF;\n"
    "}\n"
    "</style>\n"
)
_PAGE_STYLE = (
    "\n"
    "<style>\n"
    "body {\n"
    "background-color: #CCFFFF;\n"
    "# color: #ffffff;\n"
    "# font-size: 1px\n"
    "}\n"
    ".stApp {\n"
    "background-color: #CCFFFF;\n"
    "# color: #ffffff;\n"
    "# font-size: 1px\n"
    "}\n"
    "</style>\n"
)
for _style_sheet in (_SIDEBAR_STYLE, _PAGE_STYLE):
    st.markdown(_style_sheet, unsafe_allow_html=True)
# Corpus picker. Each sidebar option maps to the Word2Vec model file to load,
# the number of abstracts quoted in the search caption, and the display name
# used in headings, links and download file names.
_CORPUS_SETTINGS = {
    "Clotting corpus": ("pubmed_model_clotting", 45493, "Clotting"),
    "Neuroblastoma corpus": ("pubmed_model_neuroblastoma", 29032, "Neuroblastoma"),
}
opt = st.sidebar.radio("Select a PubMed Corpus", options=('Clotting corpus', 'Neuroblastoma corpus'))
_selected_corpus = _CORPUS_SETTINGS.get(opt)
if _selected_corpus is not None:
    model_used, num_abstracts, database_name = _selected_corpus
# Page title, corpus heading and the single-term search box.
st.header(":red[*F*]ast :red[*A*]cting :red[*T*]ext :red[*A*]nalysis (:red[*FATA*]) 4 Science")
st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")
st.markdown("---")
st.header(f":blue[{database_name} Pubmed corpus.]")
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")

# The multi-term check has to look at the raw input: the normalisation below
# removes every space, so checking the normalised query could never trigger
# the message. Leading/trailing whitespace alone does not count as two terms.
if len(text_input_value.split()) > 1:
    st.write("Please only enter one term or a term without spaces")

# Normalise the term: lower-case it and drop punctuation and spaces. The
# resulting `query` is what the rest of the script searches for.
query = re.sub("[,.?!&*;: ]", "", text_input_value.lower())
# Main search flow. For a non-empty query: load the selected Word2Vec model,
# rank the vocabulary by similarity to the query, show the top 10 words as a
# treemap, offer the top 100 as CSV, then show a slider-driven treemap of the
# most similar human genes (words that appear in Human_Genes.csv).
if query:
    # Cosmetic progress bar shown before the search starts.
    bar = st.progress(0)
    time.sleep(.05)
    st.caption(f"Searching {num_abstracts} {database_name} PubMed abstracts covering 1990-2022")
    for i in range(10):
        bar.progress((i + 1) * 10)
        time.sleep(.1)

    # Errors while loading the model are left to propagate so they are not
    # misreported as a low-frequency term; only a genuine vocabulary miss
    # produces the "too low" message.
    model = Word2Vec.load(model_used)
    if query not in model.wv.key_to_index:
        st.error("Term occurrence is too low - please try another term")
        st.stop()
    st.markdown("---")

    # Similarity ranking: one row per word, index = 0-based rank.
    table = pd.DataFrame(model.wv.most_similar_cosmul(query, topn=10000))
    table.index.name = 'Rank'
    table.columns = ['Word', 'SIMILARITY']
    pd.set_option('display.max_rows', None)

    # ---- Top-10 word treemap -------------------------------------------
    # Tile area is 10 / rank (rank starting at 1), so higher-ranked words get
    # larger tiles.
    short_table = table.head(10).round(2)
    short_table.index += 1
    short_table.index = (1 / short_table.index) * 10
    sizes = short_table.index.tolist()
    short_table.set_index('Word', inplace=True)
    rank_num = short_table.index.tolist()
    # Hover text for each tile; `table` itself keeps its numeric scores.
    word_hover = 'Similarity Score ' + table.head(10)["SIMILARITY"].round(2).astype(str)

    # NOTE(review): this template puts the corpus name (not the tile's word)
    # into every tile and labels the link "google"; it looks like a
    # placeholder. Kept byte-for-byte pending confirmation of the intended text.
    TEMPLATE = (
        "\n"
        "<br>\n"
        "{0}: <a href='https://pubmed.ncbi.nlm.nih.gov/?term={1}%5Bmh%5D+%20%20%20%20%20NOT\n"
        "+review%5Bpt%5D+AND+english%5Bla%5D+AND+hasabstract+AND+1990%253A2022%252F12%252F31%5Bdp%5D+AND+%22{2}%22'>google</a>\n"
    ).format(database_name, database_name, database_name)

    fig = px.treemap(names=rank_num, path=[short_table.index], values=sizes, hover_name=word_hover)
    fig.update(layout_coloraxis_showscale=False)
    fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF")
    fig.update_annotations(visible=False)
    fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                      hoverlabel_bgcolor="lightgreen", hoverlabel_bordercolor="#000000", texttemplate=TEMPLATE)
    fig.update_layout(uniformtext=dict(minsize=15, mode='hide'), treemapcolorway=["lightgreen"])

    st.subheader(f"Top 10 Words closely related to {query}")
    st.plotly_chart(fig, use_container_width=True)

    csv = table.head(100).to_csv().encode('utf-8')
    st.download_button(label="download top 100 words (csv)", data=csv, file_name=f'{database_name}_words.csv', mime='text/csv')
    st.markdown("---")

    # ---- Gene treemap ---------------------------------------------------
    # Keep only the ranked words that match a symbol in the gene list, then
    # upper-case them. `rename` returns a new frame, so the assignments below
    # never write into a slice of `table`.
    df2 = pd.read_csv('Human_Genes.csv')
    df1 = table[table.Word.isin(df2.symbol)].rename(columns={'Word': 'Human Gene'})
    df1["Human Gene"] = df1["Human Gene"].str.upper()

    st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
                f"<span style='color:red; font-style: italic;'>genes</span> contextually "
                f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
                f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
                unsafe_allow_html=True)
    # Slider in steps of 5 up to 100; 0 means "do not draw the gene treemap".
    value = st.slider("", 0, 100, step=5)
    if value > 0:
        st.markdown(
            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value} "
            f"</span>genes similar to "
            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and NCBI links for more gene information</span></p></b>",
            unsafe_allow_html=True)

        # Independent copy so the extra columns added below do not leak into
        # df1, which is also used for the CSV download.
        df10 = df1.head(value).copy()
        # Tile area is 10000 / rank. The index holds the 0-based rank from
        # `table`, so shift it by one first: without the shift a gene at
        # rank 0 would get an infinite tile size.
        sizes = ((1 / (df10.index + 1)) * 10000).tolist()
        df10 = df10.set_index('Human Gene')

        # Hover labels plus the lookup of each gene's approved name.
        df3 = df1.copy()
        df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value)["SIMILARITY"].round(2).astype(str)
        df3.reset_index(inplace=True)
        df3 = df3.rename(columns={'Human Gene': 'symbol2'})
        # Restrict to genes present in df2.symbol2, then join to pull in the
        # remaining gene-list columns.
        subset = df3.head(value).query('symbol2 in @df2.symbol2')
        result = pd.merge(subset, df2, on='symbol2')

        df = df10
        try:
            # `text` is the tile label; `href`/`href2` are the PubMed and
            # NCBI Gene links rendered inside each tile.
            df['text'] = df10.index
            df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                          '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10.index]
            df['href2'] = [f'https://www.ncbi.nlm.nih.gov/gene/?term=' + c for c in df10.index]
            # Raises if the merge returned a different number of rows than
            # the treemap has tiles; handled by the warning below.
            df['name'] = [c for c in result['Approved name']]
            df['database'] = database_name

            fig = px.treemap(df, path=[df10.index], values=sizes,
                             custom_data=['href', 'name', 'database', 'href2'], hover_name=(df3.head(value)['SIMILARITY']))
            fig.update(layout_coloraxis_showscale=False)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
            fig.update_annotations(visible=False)
            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                              texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{label}</span></b><br><span "
                                           "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                           "<a href='%{customdata[0]}'>PubMed"
                                           "</a><br><a href='%{customdata[3]}'>NCBI"
                                           "</span></a>")
            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])

            st.plotly_chart(fig, use_container_width=True)
            st.caption("Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
            st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
            csv = df1.head(value).to_csv().encode('utf-8')
            st.download_button(label=f"download top {value} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
                               mime='text/csv')
        except Exception:
            # Best-effort fallback kept from the original flow. Catching
            # Exception rather than using a bare except lets BaseException
            # subclasses (e.g. KeyboardInterrupt) pass through.
            st.warning(
                f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus")
    st.markdown("---")
# "Cancer-related videos" section: once a query has been entered, scrape the
# "cancer" search page of a fixed set of YouTube channels for video ids and
# embed up to three randomly chosen videos side by side.
st.subheader("Cancer-related videos")
if query:
    channel_search_urls = (
        "https://www.youtube.com/@NCIgov/search?query=cancer",
        "https://www.youtube.com/@CancerCenter/search?query=cancer",
        "https://www.youtube.com/@NorthwesternMedicine/search?query=cancer",
        "https://www.youtube.com/@TEDEd/search?query=cancer",
        "https://www.youtube.com/@CancerResearchUK/search?query=cancer",
    )
    video_ids = []
    for channel_url in channel_search_urls:
        try:
            # The context manager closes the HTTP response; the timeout keeps
            # one unresponsive channel from blocking the page.
            with urllib.request.urlopen(channel_url, timeout=10) as response:
                page_html = response.read().decode("utf-8", errors="replace")
        except OSError:
            # URLError/HTTPError and socket timeouts are OSError subclasses:
            # skip a channel that cannot be fetched instead of crashing.
            continue
        video_ids.extend(re.findall(r"watch\?v=(\S{11})", page_html))

    # The same id can match several times in one page; drop duplicates so the
    # three columns show three different videos.
    video_ids = list(dict.fromkeys(video_ids))
    random.shuffle(video_ids)

    # zip stops at the shorter input, so fewer than three ids simply leaves
    # the remaining columns empty rather than raising IndexError.
    for column, video_id in zip(st.columns(3), video_ids):
        with column:
            st.video("https://www.youtube.com/watch?v=" + video_id)
    st.markdown("---")