# NOTE(review): removed hosting-page residue ("Spaces:" / "Runtime error")
# that was captured with the source and is not part of the program.
# --- Imports and page-level setup -------------------------------------------
import streamlit as st
import time
import json
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import squarify
import numpy as np
import re
import urllib.request
import random
import plotly.express as px

# Configure the page; this must be the first Streamlit call in the script.
st.set_page_config(
    page_title="FATA4 Science",
    page_icon=":microscope:",
    layout="wide",  # alternative: "centered"
    initial_sidebar_state="auto",
    menu_items={
        'About': "FATA4 Science is a Natural Language Processing (NLP) that ...."
    }
)

# Inject CSS: light-blue sidebar, pale-cyan app background.
# NOTE: the original embedded "#"-prefixed lines inside the CSS (e.g.
# "# color: #ffffff;"); "#" is not a CSS comment delimiter, so browsers
# silently dropped those invalid declarations — they are removed here.
st.markdown("""
<style>
[data-testid=stSidebar] {
    background-color: #99CCFF;
}
</style>
""", unsafe_allow_html=True)
st.markdown("""
<style>
body {
    background-color: #CCFFFF;
}
.stApp {
    background-color: #CCFFFF;
}
</style>
""", unsafe_allow_html=True)
# --- Corpus selection --------------------------------------------------------
# Each sidebar option maps to (saved Word2Vec model path, abstract count, label).
# Using a single mapping instead of chained `if`s guarantees the three
# variables below are always defined for whichever option the radio returns.
CORPORA = {
    'Clotting corpus': ("pubmed_model_clotting", 45493, "Clotting"),
    'Neuroblastoma corpus': ("pubmed_model_neuroblastoma", 29032, "Neuroblastoma"),
    # Disabled corpora, kept for reference:
    # 'Breast Cancer corpus': ("pubmed_model_breast_cancer", 290320, "Breast_cancer"),
    # 'Mammary gland corpus': ("pubmed_model_mammary_gland", 79032, "Mammary_gland"),
}
opt = st.sidebar.radio("Select a PubMed Corpus", options=tuple(CORPORA))
model_used, num_abstracts, database_name = CORPORA[opt]
# --- Header and query input --------------------------------------------------
st.header(":red[*F*]ast :red[*A*]cting :red[*T*]ext :red[*A*]nalysis (:red[*FATA*]) 4 Science")
st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")
st.markdown("---")
st.header(f":blue[{database_name} Pubmed corpus.]")

text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
query = text_input_value.lower()
# BUG FIX: warn about multi-word input BEFORE stripping — the regex below
# removes spaces, so the original check on the cleaned query could never fire.
if " " in query.strip():
    st.write("Please only enter one term or a term without spaces")
# Strip punctuation/whitespace so the term matches the model vocabulary.
query = re.sub("[,.?!&*;: ]", "", query)

if query:
    # Cosmetic progress bar while the model loads.
    bar = st.progress(0)
    time.sleep(.05)
    st.caption(f"Searching {num_abstracts} {database_name} PubMed abstracts covering 1990-2022")
    for i in range(10):
        bar.progress((i + 1) * 10)
        time.sleep(.1)
    try:
        model = Word2Vec.load(model_used)
        # Vocabulary check: raises KeyError when the term is absent
        # (occurrence below the model's training threshold).
        _ = model.wv[query]
    except Exception:
        st.error("Term occurrence is too low - please try another term")
        st.stop()
    st.markdown("---")

    # --- Similar-words treemap -----------------------------------------------
    # Rank the vocabulary by cosmul similarity to the query (top 10k terms).
    table = pd.DataFrame(model.wv.most_similar_cosmul(query, topn=10000))
    table.index.name = 'Rank'
    table.columns = ['Word', 'SIMILARITY']
    pd.set_option('display.max_rows', None)
    table2 = table.copy()
    st.markdown(
        f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
        f"<span style='color:red; font-style: italic;'>words</span> contextually "
        f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
        f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
        unsafe_allow_html=True)
    value_word = st.slider("Words", 0, 100, step=5)
    if value_word > 0:
        st.markdown(
            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
            f"</span>words similar to "
            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Wikipaedia links for more word information</span></p></b>",
            unsafe_allow_html=True)
        # Treemap square sizes decay with rank: size = 10 / rank (1-based).
        short_table = table2.head(value_word).round(2)
        short_table.index += 1
        short_table.index = (1 / short_table.index) * 10
        sizes = short_table.index.tolist()
        short_table.set_index('Word', inplace=True)
        # BUG FIX: hover labels are built for every displayed row; the
        # original used head(10) and left NaN labels past the tenth word.
        table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
        df = short_table
        try:
            # `text` holds labels; `href`/`href2` hold PubMed / Wikipedia links.
            df['text'] = short_table.index
            df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                          '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c
                          for c in short_table.index]
            df['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in short_table.index]
            df['database'] = database_name
            fig = px.treemap(df, path=[short_table.index], values=sizes,
                             custom_data=['href', 'text', 'database', 'href2'],
                             hover_name=(table2.head(value_word)['SIMILARITY']))
            fig.update(layout_coloraxis_showscale=False)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
            fig.update_annotations(visible=False)
            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                              texttemplate="</b><br><span "
                                           "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                           "<a href='%{customdata[0]}'>PubMed"
                                           "</a><br><a href='%{customdata[3]}'>Wikipedia"
                                           "</span></a>")
            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
            st.plotly_chart(fig, use_container_width=True)
            csv = table2.head(value_word).to_csv().encode('utf-8')
            st.download_button(label=f"download top {value_word} words (csv)", data=csv,
                               file_name=f'{database_name}_words.csv', mime='text/csv')
        except Exception:
            # Slider asked for more rows than the vocabulary provides.
            st.warning(
                f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus")
    st.markdown("---")

    # --- Similar-genes treemap -----------------------------------------------
    # Keep only similar words that are known human gene symbols.
    df1 = table
    df2 = pd.read_csv('Human_Genes.csv')
    m = df1.Word.isin(df2.symbol)
    df1 = df1[m]
    df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
    df1["Human Gene"] = df1["Human Gene"].str.upper()
    st.markdown(
        f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
        f"<span style='color:red; font-style: italic;'>genes</span> contextually "
        f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
        f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
        unsafe_allow_html=True)
    value = st.slider("Gene", 0, 100, step=5)
    if value > 0:
        st.markdown(
            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value} "
            f"</span>genes similar to "
            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and NCBI links for more gene information</span></p></b>",
            unsafe_allow_html=True)
        # .copy() avoids mutating a slice of df1 (SettingWithCopyWarning).
        df10 = df1.head(value).copy()
        # BUG FIX: shift ranks to 1-based before taking the reciprocal — the
        # word section does this; without it a rank-0 gene yields 1/0 = inf.
        df10.index += 1
        df10.index = (1 / df10.index) * 10000
        sizes = df10.index.tolist()
        df10.set_index('Human Gene', inplace=True)
        df3 = df1.copy()
        df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value)["SIMILARITY"].round(2).astype(str)
        df3.reset_index(inplace=True)
        df3 = df3.rename(columns={'Human Gene': 'symbol2'})
        # NOTE(review): assumes Human_Genes.csv provides `symbol`, `symbol2`
        # and `Approved name` columns — confirm against the CSV schema.
        subset = df3.head(value).query('symbol2 in @df2.symbol2')
        # Join the similarity subset with the HGNC table to get gene names.
        result = pd.merge(subset, df2, on='symbol2')
        df2 = df10
        try:
            # `text` holds labels; `href`/`href2` hold PubMed / NCBI links.
            df2['text'] = df10.index
            df2['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                           '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c
                           for c in df10.index]
            df2['href2'] = ['https://www.ncbi.nlm.nih.gov/gene/?term=' + c for c in df10.index]
            df2['name'] = list(result['Approved name'])
            df2['database'] = database_name
            fig = px.treemap(df2, path=[df10.index], values=sizes,
                             custom_data=['href', 'name', 'database', 'href2', 'text'],
                             hover_name=(df3.head(value)['SIMILARITY']))
            fig.update(layout_coloraxis_showscale=False)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
            fig.update_annotations(visible=False)
            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                              texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}</span></b><br><span "
                                           "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                           "<a href='%{customdata[0]}'>PubMed"
                                           "</a><br><a href='%{customdata[3]}'>NCBI"
                                           "</span></a>")
            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
            st.plotly_chart(fig, use_container_width=True)
            st.caption("Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
            st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
            csv = df1.head(value).to_csv().encode('utf-8')
            st.download_button(label=f"download top {value} genes (csv)", data=csv,
                               file_name=f'{database_name}_genes.csv', mime='text/csv')
        except Exception:
            # Slider asked for more genes than the corpus contains.
            st.warning(
                f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus")
    st.markdown("---")
# --- Cancer-related videos ---------------------------------------------------
st.subheader("Cancer-related videos")
if query:
    # Scrape video ids from several curated oncology YouTube channels.
    # NOTE(review): the channel searches hardcode "cancer" rather than the
    # user's query — presumably intentional for this section; confirm.
    channel_urls = [
        "https://www.youtube.com/@NCIgov/search?query=cancer",
        "https://www.youtube.com/@CancerCenter/search?query=cancer",
        "https://www.youtube.com/@NorthwesternMedicine/search?query=cancer",
        "https://www.youtube.com/@TEDEd/search?query=cancer",
        "https://www.youtube.com/@CancerResearchUK/search?query=cancer",
    ]
    video_ids = []
    for url in channel_urls:
        # Timeout keeps a slow/unreachable channel from hanging the app;
        # the context manager closes the connection.
        with urllib.request.urlopen(url, timeout=10) as page:
            video_ids.extend(re.findall(r"watch\?v=(\S{11})", page.read().decode()))
    random.shuffle(video_ids)
    # Show up to three embedded videos side by side; zip with the slice
    # guards against an IndexError when fewer than 3 ids were scraped.
    for col, vid in zip(st.columns(3), video_ids[:3]):
        with col:
            st.video("https://www.youtube.com/watch?v=" + vid)
st.markdown("---")