Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -13,17 +13,10 @@ import urllib.request
|
|
13 |
import random
|
14 |
import plotly.express as px
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
layout="wide", #centered
|
21 |
-
initial_sidebar_state="auto",
|
22 |
-
menu_items={
|
23 |
-
'About': "Abstractalytics is a Natural Language Processing (NLP) tool that harnesses Word2Vec to mine"
|
24 |
-
" insight from pubmed abstracts. Created by Jimmie E. Fata, PhD"
|
25 |
-
}
|
26 |
-
)
|
27 |
|
28 |
# Define the HTML and CSS styles
|
29 |
st.markdown("""
|
@@ -50,24 +43,28 @@ st.markdown("""
|
|
50 |
|
51 |
st.header(":red[*Abstractalytics*]")
|
52 |
|
53 |
-
st.subheader(
|
54 |
-
|
|
|
|
|
55 |
|
56 |
def custom_subheader(text, identifier, font_size):
|
57 |
st.markdown(f"<h3 id='{identifier}' style='font-size: {font_size}px;'>{text}</h3>", unsafe_allow_html=True)
|
58 |
|
|
|
59 |
custom_subheader("Welcome to our innovative web2vec app designed to unlock the wealth of knowledge and insights hidden "
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
67 |
|
68 |
st.markdown("---")
|
69 |
|
70 |
-
#Define the correct password
|
71 |
# CORRECT_PASSWORD = "123"
|
72 |
|
73 |
# Define a function to check if the password is correct
|
@@ -82,9 +79,7 @@ st.markdown("---")
|
|
82 |
#
|
83 |
# # If the password is correct, show the app content
|
84 |
# if authenticate(password):
|
85 |
-
opt = st.sidebar.radio("Select a PubMed Corpus",
|
86 |
-
options=(
|
87 |
-
'Breast Cancer corpus', 'Lung Cancer corpus'))
|
88 |
# if opt == "Clotting corpus":
|
89 |
# model_used = ("pubmed_model_clotting")
|
90 |
# num_abstracts = 45493
|
@@ -101,6 +96,14 @@ if opt == "Lung Cancer corpus":
|
|
101 |
model_used = ("lung_cancer_pubmed_model")
|
102 |
num_abstracts = 143886
|
103 |
database_name = "Lung_cancer"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
st.header(f":blue[{database_name} Pubmed corpus.]")
|
106 |
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
|
@@ -115,7 +118,8 @@ query = re.sub(" ", "-", query)
|
|
115 |
if query:
|
116 |
bar = st.progress(0)
|
117 |
time.sleep(.05)
|
118 |
-
st.caption(
|
|
|
119 |
|
120 |
for i in range(10):
|
121 |
bar.progress((i + 1) * 10)
|
@@ -130,6 +134,7 @@ if query:
|
|
130 |
# print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
|
131 |
df = pd.DataFrame(X)
|
132 |
|
|
|
133 |
def get_compound_ids(compound_names):
|
134 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
135 |
compound_ids = list(executor.map(get_compound_id, compound_names))
|
@@ -149,6 +154,7 @@ if query:
|
|
149 |
return compound_id
|
150 |
return None
|
151 |
|
|
|
152 |
# except:
|
153 |
# st.error("Term occurrence is too low - please try another term")
|
154 |
# st.stop()
|
@@ -203,12 +209,11 @@ if query:
|
|
203 |
fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
204 |
fig.update_annotations(visible=False)
|
205 |
fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
"</span></a>")
|
212 |
fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
|
213 |
|
214 |
# st.pyplot(fig2)
|
@@ -220,10 +225,10 @@ if query:
|
|
220 |
|
221 |
csv = table2.head(value_word).to_csv().encode('utf-8')
|
222 |
st.download_button(label=f"download top {value_word} words (csv)", data=csv,
|
223 |
-
|
224 |
except:
|
225 |
st.warning(
|
226 |
-
|
227 |
|
228 |
# st.markdown("---")
|
229 |
# # st.write(short_table)
|
@@ -373,7 +378,7 @@ if query:
|
|
373 |
# Define the `text` column for labels and `href` column for links
|
374 |
df11['text'] = df11.index
|
375 |
df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
|
376 |
-
|
377 |
df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
|
378 |
assert isinstance(df11, object)
|
379 |
df11['database'] = database_name
|
@@ -382,17 +387,17 @@ if query:
|
|
382 |
|
383 |
# Create the treemap using `px.treemap`
|
384 |
fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
|
385 |
-
|
386 |
|
387 |
fig.update(layout_coloraxis_showscale=False)
|
388 |
fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
389 |
fig.update_annotations(visible=False)
|
390 |
fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
|
397 |
# # display the treemap in Streamlit
|
398 |
# with treemap2:
|
@@ -403,18 +408,19 @@ if query:
|
|
403 |
# st.caption(
|
404 |
# "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
|
405 |
# st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
|
406 |
-
st.caption(
|
|
|
407 |
st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
|
408 |
st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
|
409 |
|
410 |
csv = df1.head(value_gene).to_csv().encode('utf-8')
|
411 |
st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
|
412 |
-
|
413 |
|
414 |
|
415 |
else:
|
416 |
st.warning(
|
417 |
-
|
418 |
st.markdown("---")
|
419 |
# print()
|
420 |
# print("Human genes similar to " + str(query))
|
@@ -477,7 +483,7 @@ if query:
|
|
477 |
df13.set_index('Drugs', inplace=True)
|
478 |
df13['text'] = df13.index
|
479 |
df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
|
480 |
-
|
481 |
df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
|
482 |
assert isinstance(df13, object)
|
483 |
df13['database'] = database_name
|
@@ -486,17 +492,17 @@ if query:
|
|
486 |
|
487 |
# Create the treemap using `px.treemap`
|
488 |
fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
|
489 |
-
|
490 |
|
491 |
fig.update(layout_coloraxis_showscale=False)
|
492 |
fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
493 |
fig.update_annotations(visible=False)
|
494 |
fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
|
501 |
# # display the treemap in Streamlit
|
502 |
# with treemap2:
|
@@ -504,17 +510,16 @@ if query:
|
|
504 |
# st.pyplot(fig2)
|
505 |
st.plotly_chart(fig, use_container_width=True)
|
506 |
|
507 |
-
st.caption(
|
508 |
-
"Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
|
509 |
|
510 |
csv = df1.head(value_drug).to_csv().encode('utf-8')
|
511 |
st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
|
512 |
-
|
513 |
|
514 |
|
515 |
else:
|
516 |
st.warning(
|
517 |
-
|
518 |
st.markdown("---")
|
519 |
#
|
520 |
# st.markdown("---")
|
@@ -926,9 +931,8 @@ if query:
|
|
926 |
f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
|
927 |
st.markdown("---")
|
928 |
|
929 |
-
|
930 |
# import os
|
931 |
-
|
932 |
# from datasets import Dataset
|
933 |
|
934 |
# # Check if the comments directory exists
|
@@ -955,9 +959,6 @@ if query:
|
|
955 |
|
956 |
# print('Comment saved to dataset.')
|
957 |
|
958 |
-
|
959 |
-
|
960 |
-
|
961 |
# st.title("Abstractalytics Web App")
|
962 |
# st.write("We appreciate your feedback!")
|
963 |
|
|
|
13 |
import random
|
14 |
import plotly.express as px
|
15 |
|
16 |
+
st.set_page_config(page_title="Abstractalytics", page_icon=":microscope:", layout="wide", # centered
|
17 |
+
initial_sidebar_state="auto",
|
18 |
+
menu_items={'About': "Abstractalytics is a Natural Language Processing (NLP) tool that harnesses Word2Vec to mine"
|
19 |
+
" insight from pubmed abstracts. Created by Jimmie E. Fata, PhD"})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
# Define the HTML and CSS styles
|
22 |
st.markdown("""
|
|
|
43 |
|
44 |
st.header(":red[*Abstractalytics*]")
|
45 |
|
46 |
+
st.subheader(
|
47 |
+
"*A web app designed to explore :red[*PubMed abstracts*] for deeper understanding and fresh insights, driven "
|
48 |
+
"by Natural Language Processing (NLP) techniques.*")
|
49 |
+
|
50 |
|
51 |
def custom_subheader(text, identifier, font_size):
|
52 |
st.markdown(f"<h3 id='{identifier}' style='font-size: {font_size}px;'>{text}</h3>", unsafe_allow_html=True)
|
53 |
|
54 |
+
|
55 |
custom_subheader("Welcome to our innovative web2vec app designed to unlock the wealth of knowledge and insights hidden "
|
56 |
+
"within PubMed abstracts! To begin, simply select a corpus that interests you. Next, enter a single keyword "
|
57 |
+
"you wish to explore within the corpus. Abstractalytics' powerful Natural Language "
|
58 |
+
"Processing (NLP) algorithms will analyze the chosen corpus and present you with a list of top words, "
|
59 |
+
"genes, drugs, phytochemicals, and compounds that are contextually and semantically related "
|
60 |
+
"to your input. This advanced text-mining technique enables you to explore and understand complex "
|
61 |
+
"relationships, uncovering new discoveries and connections in your field of research across a massive "
|
62 |
+
"amount of abstracts. Dive in and enjoy the exploration! More oncology-related corpora coming soon.",
|
63 |
+
"unique-id", 18)
|
64 |
|
65 |
st.markdown("---")
|
66 |
|
67 |
+
# Define the correct password
|
68 |
# CORRECT_PASSWORD = "123"
|
69 |
|
70 |
# Define a function to check if the password is correct
|
|
|
79 |
#
|
80 |
# # If the password is correct, show the app content
|
81 |
# if authenticate(password):
|
82 |
+
opt = st.sidebar.radio("Select a PubMed Corpus", options=('Breast Cancer corpus', 'Lung Cancer corpus', 'Prostate Cancer corpus'))
|
|
|
|
|
83 |
# if opt == "Clotting corpus":
|
84 |
# model_used = ("pubmed_model_clotting")
|
85 |
# num_abstracts = 45493
|
|
|
96 |
model_used = ("lung_cancer_pubmed_model")
|
97 |
num_abstracts = 143886
|
98 |
database_name = "Lung_cancer"
|
99 |
+
if opt == "Breast Cancer corpus":
|
100 |
+
model_used = ("pubmed_model_breast_cancer2")
|
101 |
+
num_abstracts = 204381
|
102 |
+
database_name = "Breast_cancer"
|
103 |
+
if opt == "Prostate Cancer corpus":
|
104 |
+
model_used = ("prostate_cancer_pubmed_model")
|
105 |
+
num_abstracts = 89000
|
106 |
+
database_name = "Prostate_cancer"
|
107 |
|
108 |
st.header(f":blue[{database_name} Pubmed corpus.]")
|
109 |
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
|
|
|
118 |
if query:
|
119 |
bar = st.progress(0)
|
120 |
time.sleep(.05)
|
121 |
+
st.caption(
|
122 |
+
f"Searching {num_abstracts} {database_name} PubMed primary abstracts covering 1990-2022 (Reviews not included)")
|
123 |
|
124 |
for i in range(10):
|
125 |
bar.progress((i + 1) * 10)
|
|
|
134 |
# print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
|
135 |
df = pd.DataFrame(X)
|
136 |
|
137 |
+
|
138 |
def get_compound_ids(compound_names):
|
139 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
140 |
compound_ids = list(executor.map(get_compound_id, compound_names))
|
|
|
154 |
return compound_id
|
155 |
return None
|
156 |
|
157 |
+
|
158 |
# except:
|
159 |
# st.error("Term occurrence is too low - please try another term")
|
160 |
# st.stop()
|
|
|
209 |
fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
210 |
fig.update_annotations(visible=False)
|
211 |
fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
|
212 |
+
hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="<br><span "
|
213 |
+
"style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
|
214 |
+
"<a href='%{customdata[0]}'>PubMed"
|
215 |
+
"</a><br><br><a href='%{customdata[3]}'>Wikipedia"
|
216 |
+
"</span></a>")
|
|
|
217 |
fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
|
218 |
|
219 |
# st.pyplot(fig2)
|
|
|
225 |
|
226 |
csv = table2.head(value_word).to_csv().encode('utf-8')
|
227 |
st.download_button(label=f"download top {value_word} words (csv)", data=csv,
|
228 |
+
file_name=f'{database_name}_words.csv', mime='text/csv')
|
229 |
except:
|
230 |
st.warning(
|
231 |
+
f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
|
232 |
|
233 |
# st.markdown("---")
|
234 |
# # st.write(short_table)
|
|
|
378 |
# Define the `text` column for labels and `href` column for links
|
379 |
df11['text'] = df11.index
|
380 |
df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
|
381 |
+
'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
|
382 |
df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
|
383 |
assert isinstance(df11, object)
|
384 |
df11['database'] = database_name
|
|
|
387 |
|
388 |
# Create the treemap using `px.treemap`
|
389 |
fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
|
390 |
+
hover_name=(df4.head(value_gene)['SIMILARITY']))
|
391 |
|
392 |
fig.update(layout_coloraxis_showscale=False)
|
393 |
fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
394 |
fig.update_annotations(visible=False)
|
395 |
fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
|
396 |
+
hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
|
397 |
+
texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
|
398 |
+
"<a href='%{customdata[0]}'>PubMed"
|
399 |
+
"</a><br><br><a href='%{customdata[2]}'>GeneCard"
|
400 |
+
"</span></a>")
|
401 |
fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
|
402 |
# # display the treemap in Streamlit
|
403 |
# with treemap2:
|
|
|
408 |
# st.caption(
|
409 |
# "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
|
410 |
# st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
|
411 |
+
st.caption(
|
412 |
+
"Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
|
413 |
st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
|
414 |
st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
|
415 |
|
416 |
csv = df1.head(value_gene).to_csv().encode('utf-8')
|
417 |
st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
|
418 |
+
file_name=f'{database_name}_genes.csv', mime='text/csv')
|
419 |
|
420 |
|
421 |
else:
|
422 |
st.warning(
|
423 |
+
f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus, please choose a lower number")
|
424 |
st.markdown("---")
|
425 |
# print()
|
426 |
# print("Human genes similar to " + str(query))
|
|
|
483 |
df13.set_index('Drugs', inplace=True)
|
484 |
df13['text'] = df13.index
|
485 |
df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
|
486 |
+
'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
|
487 |
df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
|
488 |
assert isinstance(df13, object)
|
489 |
df13['database'] = database_name
|
|
|
492 |
|
493 |
# Create the treemap using `px.treemap`
|
494 |
fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
|
495 |
+
hover_name=(df6.head(value_drug)['SIMILARITY']))
|
496 |
|
497 |
fig.update(layout_coloraxis_showscale=False)
|
498 |
fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
499 |
fig.update_annotations(visible=False)
|
500 |
fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
|
501 |
+
hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
|
502 |
+
texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
|
503 |
+
"<a href='%{customdata[0]}'>PubMed"
|
504 |
+
"</a><br><br><a href='%{customdata[2]}'>Wikipedia"
|
505 |
+
"</span></a>")
|
506 |
fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
|
507 |
# # display the treemap in Streamlit
|
508 |
# with treemap2:
|
|
|
510 |
# st.pyplot(fig2)
|
511 |
st.plotly_chart(fig, use_container_width=True)
|
512 |
|
513 |
+
st.caption("Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
|
|
|
514 |
|
515 |
csv = df1.head(value_drug).to_csv().encode('utf-8')
|
516 |
st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
|
517 |
+
file_name=f'{database_name}_drugs.csv', mime='text/csv')
|
518 |
|
519 |
|
520 |
else:
|
521 |
st.warning(
|
522 |
+
f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
|
523 |
st.markdown("---")
|
524 |
#
|
525 |
# st.markdown("---")
|
|
|
931 |
f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
|
932 |
st.markdown("---")
|
933 |
|
|
|
934 |
# import os
|
935 |
+
|
936 |
# from datasets import Dataset
|
937 |
|
938 |
# # Check if the comments directory exists
|
|
|
959 |
|
960 |
# print('Comment saved to dataset.')
|
961 |
|
|
|
|
|
|
|
962 |
# st.title("Abstractalytics Web App")
|
963 |
# st.write("We appreciate your feedback!")
|
964 |
|