import streamlit as st
import time
import re
import random
import urllib.request

import pandas as pd
import plotly.express as px
from gensim.models import Word2Vec

st.set_page_config(
    page_title="FATA4 Science",
    page_icon=":microscope:",
    layout="wide",  # alternative: "centered"
    initial_sidebar_state="auto",
    menu_items={
        'About': "FATA4 Science is a Natural Language Processing (NLP) tool that ...."
    },
)

# Custom CSS: light-blue sidebar and page background
st.markdown("""
<style>
    [data-testid=stSidebar] {
        background-color: #99CCFF;
    }
    body, .stApp {
        background-color: #CCFFFF;
    }
</style>
""", unsafe_allow_html=True)

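# Each corpus option maps to a pre-trained Word2Vec model file, its abstract count, and a display name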
opt = st.sidebar.radio("Select a PubMed Corpus", options=('Clotting corpus', 'Neuroblastoma corpus'))
if opt == "Clotting corpus":
    model_used = "pubmed_model_clotting"
    num_abstracts = 45493
    database_name = "Clotting"
elif opt == "Neuroblastoma corpus":
    model_used = "pubmed_model_neuroblastoma"
    num_abstracts = 29032
    database_name = "Neuroblastoma"
# if opt == "Breast Cancer corpus":
#     model_used = ("pubmed_model_breast_cancer")
#     num_abstracts = 290320
#     database_name = "Breast_cancer"
# if opt == "Mammary gland corpus":
#     model_used = ("pubmed_model_mammary_gland")
#     num_abstracts = 79032
#     database_name = "Mammary_gland"

st.header(":red[*F*]ast :red[*A*]cting :red[*T*]ext :red[*A*]nalysis (:red[*FATA*]) 4 Science")

st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")
st.markdown("---")

st.header(f":blue[{database_name} PubMed corpus.]")
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
query = text_input_value.lower()
# Check for spaces before stripping them, otherwise the multi-term warning can never fire
if " " in query.strip():
    st.write("Please only enter one term or a term without spaces")
query = re.sub("[,.?!&*;: ]", "", query)
if query:
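    # Cosmetic progress bar shown while the model loads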
    bar = st.progress(0)
    time.sleep(.05)
    st.caption(f"Searching {num_abstracts} {database_name} PubMed abstracts covering 1990-2022")

    for i in range(10):
        bar.progress((i + 1) * 10)
        time.sleep(.1)

    try:
        model = Word2Vec.load(model_used)  # pre-trained gensim Word2Vec model for this corpus
        _ = model.wv[query]  # raises KeyError if the term is absent from the vocabulary
    except (FileNotFoundError, KeyError):
        st.error("Term occurrence is too low - please try another term")
        st.stop()
    st.markdown("---")

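    # most_similar_cosmul ranks the whole vocabulary by multiplicative cosine
    # similarity (Levy & Goldberg, 2014); with a single query term this behaves
    # like a cosine-similarity ranking over the corpus vocabulary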
    table = pd.DataFrame(model.wv.most_similar_cosmul(query, topn=10000))
    table.index.name = 'Rank'
    table.columns = ['Word', 'SIMILARITY']

    pd.set_option('display.max_rows', None)
    table2 = table.copy()

    # Create the slider with increments of 5 up to 100

    st.markdown(
        f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
        f"<span style='color:red; font-style: italic;'>words</span> contextually "
        f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
        f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
        unsafe_allow_html=True)
    value_word = st.slider("Words", 0, 100, step=5)
    if value_word > 0:
        st.markdown(
            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
            f"</span>words similar to "
            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Wikipedia links for more word information</span></p></b>",
            unsafe_allow_html=True)


    # calculate the sizes of the squares in the treemap
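    # Areas follow reciprocal rank (1/rank), so the most similar word gets the largest tile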
    short_table = table2.head(value_word).round(2)
    short_table.index += 1
    short_table.index = (1 / short_table.index) * 10
    sizes = short_table.index.tolist()

    short_table.set_index('Word', inplace=True)
    table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
    df = short_table
    try:
        # Define the `text` column for labels and `href` column for links
        df['text'] = short_table.index

        df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                      f'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+{c}'
                      for c in short_table.index]
        df['href2'] = [f'https://en.wikipedia.org/wiki/{c}' for c in short_table.index]

        df['database'] = database_name


        # Create the treemap using `px.treemap`
        fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
                         hover_name=(table2.head(value_word)['SIMILARITY']))

        fig.update(layout_coloraxis_showscale=False)
        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
        fig.update_annotations(visible=False)
        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                          texttemplate="<span style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                       "<a href='%{customdata[0]}'>PubMed</a><br>"
                                       "<a href='%{customdata[3]}'>Wikipedia</a></span>")
        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])

        st.plotly_chart(fig, use_container_width=True)


        csv = table2.head(value_word).to_csv().encode('utf-8')
        st.download_button(label=f"download top {value_word} words (csv)", data=csv, file_name=f'{database_name}_words.csv',
                           mime='text/csv')

    except Exception:
        st.warning(
            f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus")

    st.markdown("---")
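    # Keep only the ranked words that are recognized human gene symbols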
    df1 = table
    df2 = pd.read_csv('Human_Genes.csv')
    m = df1.Word.isin(df2.symbol)
    df1 = df1[m]
    df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
    df1["Human Gene"] = df1["Human Gene"].str.upper()
    # Create the slider with increments of 5 up to 100

    st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
                f"<span style='color:red; font-style: italic;'>genes</span> contextually "
                f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
                f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
                    unsafe_allow_html=True)
    value = st.slider("Genes", 0, 100, step=5)
    if value > 0:
        st.markdown(
            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value} "
            f"</span>genes similar to "
            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the PubMed and NCBI links for more gene information</span></p></b>",
            unsafe_allow_html=True)

    df10 = df1.head(value).copy()
    df10.index = (1 / (df10.index + 1)) * 10000  # reciprocal rank; +1 avoids division by zero at rank 0
    sizes = df10.index.tolist()
    df10.set_index('Human Gene', inplace=True)

    df3 = df1.copy()
    df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value)["SIMILARITY"].round(2).astype(str)
    df3.reset_index(inplace=True)
    df3 = df3.rename(columns={'Human Gene': 'symbol2'})
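    # Join the top genes to their HGNC records for approved names; this assumes
    # Human_Genes.csv provides an upper-case 'symbol2' column alongside 'symbol' and 'Approved name'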
    # Use df.query to get a subset of df1 based on ids in df2
    subset = df3.head(value).query('symbol2 in @df2.symbol2')
    # Use merge to join the two DataFrames on id
    result = pd.merge(subset, df2, on='symbol2')
    try:
        # Define the `text` column for labels and `href`/`href2` columns for links
        df10['text'] = df10.index
        df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                        f'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+{c}'
                        for c in df10.index]
        df10['href2'] = [f'https://www.ncbi.nlm.nih.gov/gene/?term={c}' for c in df10.index]
        df10['name'] = result['Approved name'].tolist()
        df10['database'] = database_name

        # Create the treemap using `px.treemap`
        fig = px.treemap(df10, path=[df10.index], values=sizes,
                         custom_data=['href', 'name', 'database', 'href2', 'text'],
                         hover_name=df3.head(value)['SIMILARITY'])

        fig.update(layout_coloraxis_showscale=False)
        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
        fig.update_annotations(visible=False)
        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                          texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}</span></b><br>"
                                       "<span style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                       "<a href='%{customdata[0]}'>PubMed</a><br>"
                                       "<a href='%{customdata[3]}'>NCBI</a></span>")
        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
        st.plotly_chart(fig, use_container_width=True)

        st.caption("Gene designations and database provided by the HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
        st.caption("Gene designations added as exceptions: [p21, p53, her2, her3]")



        csv = df1.head(value).to_csv().encode('utf-8')
        st.download_button(label=f"download top {value} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
                       mime='text/csv')


    except Exception:
        st.warning(
            f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus")
    st.markdown("---")
    st.subheader("Cancer-related videos")
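    # Pull candidate video IDs by scraping the search pages of five cancer-focused YouTube channels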
    if query:
        channel_searches = [
            "https://www.youtube.com/@NCIgov/search?query=cancer",
            "https://www.youtube.com/@CancerCenter/search?query=cancer",
            "https://www.youtube.com/@NorthwesternMedicine/search?query=cancer",
            "https://www.youtube.com/@TEDEd/search?query=cancer",
            "https://www.youtube.com/@CancerResearchUK/search?query=cancer",
        ]
        video_ids = []
        for url in channel_searches:
            page = urllib.request.urlopen(url).read().decode()
            video_ids.extend(re.findall(r"watch\?v=(\S{11})", page))

        random.shuffle(video_ids)

        c1, c2, c3 = st.columns(3)


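        # Show three randomly chosen videos side by side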
        with c1:
            st.video("https://www.youtube.com/watch?v=" + video_ids[0])
        with c2:
            st.video("https://www.youtube.com/watch?v=" + video_ids[1])
        with c3:
            st.video("https://www.youtube.com/watch?v=" + video_ids[2])
    st.markdown("---")