import streamlit as st
import time
import re
import random
import urllib.request

import pandas as pd
import plotly.express as px
from gensim.models import Word2Vec

st.set_page_config(
    page_title="FATA4 Science",
    page_icon=":microscope:",
    layout="wide",  # alternative: "centered"
    initial_sidebar_state="auto",
    menu_items={
        'About': "FATA4 Science is a Natural Language Processing (NLP) tool that ...."
    },
)

# Custom CSS: light-blue sidebar and page background
st.markdown("""
<style>
    [data-testid=stSidebar] {
        background-color: #99CCFF;
    }
    body, .stApp {
        background-color: #CCFFFF;
    }
</style>
""", unsafe_allow_html=True)

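# Each corpus option maps to a pre-trained Word2Vec model file, its abstract count, and a display name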
opt = st.sidebar.radio("Select a PubMed Corpus", options=('Clotting corpus', 'Neuroblastoma corpus'))
if opt == "Clotting corpus":
    model_used = "pubmed_model_clotting"
    num_abstracts = 45493
    database_name = "Clotting"
elif opt == "Neuroblastoma corpus":
    model_used = "pubmed_model_neuroblastoma"
    num_abstracts = 29032
    database_name = "Neuroblastoma"
# if opt == "Breast Cancer corpus":
#     model_used = ("pubmed_model_breast_cancer")
#     num_abstracts = 290320
#     database_name = "Breast_cancer"
# if opt == "Mammary gland corpus":
#     model_used = ("pubmed_model_mammary_gland")
#     num_abstracts = 79032
#     database_name = "Mammary_gland"

st.header(":red[*F*]ast :red[*A*]cting :red[*T*]ext :red[*A*]nalysis (:red[*FATA*]) 4 Science")

st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")
st.markdown("---")

st.header(f":blue[{database_name} PubMed corpus.]")
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
query = text_input_value.lower()
# Check for spaces before stripping them, otherwise the multi-term warning can never fire
if " " in query.strip():
    st.write("Please only enter one term or a term without spaces")
query = re.sub("[,.?!&*;: ]", "", query)
if query:
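    # Cosmetic progress bar shown while the model loads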
    bar = st.progress(0)
    time.sleep(.05)
    st.caption(f"Searching {num_abstracts} {database_name} PubMed abstracts covering 1990-2022")

    for i in range(10):
        bar.progress((i + 1) * 10)
        time.sleep(.1)

    try:
        model = Word2Vec.load(model_used)  # pre-trained gensim Word2Vec model for this corpus
        _ = model.wv[query]  # raises KeyError if the term is absent from the vocabulary
    except (FileNotFoundError, KeyError):
        st.error("Term occurrence is too low - please try another term")
        st.stop()
    st.markdown("---")

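    # most_similar_cosmul ranks the whole vocabulary by multiplicative cosine
    # similarity (Levy & Goldberg, 2014); with a single query term this behaves
    # like a cosine-similarity ranking over the corpus vocabulary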
    table = pd.DataFrame(model.wv.most_similar_cosmul(query, topn=10000))
    table.index.name = 'Rank'
    table.columns = ['Word', 'SIMILARITY']

    pd.set_option('display.max_rows', None)
    table2 = table.copy()

    # Create the slider with increments of 5 up to 100

    st.markdown(
        f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
        f"<span style='color:red; font-style: italic;'>words</span> contextually "
        f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
        f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
        unsafe_allow_html=True)
    value_word = st.slider("Words", 0, 100, step=5)
    if value_word > 0:
        st.markdown(
            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
            f"</span>words similar to "
            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Wikipedia links for more word information</span></p></b>",
            unsafe_allow_html=True)


    # calculate the sizes of the squares in the treemap
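    # Areas follow reciprocal rank (1/rank), so the most similar word gets the largest tile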
    short_table = table2.head(value_word).round(2)
    short_table.index += 1
    short_table.index = (1 / short_table.index) * 10
    sizes = short_table.index.tolist()

    short_table.set_index('Word', inplace=True)
    table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
    df = short_table
    try:
        # Define the `text` column for labels and `href` column for links
        df['text'] = short_table.index

        df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                      f'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+{c}'
                      for c in short_table.index]
        df['href2'] = [f'https://en.wikipedia.org/wiki/{c}' for c in short_table.index]

        df['database'] = database_name


        # Create the treemap using `px.treemap`
        fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
                         hover_name=(table2.head(value_word)['SIMILARITY']))

        fig.update(layout_coloraxis_showscale=False)
        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
        fig.update_annotations(visible=False)
        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                          texttemplate="<span style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                       "<a href='%{customdata[0]}'>PubMed</a><br>"
                                       "<a href='%{customdata[3]}'>Wikipedia</a></span>")
        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])

        st.plotly_chart(fig, use_container_width=True)


        csv = table2.head(value_word).to_csv().encode('utf-8')
        st.download_button(label=f"download top {value_word} words (csv)", data=csv, file_name=f'{database_name}_words.csv',
                           mime='text/csv')

    except Exception:
        st.warning(
            f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus")

    st.markdown("---")
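    # Keep only the ranked words that are recognized human gene symbols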
    df1 = table
    df2 = pd.read_csv('Human_Genes.csv')
    m = df1.Word.isin(df2.symbol)
    df1 = df1[m]
    df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
    df1["Human Gene"] = df1["Human Gene"].str.upper()
    # Create the slider with increments of 5 up to 100

    st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
                f"<span style='color:red; font-style: italic;'>genes</span> contextually "
                f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
                f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
                    unsafe_allow_html=True)
    value = st.slider("Genes", 0, 100, step=5)
    if value > 0:
        st.markdown(
            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value} "
            f"</span>genes similar to "
            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the PubMed and NCBI links for more gene information</span></p></b>",
            unsafe_allow_html=True)

    df10 = df1.head(value).copy()
    df10.index = (1 / (df10.index + 1)) * 10000  # reciprocal rank; +1 avoids division by zero at rank 0
    sizes = df10.index.tolist()
    df10.set_index('Human Gene', inplace=True)

    df3 = df1.copy()
    df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value)["SIMILARITY"].round(2).astype(str)
    df3.reset_index(inplace=True)
    df3 = df3.rename(columns={'Human Gene': 'symbol2'})
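    # Join the top genes to their HGNC records for approved names; this assumes
    # Human_Genes.csv provides an upper-case 'symbol2' column alongside 'symbol' and 'Approved name'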
    # Use df.query to get a subset of df1 based on ids in df2
    subset = df3.head(value).query('symbol2 in @df2.symbol2')
    # Use merge to join the two DataFrames on id
    result = pd.merge(subset, df2, on='symbol2')
    try:
        # Define the `text` column for labels and `href`/`href2` columns for links
        df10['text'] = df10.index
        df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                        f'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+{c}'
                        for c in df10.index]
        df10['href2'] = [f'https://www.ncbi.nlm.nih.gov/gene/?term={c}' for c in df10.index]
        df10['name'] = result['Approved name'].tolist()
        df10['database'] = database_name

        # Create the treemap using `px.treemap`
        fig = px.treemap(df10, path=[df10.index], values=sizes,
                         custom_data=['href', 'name', 'database', 'href2', 'text'],
                         hover_name=df3.head(value)['SIMILARITY'])

        fig.update(layout_coloraxis_showscale=False)
        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
        fig.update_annotations(visible=False)
        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                          texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}</span></b><br>"
                                       "<span style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
                                       "<a href='%{customdata[0]}'>PubMed</a><br>"
                                       "<a href='%{customdata[3]}'>NCBI</a></span>")
        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
        st.plotly_chart(fig, use_container_width=True)

        st.caption("Gene designations and database provided by the HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
        st.caption("Gene designations added as exceptions: [p21, p53, her2, her3]")



        csv = df1.head(value).to_csv().encode('utf-8')
        st.download_button(label=f"download top {value} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
                       mime='text/csv')


    except Exception:
        st.warning(
            f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus")
    st.markdown("---")
    st.subheader("Cancer-related videos")
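    # Pull candidate video IDs by scraping the search pages of five cancer-focused YouTube channels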
    if query:
        channel_searches = [
            "https://www.youtube.com/@NCIgov/search?query=cancer",
            "https://www.youtube.com/@CancerCenter/search?query=cancer",
            "https://www.youtube.com/@NorthwesternMedicine/search?query=cancer",
            "https://www.youtube.com/@TEDEd/search?query=cancer",
            "https://www.youtube.com/@CancerResearchUK/search?query=cancer",
        ]
        video_ids = []
        for url in channel_searches:
            page = urllib.request.urlopen(url).read().decode()
            video_ids.extend(re.findall(r"watch\?v=(\S{11})", page))

        random.shuffle(video_ids)

        c1, c2, c3 = st.columns(3)


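        # Show three randomly chosen videos side by side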
        with c1:
            st.video("https://www.youtube.com/watch?v=" + video_ids[0])
        with c2:
            st.video("https://www.youtube.com/watch?v=" + video_ids[1])
        with c3:
            st.video("https://www.youtube.com/watch?v=" + video_ids[2])
    st.markdown("---")