import streamlit as st import time import json from gensim.models import Word2Vec import pandas as pd import matplotlib.pyplot as plt import squarify import numpy as np import re import urllib.request import random import plotly.express as px st.set_page_config( page_title="FATA4 Science", page_icon=":microscope:", layout="wide", #centered initial_sidebar_state="auto", menu_items={ 'About': "FATA4 Science is a Natural Language Processing (NLP) that ...." } ) # Define the HTML and CSS styles st.markdown(""" """, unsafe_allow_html=True) st.markdown(""" """, unsafe_allow_html=True) opt=st.sidebar.radio("Select a PubMed Corpus", options=('Clotting corpus', 'Neuroblastoma corpus')) if opt == "Clotting corpus": model_used = ("pubmed_model_clotting") num_abstracts = 45493 database_name = "Clotting" if opt == "Neuroblastoma corpus": model_used = ("pubmed_model_neuroblastoma") num_abstracts = 29032 database_name = "Neuroblastoma" # if opt == "Breast Cancer corpus": # model_used = ("pubmed_model_breast_cancer") # num_abstracts = 290320 # database_name = "Breast_cancer" # if opt == "Mammary gland corpus": # model_used = ("pubmed_model_mammary_gland") # num_abstracts = 79032 # database_name = "Mammary_gland" st.header(":red[*F*]ast :red[*A*]cting :red[*T*]ext :red[*A*]nalysis (:red[*FATA*]) 4 Science") st.subheader("Uncovering knowledge through Natural Language Processing (NLP)") st.markdown("---") st.header(f":blue[{database_name} Pubmed corpus.]") text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus") query = text_input_value query = query.lower() query = re.sub("[,.?!&*;: ]", "", query) matches = [" "] if any([x in query for x in matches]): st.write("Please only enter one term or a term without spaces") # query = input ("Enter your keyword(s):") if query: bar = st.progress(0) time.sleep(.05) st.caption(f"Searching {num_abstracts} {database_name} PubMed abstracts covering 1990-2022") for i in range(10): bar.progress((i + 1) * 10) time.sleep(.1) try: model = Word2Vec.load(model_used) # you can continue training with the loaded model! words = list(model.wv.key_to_index) X = model.wv[model.wv.key_to_index] model2 = model.wv[query] df = pd.DataFrame(X) except: st.error("Term occurrence is too low - please try another term") st.stop() st.markdown("---") # def findRelationships(query, df): table = model.wv.most_similar_cosmul(query, topn=10000) table = (pd.DataFrame(table)) table.index.name = 'Rank' table.columns = ['Word', 'SIMILARITY'] # print() # print("Similarity to " + str(query)) pd.set_option('display.max_rows', None) table2 = table.copy() # print(table.head(50)) # table.head(10).to_csv("clotting_sim1.csv", index=True) # short_table = table.head(50) # print(table) # Create the slider with increments of 5 up to 100 st.markdown( f"Populate a treemap with the slider below to visualize " f"words contextually " f"and semantically similar to {query} " f"within the {database_name} corpus.
", unsafe_allow_html=True) value_word = st.slider("Words", 0, 100, step=5) if value_word > 0: # st.subheader(f"Top {value} genes closely related to {query}: " # f"Click on the Pubmed and NCBI links for more gene information") st.markdown( f"Top {value_word} " f"words similar to " f"{query}: Click on the squares to expand and the Wikipaedia links for more word information
", unsafe_allow_html=True) # calculate the sizes of the squares in the treemap short_table = table2.head(value_word).round(2) short_table.index += 1 short_table.index = (1 / short_table.index)*10 sizes = short_table.index.tolist() short_table.set_index('Word', inplace=True) # label = short_table.index.tolist() print(short_table.index) table2["SIMILARITY"] = 'Similarity Score ' + table2.head(10)["SIMILARITY"].round(2).astype(str) rank_num = list(short_table.index.tolist()) # avg_size = sum(sizes) / len(short_table.index) df = short_table try: # Define the `text` column for labels and `href` column for links df['text'] = short_table.index df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index] df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index] df['database'] = database_name # print(sizes) # '{0} in {1}'.format(unicode(self.author, 'utf-8'), unicode(self.publication, 'utf-8')) # Create the treemap using `px.treemap` fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'], hover_name=(table2.head(value_word)['SIMILARITY'])) fig.update(layout_coloraxis_showscale=False) fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0)) fig.update_annotations(visible=False) fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None, hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="
%{customdata[1]}
" "PubMed" "
Wikipedia" "") fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"]) # st.pyplot(fig2) st.plotly_chart(fig, use_container_width=True) # st.caption( # "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/") # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]") csv = table2.head(value_word).to_csv().encode('utf-8') st.download_button(label=f"download top {value_word} words (csv)", data=csv, file_name=f'{database_name}_words.csv', mime='text/csv') except: st.warning( f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus") st.markdown("---") # st.write(short_table) # # print() # print("Human genes similar to " + str(query)) df1 = table df2 = pd.read_csv('Human_Genes.csv') m = df1.Word.isin(df2.symbol) df1 = df1[m] df1.rename(columns={'Word': 'Human Gene'}, inplace=True) df1["Human Gene"] = df1["Human Gene"].str.upper() # print(df1.head(50)) print() # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False) # time.sleep(2) # Create the slider with increments of 5 up to 100 st.markdown(f"Populate a treemap with the slider below to visualize " f"genes contextually " f"and semantically similar to {query} " f"within the {database_name} corpus.
", unsafe_allow_html=True) value = st.slider("Gene", 0, 100, step=5) if value > 0: # st.subheader(f"Top {value} genes closely related to {query}: " # f"Click on the Pubmed and NCBI links for more gene information") st.markdown( f"Top {value} " f"genes similar to " f"{query}: Click on the squares to expand and the Pubmed and NCBI links for more gene information
", unsafe_allow_html=True) df10 = df1.head(value) df10.index = (1 / df10.index)*10000 sizes = df10.index.tolist() df10.set_index('Human Gene', inplace=True) df3 = df1.copy() df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value)["SIMILARITY"].round(2).astype(str) df3.reset_index(inplace=True) df3 = df3.rename(columns={'Human Gene': 'symbol2'}) # Use df.query to get a subset of df1 based on ids in df2 subset = df3.head(value).query('symbol2 in @df2.symbol2') # Use merge to join the two DataFrames on id result = pd.merge(subset, df2, on='symbol2') # Show the result # print(result) # label = df10.index.tolist() df2 = df10 try: # Define the `text` column for labels and `href` column for links df2['text'] = df10.index df2['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10.index] df2['href2'] = [f'https://www.ncbi.nlm.nih.gov/gene/?term=' + c for c in df10.index] df2['name'] = [c for c in result['Approved name']] df2['database'] = database_name # print(df['name']) # Create the treemap using `px.treemap` fig = px.treemap(df2, path=[df10.index], values=sizes, custom_data=['href', 'name', 'database', 'href2', 'text'], hover_name=(df3.head(value)['SIMILARITY'])) fig.update(layout_coloraxis_showscale=False) fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0)) fig.update_annotations(visible=False) fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None, hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="%{customdata[4]}
%{customdata[1]}
" "PubMed" "
NCBI" "") fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"]) # # display the treemap in Streamlit # with treemap2: # st.pyplot(fig2) st.plotly_chart(fig, use_container_width=True) st.caption("Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/") st.caption("Gene designation add in exceptions [p21, p53, her2, her3]") csv = df1.head(value).to_csv().encode('utf-8') st.download_button(label=f"download top {value} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv', mime='text/csv') except: st.warning( f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus") st.markdown("---") st.subheader("Cancer-related videos") if query: idlist=[] search_keyword = {query} html = urllib.request.urlopen("https://www.youtube.com/@NCIgov/search?query=cancer") html2 = urllib.request.urlopen("https://www.youtube.com/@CancerCenter/search?query=cancer") html3 = urllib.request.urlopen("https://www.youtube.com/@NorthwesternMedicine/search?query=cancer") html4 = urllib.request.urlopen("https://www.youtube.com/@TEDEd/search?query=cancer") html5 = urllib.request.urlopen("https://www.youtube.com/@CancerResearchUK/search?query=cancer") video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode()) video_ids2 = re.findall(r"watch\?v=(\S{11})", html2.read().decode()) video_ids3 = re.findall(r"watch\?v=(\S{11})", html3.read().decode()) video_ids4 = re.findall(r"watch\?v=(\S{11})", html4.read().decode()) video_ids5 = re.findall(r"watch\?v=(\S{11})", html5.read().decode()) for i in video_ids2: video_ids.append(i) for i in video_ids3: video_ids.append(i) for i in video_ids4: video_ids.append(i) for i in video_ids5: video_ids.append(i) random.shuffle(video_ids) c1, c2, c3 = st.columns(3) with c1: st.video("https://www.youtube.com/watch?v=" + video_ids[0]) with c2: st.video("https://www.youtube.com/watch?v=" + video_ids[1]) with c3: st.video("https://www.youtube.com/watch?v=" + video_ids[2]) st.markdown("---")