# OncoDigger / app.py
import streamlit as st
import time
import json
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import squarify
import numpy as np
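
# OncoDigger: explore Word2Vec models trained on PubMed abstract corpora.
# The user picks a corpus in the sidebar, enters a search term, and the app
# renders the most similar words and human gene symbols as treemaps.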
# Define the HTML and CSS styles (light-blue page background)
st.markdown("""
<style>
body {
    background-color: #EBF5FB;
    /* color: #ffffff; */
}
.stApp {
    background-color: #EBF5FB;
    /* color: #ffffff; */
}
</style>
""", unsafe_allow_html=True)
opt = st.sidebar.radio("Select a PubMed Corpus", options=('Clotting corpus', 'Neuroblastoma corpus'))
if opt == "Clotting corpus":
    model_used = "pubmed_model_clotting"
    num_abstracts = 45493
    database_name = "Clotting"
elif opt == "Neuroblastoma corpus":
    model_used = "pubmed_model_neuroblastoma"
    num_abstracts = 29032
    database_name = "Neuroblastoma"
st.header(f"Word2Vec App for {database_name} Pubmed corpus.")
st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")
st.subheader("Open sidebar to choose corpus")
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus", max_chars=50)
query = text_input_value
query = query.lower()
# query = input("Enter your keyword(s):")

# Nothing else to do until the user enters a term.
if not query:
    st.stop()

bar = st.progress(0)
time.sleep(.2)
# Streamlit captions only support a fixed color palette, so use ":blue"
st.caption(f":blue[searching {num_abstracts} {database_name} PubMed abstracts]")
for i in range(10):
    bar.progress((i + 1) * 10)
    time.sleep(.1)
try:
    model = Word2Vec.load(model_used)  # you can continue training with the loaded model!
    words = list(model.wv.key_to_index)
    X = model.wv[model.wv.key_to_index]
    model2 = model.wv[query]  # raises KeyError if the term is not in the model vocabulary
    df = pd.DataFrame(X)
except Exception:
    st.error("Term occurrence is too low - please try another term")
    st.stop()
# def findRelationships(query, df):
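# Rank every word in the vocabulary by cosine-multiplicative similarity to the query.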
table = model.wv.most_similar_cosmul(query, topn=10000)
table = pd.DataFrame(table)
table.index.name = 'Rank'
table.columns = ['Word', 'SIMILARITY']
print()
print("Similarity to " + str(query))
pd.set_option('display.max_rows', None)
print(table.head(50))
# table.head(10).to_csv("clotting_sim1.csv", index=True)
# short_table = table.head(50)
# print(table)
st.subheader(f"Top 10 Words closely related to {query}")
# calculate the sizes of the squares in the treemap
short_table = table.head(10)
short_table.index += 1
short_table.index = 1 / short_table.index
sizes = short_table.index.tolist()
cmap = plt.cm.Greens(np.linspace(0.05, .5, len(sizes)))
color = [cmap[i] for i in range(len(sizes))]
short_table.set_index('Word', inplace=True)
squarify.plot(sizes=sizes, label=short_table.index.tolist(), color=color, edgecolor="#EBF5FB",
text_kwargs={'fontsize': 10})
# Plot the treemap using matplotlib
plt.axis('off')
fig = plt.gcf()
fig.patch.set_facecolor('#EBF5FB')
# Display the treemap in Streamlit
st.pyplot(fig)
plt.clf()
csv = table.head(100).to_csv().encode('utf-8')
st.download_button(label="download top 100 words (csv)", data=csv, file_name=f'{database_name}_words.csv', mime='text/csv')
# st.write(short_table)
#
print()
print("Human genes similar to " + str(query))
df1 = table
df2 = pd.read_csv('Human_Genes.csv')  # reference list of human gene symbols
m = df1.Word.isin(df2.symbol)
df1 = df1[m].copy()  # copy so the edits below don't modify (or warn about) the full table
df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
df1["Human Gene"] = df1["Human Gene"].str.upper()
print(df1.head(50))
print()
# df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
# time.sleep(2)
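
# Second treemap: the top gene hits, sized like the word treemap above.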
st.subheader(f"Top 10 Genes closely related to {query}")
df10 = df1.head(10)
df10.index = 1 / df10.index
sizes = df10.index.tolist()
cmap2 = plt.cm.Blues(np.linspace(0.05, .5, len(sizes)))
color2 = [cmap2[i] for i in range(len(sizes))]
df10.set_index('Human Gene', inplace=True)
squarify.plot(sizes=sizes, label=df10.index.tolist(), color=color2, edgecolor="#EBF5FB",
text_kwargs={'fontsize': 12})
# Plot the treemap using matplotlib
plt.axis('off')
fig2 = plt.gcf()
fig2.patch.set_facecolor('#EBF5FB')
# Display the treemap in Streamlit
st.pyplot(fig2)
csv = df1.head(100).to_csv().encode('utf-8')
st.download_button(label="download top 100 genes (csv)", data=csv, file_name=f'{database_name}_genes.csv', mime='text/csv')
# findRelationships(query, df)
# model = gensim.models.KeyedVectors.load_word2vec_format('pubmed_model_clotting', binary=True)
# similar_words = model.most_similar(word)
# output = json.dumps({"word": word, "similar_words": similar_words})
# st.write(output)