# PhenoGene-Demo / app.py
# aaditkapoorbionlp's picture
# Update app.py
# 9c06b92
import streamlit as st
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances
# Browser-tab title, icon, layout and menu entries for the app.
st.set_page_config(
    page_title="PhenoGene",
    # Technologist emoji (person + zero-width joiner + laptop). Written with
    # Unicode escapes so the icon cannot be corrupted if this file is ever
    # re-saved or transferred with the wrong text encoding.
    page_icon="\U0001F9D1\u200D\U0001F4BB",
    layout="wide",
    menu_items={
        'Get Help': 'https://www.ncbi.nlm.nih.gov/research/bionlp/',
        'About': "PhenoGene v1.0",
    },
)
# Constants
# Scratch list for HPO term embeddings; starts out empty.
embs = list()

# Markdown shown inside the "About" panel (passed verbatim to st.write).
_ABOUT_MARKDOWN = """
- PhenoGene is a novel gene prioritization method capable of representing HPO terms into embeddings. Utilizing advanced graph embeddings methods, PhenoGene can learn an effective mapping between genes and HPO terms.
- Given a list of HPO terms, we compute the similarity with a Gene
- **Input:** List of HPO terms
- **Output:** Similarity score to the genes
- Contact: [NLM/NCBI BioNLP Research Group](https://www.ncbi.nlm.nih.gov/research/bionlp/)
"""

# Heading: page title followed by an "About" panel that is open by default.
st.title('PhenoGene Interactive Demo')
with st.expander("About", expanded=True):
    st.write(_ABOUT_MARKDOWN)
    # Empty markdown element, kept from the original layout.
    st.markdown("")
def _load_embeddings(csv_path, index_col):
    """Load an embedding table from CSV.

    Args:
        csv_path: Path of the CSV file to read.
        index_col: Name of the column holding the row identifiers; it becomes
            the index of the returned frame.

    Returns:
        A tuple ``(frame, ids, matrix, lookup)`` where ``frame`` is the
        indexed DataFrame, ``ids`` is the list of index values, ``matrix`` is
        ``frame.values`` and ``lookup`` maps each id to its row reshaped to a
        single-row 2-D array.
    """
    frame = pd.read_csv(csv_path).set_index(index_col)
    ids = frame.index.tolist()
    matrix = frame.values
    # reshape(1, -1) keeps every vector as one 2-D row without hard-coding
    # the embedding width (the shipped files are described as 128-dim).
    lookup = {key: row.reshape(1, -1) for key, row in zip(ids, matrix)}
    return frame, ids, matrix, lookup


# NOTE(review): both CSV files are read at module level, i.e. each time the
# script body executes -- consider caching the loader if start-up is slow.

# Gene File, 128 dim embeddings
gdf, genes, gene_emb, gene_emb_data = _load_embeddings(
    "data/diff2vec_gene_embd.csv", "gene"
)
st.session_state['gene_emb_data'] = gene_emb_data

# All HPO embeddings, 128 dim embeddings
hdf, hpos, hpo_emb, hpo_emb_data = _load_embeddings(
    "data/diff2vec_hpo_embd.csv", "hpo_id"
)
st.session_state['hpo_emb_data'] = hpo_emb_data
@st.cache(allow_output_mutation=True)
def compute_similarity_with_gene(emb_src, genes, distance_metric='cosine'):
    """Rank genes by similarity between their embedding and ``emb_src``.

    Gene vectors are looked up in the module-level ``gene_emb_data`` mapping.

    Args:
        emb_src: Query embedding as a single-row 2-D array.
        genes: Iterable of gene identifiers to score; each must be a key of
            ``gene_emb_data``. Repeated identifiers are scored once.
        distance_metric: Similarity measure to use. Only ``'cosine'`` is
            implemented.

    Returns:
        A DataFrame with columns ``gene`` and ``score``, sorted by ``score``
        in descending order with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``distance_metric`` is not ``'cosine'``.
        KeyError: If a gene has no entry in ``gene_emb_data``.
    """
    if distance_metric != "cosine":
        # Previously an unknown metric silently produced an empty result.
        raise ValueError(
            f"Unsupported distance_metric {distance_metric!r}; only 'cosine' is implemented."
        )

    # Preserve first-seen order while dropping repeats, matching the
    # dict-keyed accumulation used before.
    unique_genes = list(dict.fromkeys(genes))
    if not unique_genes:
        return pd.DataFrame(columns=['gene', 'score'])

    # One pairwise call against the stacked gene matrix instead of one
    # cosine_similarity call per gene.
    gene_matrix = np.vstack([gene_emb_data[g] for g in unique_genes])
    scores = cosine_similarity(emb_src, gene_matrix).ravel()

    df = pd.DataFrame({'gene': unique_genes, 'score': scores})
    return df.sort_values(by='score', ascending=False).reset_index(drop=True)
@st.cache
def convert_df(df):
    """Serialize ``df`` to CSV and return the text encoded as UTF-8 bytes.

    The function is wrapped in ``st.cache`` so the conversion is not repeated
    on every rerun.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
# --- Input: HPO terms --------------------------------------------------------
# "\U0001F5AE" is the wired-keyboard emoji; escapes keep it encoding-safe.
st.subheader("\U0001F5AE Enter HPO terms separated by a comma")
hpo_terms_text = st.text_area('Example: HP_0000006, HP_0000006', "HP_0000006, HP_0000006")
# Trim whitespace around each term and drop empty entries so a trailing or
# doubled comma is not treated as an unknown HPO term.
hpo_terms = [term.strip() for term in hpo_terms_text.split(',') if term.strip()]

# --- Compute: similarity of the averaged HPO embedding to every gene ---------
# "\U0001F4BB" is the laptop emoji.
st.subheader("\U0001F4BB Hit Compute to calculate similarity to gene")
metrics = 'cosine'
no_emb = False
if st.button("Compute"):
    with st.spinner('Computing...'):
        if not hpo_terms:
            # Nothing to average: report it instead of averaging an empty list.
            st.error("No Embeddings.")
            no_emb = True
        for h in hpo_terms:
            if h not in hpo_emb_data:
                st.error("No Embeddings.")
                no_emb = True
                break
            embs.append(hpo_emb_data[h])
        if not no_emb:
            # Only score genes when every term resolved; with a missing term
            # the mean would be taken over a partial (or empty) list.
            embs_mean = np.array(embs).mean(axis=0)
            result_df = compute_similarity_with_gene(embs_mean, genes, distance_metric=metrics)
    if no_emb:
        st.error("Embedding Error.")
    else:
        csv = convert_df(result_df)
        st.success("Done!")
        st.dataframe(result_df)
        st.download_button(
            label="Download results as CSV",
            data=csv,
            mime='text/csv',
        )