Spaces:
Sleeping
Sleeping
import streamlit as st | |
import os | |
import pandas as pd | |
import numpy as np | |
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances | |
# Page-level settings: browser-tab title and icon, wide layout, and the
# entries shown in the app menu.
st.set_page_config(
    page_title="PhenoGene",
    # The icon literal had been mis-decoded into Thai-script mojibake; its
    # byte pattern matches the UTF-8 encoding of the "technologist" emoji
    # (person + zero-width joiner + laptop). Escapes are used so the literal
    # survives any further re-encoding of this file.
    page_icon="\U0001F9D1\u200D\U0001F4BB",
    layout="wide",
    menu_items={
        'Get Help': 'https://www.ncbi.nlm.nih.gov/research/bionlp/',
        'About': "PhenoGene v1.0",
    },
)
# Constants
# Module-level list that the "Compute" handler further down appends the
# embedding of each entered HPO term to.
embs = []

# Markdown shown inside the "About" panel.
_about_markdown = """
- PhenoGene is a novel gene prioritization method capable of representing HPO terms into embeddings. Utilizing advanced graph embeddings methods, PhenoGene can learn an effective mapping between genes and HPO terms.
- Given a list of HPO terms, we compute the similarity with a Gene
- **Input:** List of HPO terms
- **Output:** Similarity score to the genes
- Contact: [NLM/NCBI BioNLP Research Group](https://www.ncbi.nlm.nih.gov/research/bionlp/)
"""

# Heading
st.title('PhenoGene Interactive Demo')

# The panel is created expanded so the description is visible on first load.
_about_panel = st.expander("About", expanded=True)
with _about_panel:
    st.write(_about_markdown)

# Empty markdown element placed after the panel.
st.markdown("")
def _embedding_lookup(frame):
    """Map each index label of *frame* to its row as a ``(1, n_dims)`` array.

    The width is taken from the data (``reshape(1, -1)``) instead of being
    hard-coded to 128, so a table with a different embedding size loads
    without a reshape error. If a label occurs more than once in the index,
    the last row wins.
    """
    return {
        label: row.reshape(1, -1)
        for label, row in zip(frame.index.tolist(), frame.values)
    }


# Gene embeddings: one row per gene, indexed by the "gene" column.
gdf = pd.read_csv("data/diff2vec_gene_embd.csv").set_index("gene")
genes = gdf.index.tolist()
gene_emb = gdf.values
gene_emb_data = _embedding_lookup(gdf)
st.session_state['gene_emb_data'] = gene_emb_data

# HPO term embeddings: one row per term, indexed by the "hpo_id" column.
hdf = pd.read_csv("data/diff2vec_hpo_embd.csv").set_index("hpo_id")
hpos = hdf.index.tolist()
hpo_emb = hdf.values
hpo_emb_data = _embedding_lookup(hdf)
st.session_state['hpo_emb_data'] = hpo_emb_data
def compute_similarity_with_gene(emb_src, genes, distance_metric='cosine'):
    """Rank genes by cosine similarity between their embedding and a query.

    Args:
        emb_src: Query embedding, passed as the first argument to
            ``cosine_similarity``. It must contain exactly one row; the
            caller in this script passes the mean of the entered HPO
            embeddings.
        genes: Iterable of gene identifiers. Each must be a key of the
            module-level ``gene_emb_data`` mapping. Repeated identifiers
            are scored once.
        distance_metric: Name of the metric. Only ``"cosine"`` is
            implemented.

    Returns:
        A DataFrame with columns ``gene`` and ``score``, sorted by
        descending score and re-indexed from 0. Empty when ``genes`` is
        empty.

    Raises:
        ValueError: If ``distance_metric`` is not ``"cosine"`` (previously
            this silently produced an empty result), or if ``emb_src`` has
            more than one row.
        KeyError: If a gene is missing from ``gene_emb_data``.
    """
    # Checked once up front; it does not depend on the gene being scored.
    if distance_metric != "cosine":
        raise ValueError(
            f"Unsupported distance_metric {distance_metric!r}; "
            "only 'cosine' is implemented."
        )

    # Keep first-seen order and drop repeats, matching dict-insertion order.
    unique_genes = list(dict.fromkeys(genes))
    if not unique_genes:
        return pd.DataFrame(columns=['gene', 'score'])

    # Stack the per-gene (1, n_dims) rows so one library call scores them all,
    # instead of one call per gene.
    gene_matrix = np.vstack([gene_emb_data[g] for g in unique_genes])
    similarities = cosine_similarity(emb_src, gene_matrix)
    # Reshaping to exactly one score per gene raises if emb_src had more than
    # one row, rather than silently keeping only the first row's scores.
    scores = similarities.reshape(len(unique_genes))

    ranked = pd.DataFrame({'gene': unique_genes, 'score': scores})
    return ranked.sort_values(by='score', ascending=False).reset_index(drop=True)
def convert_df(df):
    """Return *df* serialised as CSV text, encoded to UTF-8 bytes.

    The CSV is produced with ``to_csv`` defaults, so the index is written as
    the first column. Nothing in this function caches the result; the
    conversion is repeated on every call.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
# --- Input: HPO terms typed as a comma-separated list -----------------------
st.subheader("๐ฎ Enter HPO terms separated by a comma")
hpo_terms_text = st.text_area('Example: HP_0000006, HP_0000006', "HP_0000006, HP_0000006")
# Trim whitespace around each entry and drop blanks, so a trailing comma or
# an empty box is not looked up as an HPO term named "".
hpo_terms = [term for term in map(str.strip, hpo_terms_text.split(',')) if term]

# --- Compute: average the term embeddings and rank every gene ----------------
st.subheader("๐ป Hit Compute to calculate similarity to gene")
metrics = 'cosine'
no_emb = False
if st.button("Compute"):
    with st.spinner('Computing...'):
        if not hpo_terms:
            # Nothing usable was entered; report it the same way as an
            # unknown term instead of averaging an empty list.
            st.error("No Embeddings.")
            no_emb = True
        for h in hpo_terms:
            if h not in hpo_emb_data:
                st.error("No Embeddings.")
                no_emb = True
                break
            embs.append(hpo_emb_data[h])
        if not no_emb:
            # Only score when every term resolved. Previously this also ran
            # after a failed lookup and could crash on an empty `embs`.
            # Each entry is a (1, n_dims) row, so the mean over axis 0 is
            # again a single (1, n_dims) query vector.
            embs_mean = np.array(embs).mean(axis=0)
            result_df = compute_similarity_with_gene(embs_mean, genes, distance_metric=metrics)
    if no_emb:
        st.error("Embedding Error.")
    else:
        csv = convert_df(result_df)
        st.success("Done!")
        st.dataframe(result_df)
        st.download_button(
            label="Download results as CSV",
            data=csv,
            mime='text/csv',
        )