Kapoor committed on
Commit
c2e5c2b
·
1 Parent(s): 300a2c4
App.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import pandas as pd
4
+ import numpy as np
5
+ from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances
6
+
7
# Configure the browser tab (title and icon), the page layout and the entries
# shown in the app menu.
st.set_page_config(
    page_title="PhenoGene",
    # Technologist emoji (person + zero-width joiner + laptop), written with
    # escapes so the icon survives any re-encoding of this file.
    page_icon="\U0001F9D1\u200D\U0001F4BB",
    layout="wide",
    menu_items={
        'Get Help': 'https://www.ncbi.nlm.nih.gov/research/bionlp/',
        'About': "PhenoGene v1.0"
    }
)
16
+
17
# Accumulator for the embeddings of the HPO terms entered by the user; it is
# filled when the Compute button is pressed.
embs = []

# Markdown shown inside the "About" panel.
about_text = """
- This Application presents the demo for PhenoGene. Given a list of HPO terms, we compute the similarity with a Gene
- Contact: [NLM/NCBI BioNLP Research Group](https://www.ncbi.nlm.nih.gov/research/bionlp/)
"""

# Page header: title, banner image and the expanded "About" panel.
st.title('PhenoGene Interactive Demo')
st.image("https://www.nlm.nih.gov/images/NLMgeneric.jpg")
with st.expander("About", expanded=True):
    st.write(about_text)
    # Empty markdown element, used as spacing below the text.
    st.markdown("")
33
+
34
+
35
+
36
# Gene embeddings: one CSV row per gene, indexed by the "gene" column.
gdf = pd.read_csv("data/diff2vec_gene_embd.csv").set_index("gene")
genes = gdf.index.tolist()
gene_emb = gdf.values
# Map every gene to its embedding, stored as a (1, 128) row vector.
gene_emb_data = {
    gene_name: vector.reshape(1, 128)
    for gene_name, vector in zip(genes, gene_emb)
}
# Also publish the lookup table through Streamlit's session state.
st.session_state['gene_emb_data'] = gene_emb_data
44
+
45
+
46
# HPO term embeddings: one CSV row per term, indexed by the "hpo_id" column.
hdf = pd.read_csv("data/diff2vec_hpo_embd.csv").set_index("hpo_id")
hpos = hdf.index.tolist()
hpo_emb = hdf.values
# Map every HPO id to its embedding, stored as a (1, 128) row vector.
hpo_emb_data = {
    hpo_id: vector.reshape(1, 128)
    for hpo_id, vector in zip(hpos, hpo_emb)
}
# Also publish the lookup table through Streamlit's session state.
st.session_state['hpo_emb_data'] = hpo_emb_data
54
+
55
@st.cache(allow_output_mutation=True)
def compute_similarity_with_gene(emb_src, genes, distance_metric='cosine'):
    """Rank genes by cosine similarity to a source embedding.

    Args:
        emb_src: Source embedding, passed as the first argument to
            ``cosine_similarity``. It is compared with one gene embedding at
            a time and the result is read with ``.item()``, so it must hold
            a single row.
        genes: Iterable of gene identifiers; each one must be a key of the
            module-level ``gene_emb_data`` mapping.
        distance_metric: Metric name, matched case-insensitively. Only
            ``"cosine"`` is implemented.

    Returns:
        DataFrame with the columns ``index`` (row position before sorting),
        ``gene`` and ``cosine``, sorted by ``cosine`` in descending order.

    Raises:
        ValueError: If ``distance_metric`` names an unsupported metric.
    """
    # The signature defaults to 'cosine' while the UI passes 'Cosine'; compare
    # case-insensitively so both spellings work, and fail loudly on anything
    # else instead of silently returning an empty table.
    if str(distance_metric).lower() != "cosine":
        raise ValueError(f"Unsupported distance metric: {distance_metric!r}")

    data = {
        g: cosine_similarity(emb_src, gene_emb_data[g]).item()
        for g in genes
    }
    # Materialise the items view: DataFrame expects a concrete list of rows.
    df = pd.DataFrame(data=list(data.items()), columns=['gene', 'cosine'])
    return df.sort_values(by='cosine', ascending=False).reset_index()
63
+
64
@st.cache
def convert_df(df):
    """Serialise ``df`` to CSV and return the text encoded as UTF-8 bytes.

    The function is wrapped in ``st.cache`` so the conversion is not repeated
    on every rerun of the script.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
68
+
69
# --- User input -------------------------------------------------------------
st.subheader("Enter HPO terms separated by a comma")
hpo_terms_text = st.text_area('Example: HP_0000006, HP_0000006')
# Strip whitespace and drop empty tokens so that blank input or a trailing
# comma is not treated as an (unknown) HPO term.
hpo_terms = [
    term for term in map(str.strip, hpo_terms_text.split(',')) if term
]

st.write("HPO Terms entered: ")
st.write(hpo_terms)

st.subheader("Hit Compute to calculate similarity to gene")
metrics = 'Cosine'

# --- Similarity computation ---------------------------------------------------
if st.button("Compute"):
    # Validate every term before computing anything: averaging a partial or
    # empty list of embeddings would give a misleading result or a NaN vector.
    missing_terms = [h for h in hpo_terms if h not in hpo_emb_data]
    if not hpo_terms:
        st.error("No HPO terms entered.")
    elif missing_terms:
        st.error(
            f"No Embeddings. Unknown HPO term(s): {', '.join(missing_terms)}"
        )
    else:
        with st.spinner('Computing...'):
            embs.extend(hpo_emb_data[h] for h in hpo_terms)
            # Average the selected HPO embeddings into one query vector.
            embs_mean = np.array(embs).mean(axis=0)
            result_df = compute_similarity_with_gene(
                embs_mean, genes, distance_metric=metrics
            )
            csv = convert_df(result_df)
        st.success("Done!")
        st.dataframe(result_df)

        st.download_button(
            label="Download results as CSV",
            data=csv,
            mime='text/csv',
        )
97
+
diff2vec_gene_embd.csv → data/diff2vec_gene_embd.csv RENAMED
File without changes
diff2vec_hpo_embd.csv → data/diff2vec_hpo_embd.csv RENAMED
File without changes