jfataphd committed on
Commit
f67304b
·
1 Parent(s): a61d268

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -62
app.py CHANGED
@@ -53,6 +53,14 @@ if opt == "Neuroblastoma corpus":
53
  model_used = ("pubmed_model_neuroblastoma")
54
  num_abstracts = 29032
55
  database_name = "Neuroblastoma"
 
 
 
 
 
 
 
 
56
 
57
  st.header(":red[*F*]ast :red[*A*]cting :red[*T*]ext :red[*A*]nalysis (:red[*FATA*]) 4 Science")
58
 
@@ -89,6 +97,8 @@ if query:
89
  st.stop()
90
  st.markdown("---")
91
  # def findRelationships(query, df):
 
 
92
  table = model.wv.most_similar_cosmul(query, topn=10000)
93
  table = (pd.DataFrame(table))
94
  table.index.name = 'Rank'
@@ -103,58 +113,84 @@ if query:
103
  # short_table = table.head(50)
104
  # print(table)
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  # calculate the sizes of the squares in the treemap
108
- short_table = table2.head(10).round(2)
109
  short_table.index += 1
110
  short_table.index = (1 / short_table.index)*10
111
  sizes = short_table.index.tolist()
112
 
113
- cmap = plt.cm.Greens(np.linspace(0.05, .5, len(sizes)))
114
- color = [cmap[i] for i in range(len(sizes))]
115
 
116
  short_table.set_index('Word', inplace=True)
117
- squarify.plot(sizes=sizes, label=short_table.index.tolist(), color=color, edgecolor="#EBF5FB",
118
- text_kwargs={'fontsize': 10},)
119
- # # plot the treemap using matplotlib
120
- plt.axis('off')
121
- # Add legend to top right, outside plot region
122
- # plt.legend("upper right", bbox_to_anchor=(-.2, 0))
123
- fig = plt.gcf()
124
- fig.patch.set_facecolor('#CCFFFF')
125
- # print(table.head(10)["SIMILARITY"])
126
- # # display the treemap in Streamlit
127
  table2["SIMILARITY"] = 'Similarity Score ' + table2.head(10)["SIMILARITY"].round(2).astype(str)
128
  rank_num = list(short_table.index.tolist())
129
  # avg_size = sum(sizes) / len(short_table.index)
130
- print(rank_num)
 
 
 
 
 
 
 
 
 
 
 
131
  # print(sizes)
132
  # '{0} in {1}'.format(unicode(self.author, 'utf-8'), unicode(self.publication, 'utf-8'))
133
- TEMPLATE = """
134
- <br>
135
- {0}: <a href='https://pubmed.ncbi.nlm.nih.gov/?term={1}%5Bmh%5D+%20%20%20%20%20NOT
136
- +review%5Bpt%5D+AND+english%5Bla%5D+AND+hasabstract+AND+1990%253A2022%252F12%252F31%5Bdp%5D+AND+%22{2}%22'>google</a>
137
- """.format(database_name,database_name, database_name)
138
- fig = px.treemap(names=rank_num, path=[short_table.index], values=sizes, hover_name=(table2.head(10)['SIMILARITY']))
139
-
140
- fig.update(layout_coloraxis_showscale=False)
141
- fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF")
142
- fig.update_annotations(visible=False)
143
- fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
144
- hoverlabel_bgcolor="lightgreen", hoverlabel_bordercolor="#000000", texttemplate=TEMPLATE)
145
- fig.update_layout(uniformtext=dict(minsize=15, mode='hide'), treemapcolorway=["lightgreen"])
146
-
147
-
148
- # treemap1, treemap2 = st.columns(2)
149
- # with treemap1:
150
- st.subheader(f"Top 10 Words closely related to {query}")
151
- # st.pyplot(fig)
152
- # plt.clf()
153
- st.plotly_chart(fig, use_container_width=True)
154
-
155
-
156
- csv = table.head(100).to_csv().encode('utf-8')
157
- st.download_button(label="download top 100 words (csv)", data=csv, file_name=f'{database_name}_words.csv', mime='text/csv')
 
 
 
 
 
 
158
  st.markdown("---")
159
  # st.write(short_table)
160
  #
@@ -178,7 +214,7 @@ if query:
178
  f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
179
  f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
180
  unsafe_allow_html=True)
181
- value = st.slider("", 0, 100, step=5)
182
  if value > 0:
183
  # st.subheader(f"Top {value} genes closely related to {query}: "
184
  # f"Click on the Pubmed and NCBI links for more gene information")
@@ -192,20 +228,8 @@ if query:
192
  df10 = df1.head(value)
193
  df10.index = (1 / df10.index)*10000
194
  sizes = df10.index.tolist()
195
-
196
- cmap2 = plt.cm.Blues(np.linspace(0.05, .5, len(sizes)))
197
- color2 = [cmap2[i] for i in range(len(sizes))]
198
-
199
  df10.set_index('Human Gene', inplace=True)
200
- squarify.plot(sizes=sizes, label=df10.index.tolist(), color=color2, edgecolor="#EBF5FB",
201
- text_kwargs={'fontsize': 12})
202
- #
203
- # # plot the treemap using matplotlib
204
 
205
- plt.axis('off')
206
- fig2 = plt.gcf()
207
- fig2.patch.set_facecolor('#CCFFFF')
208
- #
209
  df3 = df1.copy()
210
  df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value)["SIMILARITY"].round(2).astype(str)
211
  df3.reset_index(inplace=True)
@@ -216,31 +240,31 @@ if query:
216
  result = pd.merge(subset, df2, on='symbol2')
217
  # Show the result
218
  # print(result)
219
-
220
- df = df10
221
  try:
222
  # Define the `text` column for labels and `href` column for links
223
- df['text'] = df10.index
224
- df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
225
  '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10.index]
226
- df['href2'] = [f'https://www.ncbi.nlm.nih.gov/gene/?term=' + c for c in df10.index]
227
 
228
- df['name'] = [c for c in result['Approved name']]
229
 
230
- df['database'] = database_name
231
 
232
  # print(df['name'])
233
 
234
  # Create the treemap using `px.treemap`
235
- fig = px.treemap(df, path=[df10.index], values=sizes,
236
- custom_data=['href', 'name', 'database', 'href2'], hover_name=(df3.head(value)['SIMILARITY']))
237
 
238
  fig.update(layout_coloraxis_showscale=False)
239
  fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
240
  fig.update_annotations(visible=False)
241
  fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
242
  hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
243
- texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{label}</span></b><br><span "
244
  "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
245
  "<a href='%{customdata[0]}'>PubMed"
246
  "</a><br><a href='%{customdata[3]}'>NCBI"
@@ -260,6 +284,8 @@ if query:
260
  csv = df1.head(value).to_csv().encode('utf-8')
261
  st.download_button(label=f"download top {value} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
262
  mime='text/csv')
 
 
263
  except:
264
  st.warning(
265
  f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus")
 
53
  model_used = ("pubmed_model_neuroblastoma")
54
  num_abstracts = 29032
55
  database_name = "Neuroblastoma"
56
+ # if opt == "Breast Cancer corpus":
57
+ # model_used = ("pubmed_model_breast_cancer")
58
+ # num_abstracts = 290320
59
+ # database_name = "Breast_cancer"
60
+ # if opt == "Mammary gland corpus":
61
+ # model_used = ("pubmed_model_mammary_gland")
62
+ # num_abstracts = 79032
63
+ # database_name = "Mammary_gland"
64
 
65
  st.header(":red[*F*]ast :red[*A*]cting :red[*T*]ext :red[*A*]nalysis (:red[*FATA*]) 4 Science")
66
 
 
97
  st.stop()
98
  st.markdown("---")
99
  # def findRelationships(query, df):
100
+
101
+
102
  table = model.wv.most_similar_cosmul(query, topn=10000)
103
  table = (pd.DataFrame(table))
104
  table.index.name = 'Rank'
 
113
  # short_table = table.head(50)
114
  # print(table)
115
 
116
+ # Create the slider with increments of 5 up to 100
117
+
118
+ st.markdown(
119
+ f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
120
+ f"<span style='color:red; font-style: italic;'>words</span> contextually "
121
+ f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
122
+ f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
123
+ unsafe_allow_html=True)
124
+ value_word = st.slider("Words", 0, 100, step=5)
125
+ if value_word > 0:
126
+ # st.subheader(f"Top {value} genes closely related to {query}: "
127
+ # f"Click on the Pubmed and NCBI links for more gene information")
128
+
129
+ st.markdown(
130
+ f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
131
+ f"</span>words similar to "
132
+ f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Wikipaedia links for more word information</span></p></b>",
133
+ unsafe_allow_html=True)
134
+
135
 
136
  # calculate the sizes of the squares in the treemap
137
+ short_table = table2.head(value_word).round(2)
138
  short_table.index += 1
139
  short_table.index = (1 / short_table.index)*10
140
  sizes = short_table.index.tolist()
141
 
 
 
142
 
143
  short_table.set_index('Word', inplace=True)
144
+ # label = short_table.index.tolist()
145
+ print(short_table.index)
 
 
 
 
 
 
 
 
146
  table2["SIMILARITY"] = 'Similarity Score ' + table2.head(10)["SIMILARITY"].round(2).astype(str)
147
  rank_num = list(short_table.index.tolist())
148
  # avg_size = sum(sizes) / len(short_table.index)
149
+ df = short_table
150
+ try:
151
+ # Define the `text` column for labels and `href` column for links
152
+ df['text'] = short_table.index
153
+
154
+ df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
155
+ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
156
+ df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
157
+
158
+ df['database'] = database_name
159
+
160
+
161
  # print(sizes)
162
  # '{0} in {1}'.format(unicode(self.author, 'utf-8'), unicode(self.publication, 'utf-8'))
163
+ # Create the treemap using `px.treemap`
164
+ fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
165
+ hover_name=(table2.head(value_word)['SIMILARITY']))
166
+
167
+ fig.update(layout_coloraxis_showscale=False)
168
+ fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
169
+ fig.update_annotations(visible=False)
170
+ fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
171
+ hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
172
+ texttemplate="</b><br><span "
173
+ "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
174
+ "<a href='%{customdata[0]}'>PubMed"
175
+ "</a><br><a href='%{customdata[3]}'>Wikipedia"
176
+ "</span></a>")
177
+ fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
178
+
179
+ # st.pyplot(fig2)
180
+ st.plotly_chart(fig, use_container_width=True)
181
+
182
+ # st.caption(
183
+ # "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
184
+ # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
185
+
186
+ csv = table2.head(value_word).to_csv().encode('utf-8')
187
+ st.download_button(label=f"download top {value_word} words (csv)", data=csv, file_name=f'{database_name}_words.csv',
188
+ mime='text/csv')
189
+
190
+ except:
191
+ st.warning(
192
+ f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus")
193
+
194
  st.markdown("---")
195
  # st.write(short_table)
196
  #
 
214
  f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
215
  f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
216
  unsafe_allow_html=True)
217
+ value = st.slider("Gene", 0, 100, step=5)
218
  if value > 0:
219
  # st.subheader(f"Top {value} genes closely related to {query}: "
220
  # f"Click on the Pubmed and NCBI links for more gene information")
 
228
  df10 = df1.head(value)
229
  df10.index = (1 / df10.index)*10000
230
  sizes = df10.index.tolist()
 
 
 
 
231
  df10.set_index('Human Gene', inplace=True)
 
 
 
 
232
 
 
 
 
 
233
  df3 = df1.copy()
234
  df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value)["SIMILARITY"].round(2).astype(str)
235
  df3.reset_index(inplace=True)
 
240
  result = pd.merge(subset, df2, on='symbol2')
241
  # Show the result
242
  # print(result)
243
+ # label = df10.index.tolist()
244
+ df2 = df10
245
  try:
246
  # Define the `text` column for labels and `href` column for links
247
+ df2['text'] = df10.index
248
+ df2['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
249
  '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10.index]
250
+ df2['href2'] = [f'https://www.ncbi.nlm.nih.gov/gene/?term=' + c for c in df10.index]
251
 
252
+ df2['name'] = [c for c in result['Approved name']]
253
 
254
+ df2['database'] = database_name
255
 
256
  # print(df['name'])
257
 
258
  # Create the treemap using `px.treemap`
259
+ fig = px.treemap(df2, path=[df10.index], values=sizes,
260
+ custom_data=['href', 'name', 'database', 'href2', 'text'], hover_name=(df3.head(value)['SIMILARITY']))
261
 
262
  fig.update(layout_coloraxis_showscale=False)
263
  fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
264
  fig.update_annotations(visible=False)
265
  fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
266
  hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
267
+ texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}</span></b><br><span "
268
  "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
269
  "<a href='%{customdata[0]}'>PubMed"
270
  "</a><br><a href='%{customdata[3]}'>NCBI"
 
284
  csv = df1.head(value).to_csv().encode('utf-8')
285
  st.download_button(label=f"download top {value} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
286
  mime='text/csv')
287
+
288
+
289
  except:
290
  st.warning(
291
  f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus")