jfataphd committed on
Commit
e32c352
·
1 Parent(s): f67304b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +201 -100
app.py CHANGED
@@ -85,16 +85,16 @@ if query:
85
  bar.progress((i + 1) * 10)
86
  time.sleep(.1)
87
 
88
- try:
89
- model = Word2Vec.load(model_used) # you can continue training with the loaded model!
90
- words = list(model.wv.key_to_index)
91
- X = model.wv[model.wv.key_to_index]
92
- model2 = model.wv[query]
93
- df = pd.DataFrame(X)
94
-
95
- except:
96
- st.error("Term occurrence is too low - please try another term")
97
- st.stop()
98
  st.markdown("---")
99
  # def findRelationships(query, df):
100
 
@@ -133,62 +133,61 @@ if query:
133
  unsafe_allow_html=True)
134
 
135
 
136
- # calculate the sizes of the squares in the treemap
137
- short_table = table2.head(value_word).round(2)
138
- short_table.index += 1
139
- short_table.index = (1 / short_table.index)*10
140
- sizes = short_table.index.tolist()
141
 
142
 
143
- short_table.set_index('Word', inplace=True)
144
- # label = short_table.index.tolist()
145
- print(short_table.index)
146
- table2["SIMILARITY"] = 'Similarity Score ' + table2.head(10)["SIMILARITY"].round(2).astype(str)
147
- rank_num = list(short_table.index.tolist())
148
- # avg_size = sum(sizes) / len(short_table.index)
149
- df = short_table
150
- try:
151
- # Define the `text` column for labels and `href` column for links
152
- df['text'] = short_table.index
153
 
154
- df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
155
  '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
156
- df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
157
 
158
- df['database'] = database_name
159
 
160
 
161
- # print(sizes)
162
- # '{0} in {1}'.format(unicode(self.author, 'utf-8'), unicode(self.publication, 'utf-8'))
163
- # Create the treemap using `px.treemap`
164
- fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
165
  hover_name=(table2.head(value_word)['SIMILARITY']))
166
 
167
- fig.update(layout_coloraxis_showscale=False)
168
- fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
169
- fig.update_annotations(visible=False)
170
- fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
171
  hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
172
  texttemplate="</b><br><span "
173
  "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
174
  "<a href='%{customdata[0]}'>PubMed"
175
  "</a><br><a href='%{customdata[3]}'>Wikipedia"
176
  "</span></a>")
177
- fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
178
 
179
- # st.pyplot(fig2)
180
- st.plotly_chart(fig, use_container_width=True)
181
 
182
- # st.caption(
183
- # "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
184
- # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
185
 
186
- csv = table2.head(value_word).to_csv().encode('utf-8')
187
- st.download_button(label=f"download top {value_word} words (csv)", data=csv, file_name=f'{database_name}_words.csv',
188
  mime='text/csv')
189
-
190
- except:
191
- st.warning(
192
  f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus")
193
 
194
  st.markdown("---")
@@ -204,7 +203,7 @@ if query:
204
  df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
205
  df1["Human Gene"] = df1["Human Gene"].str.upper()
206
  # print(df1.head(50))
207
- print()
208
  # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
209
  # time.sleep(2)
210
  # Create the slider with increments of 5 up to 100
@@ -214,82 +213,184 @@ if query:
214
  f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
215
  f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
216
  unsafe_allow_html=True)
217
- value = st.slider("Gene", 0, 100, step=5)
218
- if value > 0:
219
  # st.subheader(f"Top {value} genes closely related to {query}: "
220
  # f"Click on the Pubmed and NCBI links for more gene information")
221
 
222
  st.markdown(
223
- f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value} "
224
  f"</span>genes similar to "
225
  f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and NCBI links for more gene information</span></p></b>",
226
  unsafe_allow_html=True)
227
 
228
- df10 = df1.head(value)
229
- df10.index = (1 / df10.index)*10000
230
- sizes = df10.index.tolist()
231
- df10.set_index('Human Gene', inplace=True)
232
-
233
- df3 = df1.copy()
234
- df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value)["SIMILARITY"].round(2).astype(str)
235
- df3.reset_index(inplace=True)
236
- df3 = df3.rename(columns={'Human Gene': 'symbol2'})
237
- # Use df.query to get a subset of df1 based on ids in df2
238
- subset = df3.head(value).query('symbol2 in @df2.symbol2')
239
- # Use merge to join the two DataFrames on id
240
- result = pd.merge(subset, df2, on='symbol2')
241
- # Show the result
242
- # print(result)
243
- # label = df10.index.tolist()
244
- df2 = df10
245
- try:
246
- # Define the `text` column for labels and `href` column for links
247
- df2['text'] = df10.index
248
- df2['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
249
- '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10.index]
250
- df2['href2'] = [f'https://www.ncbi.nlm.nih.gov/gene/?term=' + c for c in df10.index]
251
-
252
- df2['name'] = [c for c in result['Approved name']]
253
-
254
- df2['database'] = database_name
255
-
256
- # print(df['name'])
257
-
258
- # Create the treemap using `px.treemap`
259
- fig = px.treemap(df2, path=[df10.index], values=sizes,
260
- custom_data=['href', 'name', 'database', 'href2', 'text'], hover_name=(df3.head(value)['SIMILARITY']))
261
-
262
- fig.update(layout_coloraxis_showscale=False)
263
- fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
264
- fig.update_annotations(visible=False)
265
- fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
 
266
  hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
267
  texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}</span></b><br><span "
268
  "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
269
  "<a href='%{customdata[0]}'>PubMed"
270
  "</a><br><a href='%{customdata[3]}'>NCBI"
271
  "</span></a>")
272
- fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
273
- # # display the treemap in Streamlit
274
- # with treemap2:
275
 
276
- # st.pyplot(fig2)
277
- st.plotly_chart(fig, use_container_width=True)
278
 
279
- st.caption("Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
280
- st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
281
 
282
 
283
 
284
- csv = df1.head(value).to_csv().encode('utf-8')
285
- st.download_button(label=f"download top {value} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
286
  mime='text/csv')
287
 
288
 
289
- except:
290
- st.warning(
291
  f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus")
292
  st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  st.subheader("Cancer-related videos")
294
  if query:
295
  idlist=[]
 
85
  bar.progress((i + 1) * 10)
86
  time.sleep(.1)
87
 
88
+ # try:
89
+ model = Word2Vec.load(model_used) # you can continue training with the loaded model!
90
+ words = list(model.wv.key_to_index)
91
+ X = model.wv[model.wv.key_to_index]
92
+ model2 = model.wv[query]
93
+ df = pd.DataFrame(X)
94
+
95
+ # except:
96
+ # st.error("Term occurrence is too low - please try another term")
97
+ # st.stop()
98
  st.markdown("---")
99
  # def findRelationships(query, df):
100
 
 
133
  unsafe_allow_html=True)
134
 
135
 
136
+ # calculate the sizes of the squares in the treemap
137
+ short_table = table2.head(value_word).round(2)
138
+ short_table.index += 1
139
+ short_table.index = (1 / short_table.index)*10
140
+ sizes = short_table.index.tolist()
141
 
142
 
143
+ short_table.set_index('Word', inplace=True)
144
+ # label = short_table.index.tolist()
145
+ # print(short_table.index)
146
+ table2["SIMILARITY"] = 'Similarity Score ' + table2.head(10)["SIMILARITY"].round(2).astype(str)
147
+ rank_num = list(short_table.index.tolist())
148
+ # avg_size = sum(sizes) / len(short_table.index)
149
+ df = short_table
150
+ try:
151
+ # Define the `text` column for labels and `href` column for links
152
+ df['text'] = short_table.index
153
 
154
+ df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
155
  '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
156
+ df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
157
 
158
+ df['database'] = database_name
159
 
160
 
161
+ # print(sizes)
162
+ # '{0} in {1}'.format(unicode(self.author, 'utf-8'), unicode(self.publication, 'utf-8'))
163
+ # Create the treemap using `px.treemap`
164
+ fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
165
  hover_name=(table2.head(value_word)['SIMILARITY']))
166
 
167
+ fig.update(layout_coloraxis_showscale=False)
168
+ fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
169
+ fig.update_annotations(visible=False)
170
+ fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
171
  hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
172
  texttemplate="</b><br><span "
173
  "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
174
  "<a href='%{customdata[0]}'>PubMed"
175
  "</a><br><a href='%{customdata[3]}'>Wikipedia"
176
  "</span></a>")
177
+ fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
178
 
179
+ # st.pyplot(fig2)
180
+ st.plotly_chart(fig, use_container_width=True)
181
 
182
+ # st.caption(
183
+ # "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
184
+ # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
185
 
186
+ csv = table2.head(value_word).to_csv().encode('utf-8')
187
+ st.download_button(label=f"download top {value_word} words (csv)", data=csv, file_name=f'{database_name}_words.csv',
188
  mime='text/csv')
189
+ except:
190
+ st.warning(
 
191
  f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus")
192
 
193
  st.markdown("---")
 
203
  df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
204
  df1["Human Gene"] = df1["Human Gene"].str.upper()
205
  # print(df1.head(50))
206
+ # print()
207
  # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
208
  # time.sleep(2)
209
  # Create the slider with increments of 5 up to 100
 
213
  f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
214
  f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
215
  unsafe_allow_html=True)
216
+ value_gene = st.slider("Gene", 0, 100, step=5)
217
+ if value_gene > 0:
218
  # st.subheader(f"Top {value} genes closely related to {query}: "
219
  # f"Click on the Pubmed and NCBI links for more gene information")
220
 
221
  st.markdown(
222
+ f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
223
  f"</span>genes similar to "
224
  f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and NCBI links for more gene information</span></p></b>",
225
  unsafe_allow_html=True)
226
 
227
+ df10 = df1.head(value_gene)
228
+ df10.index = (1 / df10.index)*10000
229
+ sizes = df10.index.tolist()
230
+ df10.set_index('Human Gene', inplace=True)
231
+
232
+ df3 = df1.copy()
233
+ df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value_gene)["SIMILARITY"].round(2).astype(str)
234
+ df3.reset_index(inplace=True)
235
+ df3 = df3.rename(columns={'Human Gene': 'symbol2'})
236
+ # Use df.query to get a subset of df1 based on ids in df2
237
+ subset = df3.head(value_gene).query('symbol2 in @df2.symbol2')
238
+ # Use merge to join the two DataFrames on id
239
+ result = pd.merge(subset, df2, on='symbol2')
240
+ # Show the result
241
+ # print(result)
242
+ # label = df10.index.tolist()
243
+ # df2 = df10
244
+ # print(df2)
245
+ try:
246
+ # Define the `text` column for labels and `href` column for links
247
+ df10['text'] = df10.index
248
+ df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
249
+ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10['text']]
250
+ df10['href2'] = [f'https://www.ncbi.nlm.nih.gov/gene/?term=' + c for c in df10['text']]
251
+
252
+ df10['name'] = [c for c in result['Approved name']]
253
+
254
+ df10['database'] = database_name
255
+
256
+ # print(df['name'])
257
+
258
+ # Create the treemap using `px.treemap`
259
+ fig = px.treemap(df10, path=[df10['text']], values=sizes,
260
+ custom_data=['href', 'name', 'database', 'href2', 'text'], hover_name=(df3.head(value_gene)['SIMILARITY']))
261
+
262
+ fig.update(layout_coloraxis_showscale=False)
263
+ fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
264
+ fig.update_annotations(visible=False)
265
+ fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
266
  hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
267
  texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}</span></b><br><span "
268
  "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
269
  "<a href='%{customdata[0]}'>PubMed"
270
  "</a><br><a href='%{customdata[3]}'>NCBI"
271
  "</span></a>")
272
+ fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
273
+ # # display the treemap in Streamlit
274
+ # with treemap2:
275
 
276
+ # st.pyplot(fig2)
277
+ st.plotly_chart(fig, use_container_width=True)
278
 
279
+ st.caption("Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
280
+ st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
281
 
282
 
283
 
284
+ csv = df1.head(value_gene).to_csv().encode('utf-8')
285
+ st.download_button(label=f"download top {value_gene} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
286
  mime='text/csv')
287
 
288
 
289
+ except:
290
+ st.warning(
291
  f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus")
292
  st.markdown("---")
293
+
294
+ # st.write(short_table)
295
+ #
296
+
297
+ # print()
298
+ # print("Human genes similar to " + str(query))
299
+ df1 = table
300
+ df2 = pd.read_csv('protein.csv')
301
+ m = df1.Word.isin(df2.protein)
302
+ df1 = df1[m]
303
+ df1.rename(columns={'Word': 'Protein'}, inplace=True)
304
+ # print(df1)
305
+ df_len = len(df1)
306
+ # df1["Protein"] = df1["Protein"].str.upper()
307
+ # print(df1.head(50))
308
+ # print()
309
+ # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
310
+ # time.sleep(2)
311
+ # Create the slider with increments of 5 up to 100
312
+
313
+ st.markdown(
314
+ f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
315
+ f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
316
+ f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
317
+ f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
318
+ unsafe_allow_html=True)
319
+ value_protein = st.slider("Protein", 0, 100, step=5)
320
+ # print(value_protein)
321
+ if value_protein > 0:
322
+ # st.subheader(f"Top {value} genes closely related to {query}: "
323
+ # f"Click on the Pubmed and NCBI links for more gene information")
324
+
325
+ st.markdown(
326
+ f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_protein} "
327
+ f"</span>proteins similar to "
328
+ f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and Wikipedia links for more protein information</span></p></b>",
329
+ unsafe_allow_html=True)
330
+
331
+ df11 = df1.head(value_protein)
332
+ print(df11)
333
+
334
+ df11.index = (1 / df11.index) * 10000
335
+ sizes = df11.index.tolist()
336
+
337
+ df11.set_index('Protein', inplace=True)
338
+
339
+ df4 = df1.copy()
340
+ # print(df4.head(10))
341
+ df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_protein)["SIMILARITY"].round(2).astype(str)
342
+ df4.reset_index(inplace=True)
343
+ # df4 = df4.rename(columns={'Protein': 'symbol2'})
344
+ # print(df4)
345
+ # # Use df.query to get a subset of df1 based on ids in df2
346
+ # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
347
+ # # Use merge to join the two DataFrames on id
348
+ # result = pd.merge(subset, df2b, on='symbol2')
349
+ # print(result)
350
+ if value_protein <= df_len:
351
+ # Define the `text` column for labels and `href` column for links
352
+ df11['text'] = df11.index
353
+ df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
354
+ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
355
+ df11['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df11['text']]
356
+
357
+ df11['database'] = database_name
358
+
359
+ # df11['name'] = [c for c in result['Approved name']]
360
+
361
+ # Create the treemap using `px.treemap`
362
+ fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
363
+ hover_name=(df4.head(value_protein)['SIMILARITY']))
364
+
365
+ fig.update(layout_coloraxis_showscale=False)
366
+ fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
367
+ fig.update_annotations(visible=False)
368
+ fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
369
+ hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
370
+ texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}</span></b><br>"
371
+ "<a href='%{customdata[0]}'>PubMed"
372
+ "</a><br><a href='%{customdata[2]}'>Wikipedia"
373
+ "</span></a>")
374
+ fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
375
+ # # display the treemap in Streamlit
376
+ # with treemap2:
377
+
378
+ # st.pyplot(fig2)
379
+ st.plotly_chart(fig, use_container_width=True)
380
+
381
+ st.caption(
382
+ "Protein designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
383
+
384
+ csv = df1.head(value_protein).to_csv().encode('utf-8')
385
+ st.download_button(label=f"download top {value_protein} proteins (csv)", data=csv, file_name=f'{database_name}_genes.csv',
386
+ mime='text/csv')
387
+
388
+
389
+ else:
390
+ st.warning(f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus")
391
+ st.markdown("---")
392
+
393
+
394
  st.subheader("Cancer-related videos")
395
  if query:
396
  idlist=[]