jfataphd commited on
Commit
8d11e9c
·
1 Parent(s): 4a76b1d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +745 -729
app.py CHANGED
@@ -76,7 +76,8 @@ st.markdown("---")
76
  # # If the password is correct, show the app content
77
  # if authenticate(password):
78
  opt = st.sidebar.radio("Select a PubMed Corpus", options=('Breast Cancer corpus', 'Lung Cancer corpus',
79
- 'Colorectal Cancer corpus', 'Prostate Cancer corpus'))
 
80
  # if opt == "Clotting corpus":
81
  # model_used = ("pubmed_model_clotting")
82
  # num_abstracts = 45493
@@ -105,6 +106,10 @@ if opt == "Prostate Cancer corpus":
105
  model_used = ("prostate_cancer_pubmed_model")
106
  num_abstracts = 89782
107
  database_name = "Prostate_cancer"
 
 
 
 
108
 
109
  st.header(f":blue[{database_name} Pubmed corpus.]")
110
  text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
@@ -126,111 +131,118 @@ if query:
126
  bar.progress((i + 1) * 10)
127
  time.sleep(.1)
128
 
129
- # try:
130
- model = Word2Vec.load(f"{model_used}") # you can continue training with the loaded model!
131
- words = list(model.wv.key_to_index)
132
- X = model.wv[model.wv.key_to_index]
133
- # print(model.wv['bfgf'])
134
- model2 = model.wv[query]
135
- # print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
136
- df = pd.DataFrame(X)
137
 
138
 
139
- def get_compound_ids(compound_names):
140
- with concurrent.futures.ThreadPoolExecutor() as executor:
141
- compound_ids = list(executor.map(get_compound_id, compound_names))
142
- return compound_ids
143
 
144
 
145
- import requests
146
 
147
 
148
- def get_compound_id(compound_name):
149
- url = f"http://rest.kegg.jp/find/compound/{compound_name}"
150
- response = requests.get(url)
151
- if response.status_code == 200:
152
- result = response.text.split('\n')
153
- if result[0]:
154
- compound_id = result[0].split('\t')[0]
155
- return compound_id
156
- return None
157
 
158
 
159
  # except:
160
  # st.error("Term occurrence is too low - please try another term")
161
  # st.stop()
162
- st.markdown("---")
163
 
164
- table = model.wv.most_similar_cosmul(query, topn=10000)
165
- table = (pd.DataFrame(table))
166
- table.index.name = 'Rank'
167
- table.columns = ['Word', 'SIMILARITY']
 
168
 
169
- pd.set_option('display.max_rows', None)
170
- table2 = table.copy()
171
 
172
- # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
173
- # f"<span style='color:red; font-style: italic;'>words</span> contextually "
174
- # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
175
- # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
176
- # unsafe_allow_html=True)
177
 
178
- # Set the max number of words to display
179
- value_word = min(100, len(table2))
180
 
181
- st.markdown(
182
- f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
183
- f"</span>words contextually and semantically similar to "
184
- f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
185
- f"Click on the squares to expand and also the PubMed and Wikipedia links for more word information</span></p></b>",
186
- unsafe_allow_html=True)
187
 
188
- short_table = table2.head(value_word).round(2)
189
- short_table.index += 1
190
- short_table.index = (1 / short_table.index) * 10
191
- sizes = short_table.index.tolist()
192
 
193
- short_table.set_index('Word', inplace=True)
194
- table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
195
- rank_num = list(short_table.index.tolist())
 
 
 
196
 
197
- df = short_table
198
- try:
199
- df['text'] = short_table.index
200
- df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
 
 
 
 
 
 
 
 
 
 
201
  '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
202
- df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
203
 
204
- df.loc[:, 'database'] = database_name
205
 
206
- fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
207
- hover_name=(table2.head(value_word)['SIMILARITY']))
208
 
209
- fig.update(layout_coloraxis_showscale=False)
210
- fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
211
- fig.update_annotations(visible=False)
212
- fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
213
  hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="<br><span "
214
  "style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
215
  "<a href='%{customdata[0]}'>PubMed"
216
  "</a><br><br><a href='%{customdata[3]}'>Wikipedia"
217
  "</span></a>")
218
- fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
219
 
220
- # st.pyplot(fig2)
221
- st.plotly_chart(fig, use_container_width=True)
222
 
223
- # st.caption(
224
- # "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
225
- # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
226
 
227
- csv = table2.head(value_word).to_csv().encode('utf-8')
228
- st.download_button(label=f"download top {value_word} words (csv)", data=csv,
229
  file_name=f'{database_name}_words.csv', mime='text/csv')
230
- except:
231
- st.warning(
232
- f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
233
 
 
 
 
 
 
 
234
  # st.markdown("---")
235
  # # st.write(short_table)
236
  # #
@@ -334,669 +346,673 @@ if query:
334
 
335
  st.markdown("---")
336
 
337
- df1 = table.copy()
338
- df2 = pd.read_csv('Human Genes.csv')
339
- m = df1.Word.isin(df2.symbol)
340
- df1 = df1[m]
341
- df1.rename(columns={'Word': 'Genes'}, inplace=True)
342
- df_len = len(df1)
343
- print(len(df1))
344
-
345
- # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
346
- # f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
347
- # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
348
- # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
349
- # unsafe_allow_html=True)
350
-
351
- # Set the number of proteins to display
352
- value_gene = min(df_len, 100)
353
-
354
- st.markdown(
355
- f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
356
- f"</span>human genes contextually and semantically similar to "
357
- f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name} </span>corpus. Click on the squares to expand and also the Pubmed and GeneCard links for more gene information</span></p></b>",
358
- unsafe_allow_html=True)
359
-
360
- df11 = df1.head(value_gene).copy()
361
-
362
- df11.index = (1 / df11.index) * 10000
363
- sizes = df11.index.tolist()
364
-
365
- df11.set_index('Genes', inplace=True)
366
-
367
- df4 = df1.copy()
368
- # print(df4.head(10))
369
- df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_gene)["SIMILARITY"].round(2).astype(str)
370
- df4.reset_index(inplace=True)
371
- # df4 = df4.rename(columns={'Protein': 'symbol2'})
372
- # print(df4)
373
- # # Use df.query to get a subset of df1 based on ids in df2
374
- # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
375
- # # Use merge to join the two DataFrames on id
376
- # result = pd.merge(subset, df2b, on='symbol2')
377
- # print(result)
378
- if value_gene <= df_len:
379
- # Define the `text` column for labels and `href` column for links
380
- df11['text'] = df11.index
381
- df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
382
- '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
383
- df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
384
- assert isinstance(df11, object)
385
- df11['database'] = database_name
386
-
387
- # df11['name'] = [c for c in result['Approved name']]
388
-
389
- # Create the treemap using `px.treemap`
390
- fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
391
- hover_name=(df4.head(value_gene)['SIMILARITY']))
392
-
393
- fig.update(layout_coloraxis_showscale=False)
394
- fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
395
- fig.update_annotations(visible=False)
396
- fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
397
- hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
398
- texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
399
- "<a href='%{customdata[0]}'>PubMed"
400
- "</a><br><br><a href='%{customdata[2]}'>GeneCard"
401
- "</span></a>")
402
- fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
403
- # # display the treemap in Streamlit
404
- # with treemap2:
405
-
406
- # st.pyplot(fig2)
407
- st.plotly_chart(fig, use_container_width=True)
408
-
409
- # st.caption(
410
- # "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
411
- # st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
412
- st.caption(
413
- "Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
414
- st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
415
- st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
416
-
417
- csv = df1.head(value_gene).to_csv().encode('utf-8')
418
- st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
419
- file_name=f'{database_name}_genes.csv', mime='text/csv')
420
-
421
-
422
- else:
423
- st.warning(
424
- f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
425
- st.markdown("---")
426
- # print()
427
- # print("Human genes similar to " + str(query))
428
- df1 = table.copy()
429
- df2 = pd.read_csv('kegg_drug_list_lowercase.csv')
430
- m = df1.Word.isin(df2.drugs)
431
- df1 = df1[m]
432
- df1.rename(columns={'Word': 'Drugs'}, inplace=True)
433
- df_len = len(df1)
434
- # print(len(df1))
435
- # df1["Human Gene"] = df1["Human Gene"].str.upper()
436
- # print(df1.head(50))
437
- # print()
438
- # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
439
- # time.sleep(2)
440
- # Create the slider with increments of 5 up to 100
441
-
442
- # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
443
- value_drug = min(df1.shape[0], 100)
444
-
445
- # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
446
- # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
447
- # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
448
- # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
449
- # unsafe_allow_html=True)
450
-
451
- st.markdown(
452
- f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_drug} "
453
- f"</span>Drugs contextually and semantically similar to "
454
- f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
455
- unsafe_allow_html=True)
456
-
457
- df13 = df1.head(value_drug).copy()
458
-
459
- df13.index = (1 / df13.index) * 10000
460
- sizes = df13.index.tolist()
461
-
462
- df13.set_index('Drugs', inplace=True)
463
-
464
- df6 = df1.copy()
465
- # print(df4.head(10))
466
- df6["SIMILARITY"] = 'Similarity Score ' + df6.head(value_drug)["SIMILARITY"].round(2).astype(str)
467
- df6.reset_index(inplace=True)
468
- # df4 = df4.rename(columns={'Protein': 'symbol2'})
469
- # print(df4)
470
- # # Use df.query to get a subset of df1 based on ids in df2
471
- # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
472
- # # Use merge to join the two DataFrames on id
473
- # result = pd.merge(subset, df2b, on='symbol2')
474
- # print(result)
475
- if value_drug <= df_len:
476
- # Define the `text` column for labels and `href` column for links
477
- # Reset the index
478
- df13.reset_index(inplace=True)
479
-
480
- # Replace hyphens with spaces in the 'text' column
481
- df13['Drugs'] = df13['Drugs'].str.replace('-', ' ')
482
-
483
- # Set the 'text' column back as the index
484
  df13.set_index('Drugs', inplace=True)
485
- df13['text'] = df13.index
486
- df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
487
- '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
488
- df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
489
- assert isinstance(df13, object)
490
- df13['database'] = database_name
491
-
492
- # df11['name'] = [c for c in result['Approved name']]
493
-
494
- # Create the treemap using `px.treemap`
495
- fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
496
- hover_name=(df6.head(value_drug)['SIMILARITY']))
497
-
498
- fig.update(layout_coloraxis_showscale=False)
499
- fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
500
- fig.update_annotations(visible=False)
501
- fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
502
- hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
503
- texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
504
- "<a href='%{customdata[0]}'>PubMed"
505
- "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
506
- "</span></a>")
507
- fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
508
- # # display the treemap in Streamlit
509
- # with treemap2:
510
-
511
- # st.pyplot(fig2)
512
- st.plotly_chart(fig, use_container_width=True)
513
-
514
- st.caption("Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
515
-
516
- csv = df1.head(value_drug).to_csv().encode('utf-8')
517
- st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
518
- file_name=f'{database_name}_drugs.csv', mime='text/csv')
519
-
520
-
521
- else:
522
- st.warning(
523
- f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
524
- st.markdown("---")
525
- #
526
- # st.markdown("---")
527
- # # print()
528
- # # print("Human genes similar to " + str(query))
529
- # df1 = table.copy()
530
- # df2 = pd.read_csv('diseasesKegg.csv')
531
- # m = df1.Word.isin(df2.disease)
532
- # df1 = df1[m]
533
- # df1.rename(columns={'Word': 'Disease'}, inplace=True)
534
- # df_len = len(df1)
535
- # # print(len(df1))
536
- # # df1["Human Gene"] = df1["Human Gene"].str.upper()
537
- # # print(df1.head(50))
538
- # # print()
539
- # # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
540
- # # time.sleep(2)
541
- # # Create the slider with increments of 5 up to 100
542
- #
543
- # # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
544
- # value_disease = min(df1.shape[0], 100)
545
- #
546
- # # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
547
- # # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
548
- # # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
549
- # # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
550
- # # unsafe_allow_html=True)
551
- #
552
- # st.markdown(
553
- # f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_disease} "
554
- # f"</span>Diseases contextually and semantically similar to "
555
- # f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
556
- # unsafe_allow_html=True)
557
- #
558
- # df14 = df1.head(value_disease).copy()
559
- #
560
- # df14.index = (1 / df14.index) * 10000
561
- # sizes = df14.index.tolist()
562
- #
563
- # df14.set_index('Disease', inplace=True)
564
- #
565
- # df7 = df1.copy()
566
- # # print(df4.head(10))
567
- # df7["SIMILARITY"] = 'Similarity Score ' + df7.head(value_disease)["SIMILARITY"].round(2).astype(str)
568
- # df7.reset_index(inplace=True)
569
- # # df4 = df4.rename(columns={'Protein': 'symbol2'})
570
- # # print(df4)
571
- # # # Use df.query to get a subset of df1 based on ids in df2
572
- # # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
573
- # # # Use merge to join the two DataFrames on id
574
- # # result = pd.merge(subset, df2b, on='symbol2')
575
- # # print(result)
576
- # if value_disease <= df_len:
577
- # # Define the `text` column for labels and `href` column for links
578
- # # Reset the index
579
- # df14.reset_index(inplace=True)
580
- #
581
- # # Replace hyphens with spaces in the 'text' column
582
- # df14['Disease'] = df14['Disease'].str.replace('-', ' ')
583
- #
584
- # # Set the 'text' column back as the index
585
- # df14.set_index('Disease', inplace=True)
586
- # df14['text'] = df14.index
587
- # df14['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
588
- # '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df14['text']]
589
- # df14['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df14['text']]
590
- # assert isinstance(df14, object)
591
- # df14['database'] = database_name
592
- #
593
- # # df11['name'] = [c for c in result['Approved name']]
594
- #
595
- # # Create the treemap using `px.treemap`
596
- # fig = px.treemap(df14, path=[df14['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
597
- # hover_name=(df7.head(value_disease)['SIMILARITY']))
598
- #
599
- # fig.update(layout_coloraxis_showscale=False)
600
- # fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
601
- # fig.update_annotations(visible=False)
602
- # fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
603
- # hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
604
- # texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
605
- # "<a href='%{customdata[0]}'>PubMed"
606
- # "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
607
- # "</span></a>")
608
- # fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["PaleGoldenRod"])
609
- # # # display the treemap in Streamlit
610
- # # with treemap2:
611
- #
612
- # # st.pyplot(fig2)
613
- # st.plotly_chart(fig, use_container_width=True)
614
- #
615
- # st.caption("Disease designation and database provided by KEGG: https://www.genome.jp/kegg/disease/")
616
- #
617
- # csv = df1.head(value_disease).to_csv().encode('utf-8')
618
- # st.download_button(label=f"download top {value_disease} diseases (csv)", data=csv,
619
- # file_name=f'{database_name}_disease.csv', mime='text/csv')
620
- #
621
- #
622
- # else:
623
- # st.warning(
624
- # f"This selection exceeds the number of similar diseases related to {query} within the {database_name} corpus, please choose a lower number")
625
- # st.markdown("---")
626
 
627
- # st.markdown("---")
628
- # # print()
629
- # # print("Human genes similar to " + str(query))
630
- # df1 = table.copy()
631
- # df2 = pd.read_csv('pathwaysKegg.csv')
632
- # m = df1.Word.isin(df2.pathway)
633
- # df1 = df1[m]
634
- # df1.rename(columns={'Word': 'Pathway'}, inplace=True)
635
- # df_len = len(df1)
636
- # # print(len(df1))
637
- # # df1["Human Gene"] = df1["Human Gene"].str.upper()
638
- # # print(df1.head(50))
639
- # # print()
640
- # # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
641
- # # time.sleep(2)
642
- # # Create the slider with increments of 5 up to 100
643
- #
644
- # # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
645
- # value_pathway = min(df1.shape[0], 100)
646
- #
647
- # # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
648
- # # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
649
- # # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
650
- # # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
651
- # # unsafe_allow_html=True)
652
- #
653
- # st.markdown(
654
- # f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_pathway} "
655
- # f"</span>Pathways contextually and semantically similar to "
656
- # f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
657
- # unsafe_allow_html=True)
658
- #
659
- # df16 = df1.head(value_pathway).copy()
660
- #
661
- # df16.index = (1 / df16.index) * 10000
662
- # sizes = df16.index.tolist()
663
- #
664
- # df16.set_index('Pathway', inplace=True)
665
- #
666
- # df9 = df1.copy()
667
- # # print(df4.head(10))
668
- # df9["SIMILARITY"] = 'Similarity Score ' + df9.head(value_pathway)["SIMILARITY"].round(2).astype(str)
669
- # df9.reset_index(inplace=True)
670
- # # df4 = df4.rename(columns={'Protein': 'symbol2'})
671
- # # print(df4)
672
- # # # Use df.query to get a subset of df1 based on ids in df2
673
- # # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
674
- # # # Use merge to join the two DataFrames on id
675
- # # result = pd.merge(subset, df2b, on='symbol2')
676
- # # print(result)
677
- # if value_pathway <= df_len:
678
- # # Define the `text` column for labels and `href` column for links
679
- # # Reset the index
680
- # df16.reset_index(inplace=True)
681
- #
682
- # # Replace hyphens with spaces in the 'text' column
683
- # df16['Pathway'] = df16['Pathway'].str.replace('-', ' ')
684
- #
685
- # # Set the 'text' column back as the index
686
- # df16.set_index('Pathway', inplace=True)
687
- # df16['text'] = df16.index
688
- # df16['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
689
- # '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df16['text']]
690
- # df16['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df16['text']]
691
- # assert isinstance(df16, object)
692
- # df16['database'] = database_name
693
- #
694
- # # df11['name'] = [c for c in result['Approved name']]
695
- #
696
- # # Create the treemap using `px.treemap`
697
- # fig = px.treemap(df16, path=[df16['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
698
- # hover_name=(df9.head(value_pathway)['SIMILARITY']))
699
- #
700
- # fig.update(layout_coloraxis_showscale=False)
701
- # fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
702
- # fig.update_annotations(visible=False)
703
- # fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
704
- # hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
705
- # texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
706
- # "<a href='%{customdata[0]}'>PubMed"
707
- # "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
708
- # "</span></a>")
709
- # fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["FloralWhite"])
710
- # # # display the treemap in Streamlit
711
- # # with treemap2:
712
- #
713
- # # st.pyplot(fig2)
714
- # st.plotly_chart(fig, use_container_width=True)
715
- #
716
- # st.caption("Pathway designation and database provided by KEGG: https://www.genome.jp/kegg/pathway.html")
717
- #
718
- # csv = df1.head(value_pathway).to_csv().encode('utf-8')
719
- # st.download_button(label=f"download top {value_pathway} pathways (csv)", data=csv,
720
- # file_name=f'{database_name}_pathways.csv', mime='text/csv')
721
- #
722
- #
723
- # else:
724
- # st.warning(
725
- # f"This selection exceeds the number of similar pathways related to {query} within the {database_name} corpus, please choose a lower number")
726
- # st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
727
 
728
- st.markdown("---")
729
- # print()
730
- # print("Human genes similar to " + str(query))
731
- df1 = table.copy()
732
- df2 = pd.read_csv('phytochemicals.csv')
733
- m = df1.Word.isin(df2.phyto)
734
- df1 = df1[m]
735
- df1.rename(columns={'Word': 'Phytochemical'}, inplace=True)
736
- df_len = len(df1)
737
- # print(len(df1))
738
- # df1["Human Gene"] = df1["Human Gene"].str.upper()
739
- # print(df1.head(50))
740
- # print()
741
- # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
742
- # time.sleep(2)
743
- # Create the slider with increments of 5 up to 100
744
-
745
- # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
746
- value_phyto = min(df1.shape[0], 100)
747
-
748
- # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
749
- # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
750
- # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
751
- # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
752
- # unsafe_allow_html=True)
753
-
754
- st.markdown(
755
- f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_phyto} "
756
- f"</span>Phytochemicals contextually and semantically similar to "
757
- f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
758
- f"Click on the squares to expand and also the Pubmed and Wikipedia links for more compound information</span></p></b>",
759
- unsafe_allow_html=True)
760
-
761
- df15 = df1.head(value_phyto).copy()
762
-
763
- df15.index = (1 / df15.index) * 10000
764
- sizes = df15.index.tolist()
765
-
766
- df15.set_index('Phytochemical', inplace=True)
767
-
768
- df8 = df1.copy()
769
- # print(df4.head(10))
770
- df8["SIMILARITY"] = 'Similarity Score ' + df8.head(value_phyto)["SIMILARITY"].round(2).astype(str)
771
- df8.reset_index(inplace=True)
772
- # df4 = df4.rename(columns={'Protein': 'symbol2'})
773
- # print(df4)
774
- # # Use df.query to get a subset of df1 based on ids in df2
775
- # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
776
- # # Use merge to join the two DataFrames on id
777
- # result = pd.merge(subset, df2b, on='symbol2')
778
- # print(result)
779
- if value_phyto <= df_len:
780
- # Define the `text` column for labels and `href` column for links
781
- # Reset the index
782
- df15.reset_index(inplace=True)
783
-
784
- # Replace hyphens with spaces in the 'text' column
785
- df15['Phytochemical'] = df15['Phytochemical'].str.replace('-', ' ')
786
-
787
- # Set the 'text' column back as the index
788
  df15.set_index('Phytochemical', inplace=True)
789
- df15['text'] = df15.index
790
- df15['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
791
- '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df15['text']]
792
- df15['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df15['text']]
793
- assert isinstance(df15, object)
794
- df15['database'] = database_name
795
-
796
- # df11['name'] = [c for c in result['Approved name']]
797
-
798
- # Create the treemap using `px.treemap`
799
- fig = px.treemap(df15, path=[df15['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
800
- hover_name=(df8.head(value_phyto)['SIMILARITY']))
801
-
802
- fig.update(layout_coloraxis_showscale=False)
803
- fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
804
- fig.update_annotations(visible=False)
805
- fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
806
- hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
807
- texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
808
- "<a href='%{customdata[0]}'>PubMed"
809
- "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
810
- "</span></a>")
811
- fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightSeaGreen"])
812
- # # display the treemap in Streamlit
813
- # with treemap2:
814
-
815
- # st.pyplot(fig2)
816
- st.plotly_chart(fig, use_container_width=True)
817
-
818
- st.caption("Phytochemical designation and database provided by PhytoHub: https://phytohub.eu/")
819
-
820
- csv = df1.head(value_phyto).to_csv().encode('utf-8')
821
- st.download_button(label=f"download top {value_phyto} phytochemicals (csv)", data=csv,
822
- file_name=f'{database_name}_phytochemicals.csv', mime='text/csv')
823
-
824
-
825
- else:
826
- st.warning(
827
- f"This selection exceeds the number of similar pythochemicals related to {query} within the {database_name} corpus, please choose a lower number")
828
- st.markdown("---")
829
-
830
- # print()
831
- # print("Human genes similar to " + str(query))
832
- df1 = table.copy()
833
- df2 = pd.read_csv('kegg_compounds_lowercase.csv')
834
- m = df1.Word.isin(df2.compound)
835
- df1 = df1[m]
836
- df1.rename(columns={'Word': 'Compounds'}, inplace=True)
837
- df_len = len(df1)
838
- # df1["Human Gene"] = df1["Human Gene"].str.upper()
839
- # print(df1.head(50))
840
- # print()
841
- # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
842
- # time.sleep(2)
843
- # Create the slider with increments of 5 up to 100
844
-
845
- # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
846
- value_compound = min(df1.shape[0], 100)
847
-
848
- # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
849
- # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
850
- # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
851
- # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
852
- # unsafe_allow_html=True)
853
-
854
- st.markdown(
855
- f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_compound} "
856
- f"</span>Compounds contextually and semantically similar to "
857
- f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
858
- f"Click on the squares to expand and the Pubmed, Wikipedia, and KEGG links for more compound information (may take time to load)</span></p></b>",
859
- unsafe_allow_html=True)
860
-
861
- df12 = df1.head(value_compound).copy()
862
-
863
- df12.index = (1 / df12.index) * 10000
864
- sizes = df12.index.tolist()
865
-
866
- df12.set_index('Compounds', inplace=True)
867
-
868
- df5 = df1.copy()
869
- # print(df4.head(10))
870
- df5["SIMILARITY"] = 'Similarity Score ' + df5.head(value_compound)["SIMILARITY"].round(2).astype(str)
871
- df5.reset_index(inplace=True)
872
- # df4 = df4.rename(columns={'Protein': 'symbol2'})
873
- # print(df4)
874
- # # Use df.query to get a subset of df1 based on ids in df2
875
- # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
876
- # # Use merge to join the two DataFrames on id
877
- # result = pd.merge(subset, df2b, on='symbol2')
878
- # print(result)
879
-
880
- if value_compound <= df_len:
881
- # Define the `text` column for labels and `href` column for links
882
- # Reset the index
883
- df12.reset_index(inplace=True)
884
-
885
- # Replace hyphens with spaces in the 'text' column
886
- df12['Compounds'] = df12['Compounds'].str.replace('-', ' ')
887
-
888
- # Set the 'text' column back as the index
889
- df12.set_index('Compounds', inplace=True)
890
- df12['text'] = df12.index
891
- df12['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
892
- '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df12['text']]
893
- df12['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df12['text']]
894
- df12['href3'] = [f'https://www.genome.jp/entry/{compound_id}' for compound_id in get_compound_ids(df12['text'])]
895
- assert isinstance(df12, object)
896
- df12['database'] = database_name
897
-
898
- # df11['name'] = [c for c in result['Approved name']]
899
-
900
- # Create the treemap using `px.treemap`
901
- fig = px.treemap(df12, path=[df12['text']], values=sizes,
902
- custom_data=['href', 'database', 'href2', 'text', 'href3'],
903
- hover_name=(df5.head(value_compound)['SIMILARITY']))
904
-
905
- fig.update(layout_coloraxis_showscale=False)
906
- fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
907
- fig.update_annotations(visible=False)
908
- fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
909
- hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
910
- texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
911
- "<a href='%{customdata[0]}'>PubMed"
912
- "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
913
- "</a><br><br><a href='%{customdata[4]}'>KEGG Compound Page"
914
- "</span></a>")
915
-
916
- fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightYellow"])
917
- # # display the treemap in Streamlit
918
- # with treemap2:
919
-
920
- # st.pyplot(fig2)
921
- st.plotly_chart(fig, use_container_width=True)
922
-
923
- st.caption("Compound designation and database provided by KEGG: https://www.kegg.jp/kegg/compound/")
924
-
925
- csv = df1.head(value_compound).to_csv().encode('utf-8')
926
- st.download_button(label=f"download top {value_compound} compounds (csv)", data=csv,
927
- file_name=f'{database_name}_compounds.csv', mime='text/csv')
928
-
929
-
930
- else:
931
- st.warning(
932
- f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
933
- st.markdown("---")
934
-
935
- # import os
936
-
937
- # from datasets import Dataset
938
-
939
- # # Check if the comments directory exists
940
- # if os.path.exists('comments'):
941
- # # Load the dataset from disk
942
- # dataset = Dataset.load_from_disk('comments')
943
- # else:
944
- # # Create a new dataset
945
- # dataset = Dataset.from_dict({'id': [], 'text': []})
946
-
947
- # def save_comment(comment):
948
- # # Check if the dataset exists
949
- # if os.path.exists('comments'):
950
- # dataset = Dataset.load_from_disk('comments')
951
- # else:
952
- # dataset = Dataset.from_dict({'id': [], 'text': []})
953
 
954
- # # Append the new comment to the dataset
955
- # new_comment = {'id': len(dataset), 'text': comment}
956
- # dataset = dataset.concatenate(Dataset.from_dict(new_comment))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
957
 
958
- # # Save the dataset to disk
959
- # dataset.save_to_disk('comments')
960
-
961
- # print('Comment saved to dataset.')
962
-
963
- # st.title("Abstractalytics Web App")
964
- # st.write("We appreciate your feedback!")
965
-
966
- # user_comment = st.text_area("Please send us your anonymous remarks/suggestions about the Abstractalytics Web App: "
967
- # "(app will pause while we save your comments)")
968
-
969
- # if st.button("Submit"):
970
- # if user_comment:
971
- # save_comment(user_comment)
972
- # st.success("Your comment has been saved. Thank you for your feedback!")
973
- # else:
974
- # st.warning("Please enter a comment before submitting.")
975
-
976
- # # Load the comments dataset from disk
977
- # if os.path.exists('comments'):
978
- # dataset = Dataset.load_from_disk('comments')
979
- # else:
980
- # dataset = Dataset.from_dict({'id': [], 'text': []})
981
-
982
- # # Access the text column of the dataset
983
- # comments = dataset['text']
984
-
985
- # # Define the password
986
- # PASSWORD = 'ram100pass'
987
-
988
- # # Prompt the user for the password
989
- # password = st.text_input('Password:', type='password')
990
-
991
- # # Display the comments if the password is correct
992
- # if password == PASSWORD:
993
- # st.title('Comments')
994
- # for comment in comments:
995
- # st.write(comment)
996
- # else:
997
- # st.warning('Incorrect password')
998
 
999
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1000
 
1001
  st.subheader("Cancer-related videos")
1002
  if query:
 
76
  # # If the password is correct, show the app content
77
  # if authenticate(password):
78
  opt = st.sidebar.radio("Select a PubMed Corpus", options=('Breast Cancer corpus', 'Lung Cancer corpus',
79
+ 'Skin Cancer corpus', 'Colorectal Cancer corpus',
80
+ 'Prostate Cancer corpus'))
81
  # if opt == "Clotting corpus":
82
  # model_used = ("pubmed_model_clotting")
83
  # num_abstracts = 45493
 
106
  model_used = ("prostate_cancer_pubmed_model")
107
  num_abstracts = 89782
108
  database_name = "Prostate_cancer"
109
+ if opt == "Skin Cancer corpus":
110
+ model_used = ("skin_cancer_pubmed_model")
111
+ num_abstracts = 176568
112
+ database_name = "Skin_cancer"
113
 
114
  st.header(f":blue[{database_name} Pubmed corpus.]")
115
  text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
 
131
  bar.progress((i + 1) * 10)
132
  time.sleep(.1)
133
 
134
+ try:
135
+ model = Word2Vec.load(f"{model_used}") # you can continue training with the loaded model!
136
+ words = list(model.wv.key_to_index)
137
+ X = model.wv[model.wv.key_to_index]
138
+ # print(model.wv['bfgf'])
139
+ model2 = model.wv[query]
140
+ # print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
141
+ df = pd.DataFrame(X)
142
 
143
 
144
+ def get_compound_ids(compound_names):
145
+ with concurrent.futures.ThreadPoolExecutor() as executor:
146
+ compound_ids = list(executor.map(get_compound_id, compound_names))
147
+ return compound_ids
148
 
149
 
150
+ import requests
151
 
152
 
153
+ def get_compound_id(compound_name):
154
+ url = f"http://rest.kegg.jp/find/compound/{compound_name}"
155
+ response = requests.get(url)
156
+ if response.status_code == 200:
157
+ result = response.text.split('\n')
158
+ if result[0]:
159
+ compound_id = result[0].split('\t')[0]
160
+ return compound_id
161
+ return None
162
 
163
 
164
  # except:
165
  # st.error("Term occurrence is too low - please try another term")
166
  # st.stop()
167
+ st.markdown("---")
168
 
169
+ try:
170
+ table = model.wv.most_similar_cosmul(query, topn=10000)
171
+ table = (pd.DataFrame(table))
172
+ table.index.name = 'Rank'
173
+ table.columns = ['Word', 'SIMILARITY']
174
 
175
+ pd.set_option('display.max_rows', None)
176
+ table2 = table.copy()
177
 
 
 
 
 
 
178
 
 
 
179
 
180
+ # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
181
+ # f"<span style='color:red; font-style: italic;'>words</span> contextually "
182
+ # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
183
+ # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
184
+ # unsafe_allow_html=True)
 
185
 
186
+ # Set the max number of words to display
187
+ value_word = min(100, len(table2))
 
 
188
 
189
+ st.markdown(
190
+ f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
191
+ f"</span>words contextually and semantically similar to "
192
+ f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
193
+ f"Click on the squares to expand and also the PubMed and Wikipedia links for more word information</span></p></b>",
194
+ unsafe_allow_html=True)
195
 
196
+ short_table = table2.head(value_word).round(2)
197
+ short_table.index += 1
198
+ short_table.index = (1 / short_table.index) * 10
199
+ sizes = short_table.index.tolist()
200
+
201
+ short_table.set_index('Word', inplace=True)
202
+ table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
203
+ rank_num = list(short_table.index.tolist())
204
+
205
+ df = short_table
206
+
207
+
208
+ df['text'] = short_table.index
209
+ df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
210
  '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
211
+ df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
212
 
213
+ df.loc[:, 'database'] = database_name
214
 
215
+ fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
216
+ hover_name=(table2.head(value_word)['SIMILARITY']))
217
 
218
+ fig.update(layout_coloraxis_showscale=False)
219
+ fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
220
+ fig.update_annotations(visible=False)
221
+ fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
222
  hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="<br><span "
223
  "style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
224
  "<a href='%{customdata[0]}'>PubMed"
225
  "</a><br><br><a href='%{customdata[3]}'>Wikipedia"
226
  "</span></a>")
227
+ fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
228
 
229
+ # st.pyplot(fig2)
230
+ st.plotly_chart(fig, use_container_width=True)
231
 
232
+ # st.caption(
233
+ # "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
234
+ # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
235
 
236
+ csv = table2.head(value_word).to_csv().encode('utf-8')
237
+ st.download_button(label=f"download top {value_word} words (csv)", data=csv,
238
  file_name=f'{database_name}_words.csv', mime='text/csv')
 
 
 
239
 
240
+ except:
241
+ st.warning(
242
+ f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
243
+ except KeyError:
244
+ st.warning(
245
+ "This word is not found in the corpus, it could be because it is not spelled correctly or could be that it does not have enough representation within the corpus, please try again")
246
  # st.markdown("---")
247
  # # st.write(short_table)
248
  # #
 
346
 
347
  st.markdown("---")
348
 
349
+ try:
350
+ df1 = table.copy()
351
+ df2 = pd.read_csv('Human Genes.csv')
352
+ m = df1.Word.isin(df2.symbol)
353
+ df1 = df1[m]
354
+ df1.rename(columns={'Word': 'Genes'}, inplace=True)
355
+ df_len = len(df1)
356
+ print(len(df1))
357
+
358
+ # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
359
+ # f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
360
+ # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
361
+ # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
362
+ # unsafe_allow_html=True)
363
+
364
+ # Set the number of proteins to display
365
+ value_gene = min(df_len, 100)
366
+
367
+ st.markdown(
368
+ f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
369
+ f"</span>human genes contextually and semantically similar to "
370
+ f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name} </span>corpus. Click on the squares to expand and also the Pubmed and GeneCard links for more gene information</span></p></b>",
371
+ unsafe_allow_html=True)
372
+
373
+ df11 = df1.head(value_gene).copy()
374
+
375
+ df11.index = (1 / df11.index) * 10000
376
+ sizes = df11.index.tolist()
377
+
378
+ df11.set_index('Genes', inplace=True)
379
+
380
+ df4 = df1.copy()
381
+ # print(df4.head(10))
382
+ df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_gene)["SIMILARITY"].round(2).astype(str)
383
+ df4.reset_index(inplace=True)
384
+ # df4 = df4.rename(columns={'Protein': 'symbol2'})
385
+ # print(df4)
386
+ # # Use df.query to get a subset of df1 based on ids in df2
387
+ # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
388
+ # # Use merge to join the two DataFrames on id
389
+ # result = pd.merge(subset, df2b, on='symbol2')
390
+ # print(result)
391
+ if value_gene <= df_len:
392
+ # Define the `text` column for labels and `href` column for links
393
+ df11['text'] = df11.index
394
+ df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
395
+ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
396
+ df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
397
+ assert isinstance(df11, object)
398
+ df11['database'] = database_name
399
+
400
+ # df11['name'] = [c for c in result['Approved name']]
401
+
402
+ # Create the treemap using `px.treemap`
403
+ fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
404
+ hover_name=(df4.head(value_gene)['SIMILARITY']))
405
+
406
+ fig.update(layout_coloraxis_showscale=False)
407
+ fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
408
+ fig.update_annotations(visible=False)
409
+ fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
410
+ hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
411
+ texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
412
+ "<a href='%{customdata[0]}'>PubMed"
413
+ "</a><br><br><a href='%{customdata[2]}'>GeneCard"
414
+ "</span></a>")
415
+ fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
416
+ # # display the treemap in Streamlit
417
+ # with treemap2:
418
+
419
+ # st.pyplot(fig2)
420
+ st.plotly_chart(fig, use_container_width=True)
421
+
422
+ # st.caption(
423
+ # "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
424
+ # st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
425
+ st.caption(
426
+ "Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
427
+ st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
428
+ st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
429
+
430
+ csv = df1.head(value_gene).to_csv().encode('utf-8')
431
+ st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
432
+ file_name=f'{database_name}_genes.csv', mime='text/csv')
433
+
434
+
435
+ else:
436
+ st.warning(
437
+ f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
438
+ st.markdown("---")
439
+ # print()
440
+ # print("Human genes similar to " + str(query))
441
+ df1 = table.copy()
442
+ df2 = pd.read_csv('kegg_drug_list_lowercase.csv')
443
+ m = df1.Word.isin(df2.drugs)
444
+ df1 = df1[m]
445
+ df1.rename(columns={'Word': 'Drugs'}, inplace=True)
446
+ df_len = len(df1)
447
+ # print(len(df1))
448
+ # df1["Human Gene"] = df1["Human Gene"].str.upper()
449
+ # print(df1.head(50))
450
+ # print()
451
+ # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
452
+ # time.sleep(2)
453
+ # Create the slider with increments of 5 up to 100
454
+
455
+ # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
456
+ value_drug = min(df1.shape[0], 100)
457
+
458
+ # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
459
+ # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
460
+ # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
461
+ # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
462
+ # unsafe_allow_html=True)
463
+
464
+ st.markdown(
465
+ f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_drug} "
466
+ f"</span>Drugs contextually and semantically similar to "
467
+ f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
468
+ unsafe_allow_html=True)
469
+
470
+ df13 = df1.head(value_drug).copy()
471
+
472
+ df13.index = (1 / df13.index) * 10000
473
+ sizes = df13.index.tolist()
474
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
475
  df13.set_index('Drugs', inplace=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
476
 
477
+ df6 = df1.copy()
478
+ # print(df4.head(10))
479
+ df6["SIMILARITY"] = 'Similarity Score ' + df6.head(value_drug)["SIMILARITY"].round(2).astype(str)
480
+ df6.reset_index(inplace=True)
481
+ # df4 = df4.rename(columns={'Protein': 'symbol2'})
482
+ # print(df4)
483
+ # # Use df.query to get a subset of df1 based on ids in df2
484
+ # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
485
+ # # Use merge to join the two DataFrames on id
486
+ # result = pd.merge(subset, df2b, on='symbol2')
487
+ # print(result)
488
+ if value_drug <= df_len:
489
+ # Define the `text` column for labels and `href` column for links
490
+ # Reset the index
491
+ df13.reset_index(inplace=True)
492
+
493
+ # Replace hyphens with spaces in the 'text' column
494
+ df13['Drugs'] = df13['Drugs'].str.replace('-', ' ')
495
+
496
+ # Set the 'text' column back as the index
497
+ df13.set_index('Drugs', inplace=True)
498
+ df13['text'] = df13.index
499
+ df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
500
+ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
501
+ df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
502
+ assert isinstance(df13, object)
503
+ df13['database'] = database_name
504
+
505
+ # df11['name'] = [c for c in result['Approved name']]
506
+
507
+ # Create the treemap using `px.treemap`
508
+ fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
509
+ hover_name=(df6.head(value_drug)['SIMILARITY']))
510
+
511
+ fig.update(layout_coloraxis_showscale=False)
512
+ fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
513
+ fig.update_annotations(visible=False)
514
+ fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
515
+ hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
516
+ texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
517
+ "<a href='%{customdata[0]}'>PubMed"
518
+ "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
519
+ "</span></a>")
520
+ fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
521
+ # # display the treemap in Streamlit
522
+ # with treemap2:
523
+
524
+ # st.pyplot(fig2)
525
+ st.plotly_chart(fig, use_container_width=True)
526
+
527
+ st.caption("Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
528
+
529
+ csv = df1.head(value_drug).to_csv().encode('utf-8')
530
+ st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
531
+ file_name=f'{database_name}_drugs.csv', mime='text/csv')
532
+
533
+
534
+ else:
535
+ st.warning(
536
+ f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
537
+ st.markdown("---")
538
+ #
539
+ # st.markdown("---")
540
+ # # print()
541
+ # # print("Human genes similar to " + str(query))
542
+ # df1 = table.copy()
543
+ # df2 = pd.read_csv('diseasesKegg.csv')
544
+ # m = df1.Word.isin(df2.disease)
545
+ # df1 = df1[m]
546
+ # df1.rename(columns={'Word': 'Disease'}, inplace=True)
547
+ # df_len = len(df1)
548
+ # # print(len(df1))
549
+ # # df1["Human Gene"] = df1["Human Gene"].str.upper()
550
+ # # print(df1.head(50))
551
+ # # print()
552
+ # # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
553
+ # # time.sleep(2)
554
+ # # Create the slider with increments of 5 up to 100
555
+ #
556
+ # # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
557
+ # value_disease = min(df1.shape[0], 100)
558
+ #
559
+ # # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
560
+ # # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
561
+ # # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
562
+ # # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
563
+ # # unsafe_allow_html=True)
564
+ #
565
+ # st.markdown(
566
+ # f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_disease} "
567
+ # f"</span>Diseases contextually and semantically similar to "
568
+ # f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
569
+ # unsafe_allow_html=True)
570
+ #
571
+ # df14 = df1.head(value_disease).copy()
572
+ #
573
+ # df14.index = (1 / df14.index) * 10000
574
+ # sizes = df14.index.tolist()
575
+ #
576
+ # df14.set_index('Disease', inplace=True)
577
+ #
578
+ # df7 = df1.copy()
579
+ # # print(df4.head(10))
580
+ # df7["SIMILARITY"] = 'Similarity Score ' + df7.head(value_disease)["SIMILARITY"].round(2).astype(str)
581
+ # df7.reset_index(inplace=True)
582
+ # # df4 = df4.rename(columns={'Protein': 'symbol2'})
583
+ # # print(df4)
584
+ # # # Use df.query to get a subset of df1 based on ids in df2
585
+ # # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
586
+ # # # Use merge to join the two DataFrames on id
587
+ # # result = pd.merge(subset, df2b, on='symbol2')
588
+ # # print(result)
589
+ # if value_disease <= df_len:
590
+ # # Define the `text` column for labels and `href` column for links
591
+ # # Reset the index
592
+ # df14.reset_index(inplace=True)
593
+ #
594
+ # # Replace hyphens with spaces in the 'text' column
595
+ # df14['Disease'] = df14['Disease'].str.replace('-', ' ')
596
+ #
597
+ # # Set the 'text' column back as the index
598
+ # df14.set_index('Disease', inplace=True)
599
+ # df14['text'] = df14.index
600
+ # df14['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
601
+ # '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df14['text']]
602
+ # df14['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df14['text']]
603
+ # assert isinstance(df14, object)
604
+ # df14['database'] = database_name
605
+ #
606
+ # # df11['name'] = [c for c in result['Approved name']]
607
+ #
608
+ # # Create the treemap using `px.treemap`
609
+ # fig = px.treemap(df14, path=[df14['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
610
+ # hover_name=(df7.head(value_disease)['SIMILARITY']))
611
+ #
612
+ # fig.update(layout_coloraxis_showscale=False)
613
+ # fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
614
+ # fig.update_annotations(visible=False)
615
+ # fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
616
+ # hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
617
+ # texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
618
+ # "<a href='%{customdata[0]}'>PubMed"
619
+ # "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
620
+ # "</span></a>")
621
+ # fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["PaleGoldenRod"])
622
+ # # # display the treemap in Streamlit
623
+ # # with treemap2:
624
+ #
625
+ # # st.pyplot(fig2)
626
+ # st.plotly_chart(fig, use_container_width=True)
627
+ #
628
+ # st.caption("Disease designation and database provided by KEGG: https://www.genome.jp/kegg/disease/")
629
+ #
630
+ # csv = df1.head(value_disease).to_csv().encode('utf-8')
631
+ # st.download_button(label=f"download top {value_disease} diseases (csv)", data=csv,
632
+ # file_name=f'{database_name}_disease.csv', mime='text/csv')
633
+ #
634
+ #
635
+ # else:
636
+ # st.warning(
637
+ # f"This selection exceeds the number of similar diseases related to {query} within the {database_name} corpus, please choose a lower number")
638
+ # st.markdown("---")
639
+
640
+ # st.markdown("---")
641
+ # # print()
642
+ # # print("Human genes similar to " + str(query))
643
+ # df1 = table.copy()
644
+ # df2 = pd.read_csv('pathwaysKegg.csv')
645
+ # m = df1.Word.isin(df2.pathway)
646
+ # df1 = df1[m]
647
+ # df1.rename(columns={'Word': 'Pathway'}, inplace=True)
648
+ # df_len = len(df1)
649
+ # # print(len(df1))
650
+ # # df1["Human Gene"] = df1["Human Gene"].str.upper()
651
+ # # print(df1.head(50))
652
+ # # print()
653
+ # # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
654
+ # # time.sleep(2)
655
+ # # Create the slider with increments of 5 up to 100
656
+ #
657
+ # # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
658
+ # value_pathway = min(df1.shape[0], 100)
659
+ #
660
+ # # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
661
+ # # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
662
+ # # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
663
+ # # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
664
+ # # unsafe_allow_html=True)
665
+ #
666
+ # st.markdown(
667
+ # f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_pathway} "
668
+ # f"</span>Pathways contextually and semantically similar to "
669
+ # f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
670
+ # unsafe_allow_html=True)
671
+ #
672
+ # df16 = df1.head(value_pathway).copy()
673
+ #
674
+ # df16.index = (1 / df16.index) * 10000
675
+ # sizes = df16.index.tolist()
676
+ #
677
+ # df16.set_index('Pathway', inplace=True)
678
+ #
679
+ # df9 = df1.copy()
680
+ # # print(df4.head(10))
681
+ # df9["SIMILARITY"] = 'Similarity Score ' + df9.head(value_pathway)["SIMILARITY"].round(2).astype(str)
682
+ # df9.reset_index(inplace=True)
683
+ # # df4 = df4.rename(columns={'Protein': 'symbol2'})
684
+ # # print(df4)
685
+ # # # Use df.query to get a subset of df1 based on ids in df2
686
# NOTE(review): a disabled "KEGG pathway treemap" section (mirroring the
# phytochemical and compound treemaps below, built from `df16`/`df9` and
# gated on `value_pathway <= df_len`) previously lived here as ~100 lines
# of commented-out code, further corrupted by diff artifacts ('+' markers
# and interleaved line numbers). Removed as dead code; recover it from
# version control if the pathway view is reinstated.
# ------------------------------------------------------------------
# Phytochemical treemap: phytochemicals (per the PhytoHub database)
# that are contextually/semantically similar to the query term within
# the selected PubMed corpus.
# NOTE(review): reconstructed from a diff-mangled revision ('+' markers
# and interleaved line numbers stripped); dead commented-out code and a
# no-op `assert isinstance(df15, object)` removed; user-facing typo
# "pythochemicals" fixed.
# ------------------------------------------------------------------
st.markdown("---")

# Restrict the similarity table to rows whose word is a known phytochemical.
df1 = table.copy()
df2 = pd.read_csv('phytochemicals.csv')
m = df1.Word.isin(df2.phyto)
df1 = df1[m]
df1.rename(columns={'Word': 'Phytochemical'}, inplace=True)
df_len = len(df1)

# Show at most 100 entries (fewer if the corpus match is smaller).
value_phyto = min(df1.shape[0], 100)

st.markdown(
    f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_phyto} "
    f"</span>Phytochemicals contextually and semantically similar to "
    f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
    f"Click on the squares to expand and also the Pubmed and Wikipedia links for more compound information</span></p></b>",
    unsafe_allow_html=True)

df15 = df1.head(value_phyto).copy()

# The index is the similarity rank; invert it so higher-ranked
# phytochemicals get larger treemap tiles.
# NOTE(review): assumes the rank index is 1-based and never contains 0
# (division by zero otherwise) — TODO confirm against `table`'s builder.
df15.index = (1 / df15.index) * 10000
sizes = df15.index.tolist()

df15.set_index('Phytochemical', inplace=True)

# Human-readable similarity labels for the treemap hover text.
df8 = df1.copy()
df8["SIMILARITY"] = 'Similarity Score ' + df8.head(value_phyto)["SIMILARITY"].round(2).astype(str)
df8.reset_index(inplace=True)

# Always true given the clamp above; kept as a guard against future
# changes to how `value_phyto` is chosen.
if value_phyto <= df_len:
    # Replace hyphens with spaces so tile labels read naturally.
    df15.reset_index(inplace=True)
    df15['Phytochemical'] = df15['Phytochemical'].str.replace('-', ' ')
    df15.set_index('Phytochemical', inplace=True)

    # Per-tile label plus PubMed and Wikipedia links.
    df15['text'] = df15.index
    df15['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                    '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df15['text']]
    df15['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in df15['text']]
    df15['database'] = database_name

    # Treemap of the top phytochemicals, tile size proportional to rank.
    fig = px.treemap(df15, path=[df15['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
                     hover_name=(df8.head(value_phyto)['SIMILARITY']))

    fig.update(layout_coloraxis_showscale=False)
    fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
    fig.update_annotations(visible=False)
    fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                      hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                      texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
                                   "<a href='%{customdata[0]}'>PubMed"
                                   "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
                                   "</span></a>")
    fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightSeaGreen"])

    st.plotly_chart(fig, use_container_width=True)

    st.caption("Phytochemical designation and database provided by PhytoHub: https://phytohub.eu/")

    # Offer the displayed rows as a CSV download.
    csv = df1.head(value_phyto).to_csv().encode('utf-8')
    st.download_button(label=f"download top {value_phyto} phytochemicals (csv)", data=csv,
                       file_name=f'{database_name}_phytochemicals.csv', mime='text/csv')

else:
    st.warning(
        f"This selection exceeds the number of similar phytochemicals related to {query} within the {database_name} corpus, please choose a lower number")
st.markdown("---")
# ------------------------------------------------------------------
# KEGG compound treemap: compounds (per the KEGG compound database)
# that are contextually/semantically similar to the query term within
# the selected PubMed corpus.
# NOTE(review): reconstructed from a diff-mangled revision ('+' markers
# and interleaved line numbers stripped); dead commented-out code and a
# no-op `assert isinstance(df12, object)` removed; the warning message
# wrongly said "similar proteins" in this compounds section — fixed.
# ------------------------------------------------------------------

# Restrict the similarity table to rows whose word is a known KEGG compound.
df1 = table.copy()
df2 = pd.read_csv('kegg_compounds_lowercase.csv')
m = df1.Word.isin(df2.compound)
df1 = df1[m]
df1.rename(columns={'Word': 'Compounds'}, inplace=True)
df_len = len(df1)

# Show at most 100 entries (fewer if the corpus match is smaller).
value_compound = min(df1.shape[0], 100)

st.markdown(
    f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_compound} "
    f"</span>Compounds contextually and semantically similar to "
    f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
    f"Click on the squares to expand and the Pubmed, Wikipedia, and KEGG links for more compound information (may take time to load)</span></p></b>",
    unsafe_allow_html=True)

df12 = df1.head(value_compound).copy()

# The index is the similarity rank; invert it so higher-ranked compounds
# get larger treemap tiles.
# NOTE(review): assumes the rank index is 1-based and never contains 0
# (division by zero otherwise) — TODO confirm against `table`'s builder.
df12.index = (1 / df12.index) * 10000
sizes = df12.index.tolist()

df12.set_index('Compounds', inplace=True)

# Human-readable similarity labels for the treemap hover text.
df5 = df1.copy()
df5["SIMILARITY"] = 'Similarity Score ' + df5.head(value_compound)["SIMILARITY"].round(2).astype(str)
df5.reset_index(inplace=True)

# Always true given the clamp above; kept as a guard against future
# changes to how `value_compound` is chosen.
if value_compound <= df_len:
    # Replace hyphens with spaces so tile labels read naturally.
    df12.reset_index(inplace=True)
    df12['Compounds'] = df12['Compounds'].str.replace('-', ' ')
    df12.set_index('Compounds', inplace=True)

    # Per-tile label plus PubMed, Wikipedia, and KEGG links.
    df12['text'] = df12.index
    df12['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                    '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df12['text']]
    df12['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in df12['text']]
    # One KEGG REST lookup per compound (network-bound; this is the
    # "may take time to load" warned about in the header above).
    df12['href3'] = [f'https://www.genome.jp/entry/{compound_id}' for compound_id in
                     get_compound_ids(df12['text'])]
    df12['database'] = database_name

    # Treemap of the top compounds, tile size proportional to rank.
    fig = px.treemap(df12, path=[df12['text']], values=sizes,
                     custom_data=['href', 'database', 'href2', 'text', 'href3'],
                     hover_name=(df5.head(value_compound)['SIMILARITY']))

    fig.update(layout_coloraxis_showscale=False)
    fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
    fig.update_annotations(visible=False)
    fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                      hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                      texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
                                   "<a href='%{customdata[0]}'>PubMed"
                                   "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
                                   "</a><br><br><a href='%{customdata[4]}'>KEGG Compound Page"
                                   "</span></a>")

    fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightYellow"])

    st.plotly_chart(fig, use_container_width=True)

    st.caption("Compound designation and database provided by KEGG: https://www.kegg.jp/kegg/compound/")

    # Offer the displayed rows as a CSV download.
    csv = df1.head(value_compound).to_csv().encode('utf-8')
    st.download_button(label=f"download top {value_compound} compounds (csv)", data=csv,
                       file_name=f'{database_name}_compounds.csv', mime='text/csv')

else:
    st.warning(
        f"This selection exceeds the number of similar compounds related to {query} within the {database_name} corpus, please choose a lower number")
st.markdown("---")
# NOTE(review): a disabled anonymous-feedback feature was kept here as
# commented-out code (a Hugging Face `datasets`-backed comment store on
# disk under 'comments/', a `save_comment` helper, a Streamlit text
# area + submit button, and a password-gated comment viewer), further
# corrupted by diff artifacts ('+' markers and interleaved line
# numbers). Removed as dead code; recover it from version control if
# the feedback feature is reinstated.
+ st.markdown("---")
1014
+ except:
1015
+ st.warning("")
1016
 
1017
  st.subheader("Cancer-related videos")
1018
  if query: