Runtime error
Update app.py
app.py
CHANGED
@@ -76,7 +76,8 @@ st.markdown("---")
 # # If the password is correct, show the app content
 # if authenticate(password):
 opt = st.sidebar.radio("Select a PubMed Corpus", options=('Breast Cancer corpus', 'Lung Cancer corpus',
-                                                          '
+                                                          'Skin Cancer corpus', 'Colorectal Cancer corpus',
+                                                          'Prostate Cancer corpus'))
 # if opt == "Clotting corpus":
 #     model_used = ("pubmed_model_clotting")
 #     num_abstracts = 45493
@@ -105,6 +106,10 @@ if opt == "Prostate Cancer corpus":
     model_used = ("prostate_cancer_pubmed_model")
     num_abstracts = 89782
     database_name = "Prostate_cancer"
+if opt == "Skin Cancer corpus":
+    model_used = ("skin_cancer_pubmed_model")
+    num_abstracts = 176568
+    database_name = "Skin_cancer"
 
 st.header(f":blue[{database_name} Pubmed corpus.]")
 text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
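A note on the pattern this hunk extends: every corpus is wired up as another `if opt == ...:` block that sets the same three variables (`model_used`, `num_abstracts`, `database_name`). A dictionary keeps those three values together and makes the next corpus a one-line addition. The sketch below is a suggestion, not part of the commit; the values are copied from the diff above.

# Hedged sketch (not in the commit): one lookup table instead of an if-chain.
CORPORA = {
    "Prostate Cancer corpus": ("prostate_cancer_pubmed_model", 89782, "Prostate_cancer"),
    "Skin Cancer corpus": ("skin_cancer_pubmed_model", 176568, "Skin_cancer"),
}

opt = "Skin Cancer corpus"  # stand-in for the st.sidebar.radio(...) selection
model_used, num_abstracts, database_name = CORPORA[opt]
print(model_used, num_abstracts, database_name)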
@@ -126,111 +131,118 @@ if query:
         bar.progress((i + 1) * 10)
         time.sleep(.1)
 
+    try:
+        model = Word2Vec.load(f"{model_used}")  # you can continue training with the loaded model!
+        words = list(model.wv.key_to_index)
+        X = model.wv[model.wv.key_to_index]
+        # print(model.wv['bfgf'])
+        model2 = model.wv[query]
+        # print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
+        df = pd.DataFrame(X)
 
 
+    def get_compound_ids(compound_names):
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            compound_ids = list(executor.map(get_compound_id, compound_names))
+        return compound_ids
 
 
+    import requests
 
 
+    def get_compound_id(compound_name):
+        url = f"http://rest.kegg.jp/find/compound/{compound_name}"
+        response = requests.get(url)
+        if response.status_code == 200:
+            result = response.text.split('\n')
+            if result[0]:
+                compound_id = result[0].split('\t')[0]
+                return compound_id
+        return None
 
 
     # except:
     #     st.error("Term occurrence is too low - please try another term")
     #     st.stop()
+    st.markdown("---")
 
+    try:
+        table = model.wv.most_similar_cosmul(query, topn=10000)
+        table = (pd.DataFrame(table))
+        table.index.name = 'Rank'
+        table.columns = ['Word', 'SIMILARITY']
 
+        pd.set_option('display.max_rows', None)
+        table2 = table.copy()
 
+        # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
+        #             f"<span style='color:red; font-style: italic;'>words</span> contextually "
+        #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+        #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+        #             unsafe_allow_html=True)
 
+        # Set the max number of words to display
+        value_word = min(100, len(table2))
 
+        st.markdown(
+            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
+            f"</span>words contextually and semantically similar to "
+            f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
+            f"Click on the squares to expand and also the PubMed and Wikipedia links for more word information</span></p></b>",
+            unsafe_allow_html=True)
 
+        short_table = table2.head(value_word).round(2)
+        short_table.index += 1
+        short_table.index = (1 / short_table.index) * 10
+        sizes = short_table.index.tolist()
+
+        short_table.set_index('Word', inplace=True)
+        table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
+        rank_num = list(short_table.index.tolist())
+
+        df = short_table
+
+        df['text'] = short_table.index
+        df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
                       '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
+        df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
 
+        df.loc[:, 'database'] = database_name
 
+        fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
+                         hover_name=(table2.head(value_word)['SIMILARITY']))
 
+        fig.update(layout_coloraxis_showscale=False)
+        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        fig.update_annotations(visible=False)
+        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                           hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="<br><span "
                           "style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
                           "<a href='%{customdata[0]}'>PubMed"
                           "</a><br><br><a href='%{customdata[3]}'>Wikipedia"
                           "</span></a>")
+        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
 
+        # st.pyplot(fig2)
+        st.plotly_chart(fig, use_container_width=True)
 
+        # st.caption(
+        #     "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
+        # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
 
+        csv = table2.head(value_word).to_csv().encode('utf-8')
+        st.download_button(label=f"download top {value_word} words (csv)", data=csv,
                            file_name=f'{database_name}_words.csv', mime='text/csv')
 
+    except:
+        st.warning(
+            f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
+    except KeyError:
+        st.warning(
+            "This word is not found in the corpus, it could be because it is not spelled correctly or could be that it does not have enough representation within the corpus, please try again")
     # st.markdown("---")
     # # st.write(short_table)
     # #
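One thing worth flagging in the hunk above: the new error handling puts a bare `except:` ahead of `except KeyError:`. Python requires the default `except:` clause to come last, so this ordering is a SyntaxError and the script will not load at all, which would also explain a Space stuck on a runtime error. A minimal sketch of the same handler with the clauses in a legal order (names as in the app; `Exception` stands in for the bare `except:`):

try:
    table = model.wv.most_similar_cosmul(query, topn=10000)
    ...  # build and render the treemap as in the hunk above
except KeyError:
    # specific exception first: the query never made it into the vocabulary
    st.warning(
        "This word is not found in the corpus, it could be because it is not spelled correctly "
        "or could be that it does not have enough representation within the corpus, please try again")
except Exception:
    # catch-all clause last, as Python requires
    st.warning(
        f"This selection exceeds the number of similar words related to {query} "
        f"within the {database_name} corpus, please choose a lower number")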
@@ -334,669 +346,673 @@ if query:
 
     st.markdown("---")
 
+    try:
+        df1 = table.copy()
+        df2 = pd.read_csv('Human Genes.csv')
+        m = df1.Word.isin(df2.symbol)
+        df1 = df1[m]
+        df1.rename(columns={'Word': 'Genes'}, inplace=True)
+        df_len = len(df1)
+        print(len(df1))
+
+        # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
+        #             f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
+        #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+        #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+        #             unsafe_allow_html=True)
+
+        # Set the number of proteins to display
+        value_gene = min(df_len, 100)
+
+        st.markdown(
+            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
+            f"</span>human genes contextually and semantically similar to "
+            f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name} </span>corpus. Click on the squares to expand and also the Pubmed and GeneCard links for more gene information</span></p></b>",
+            unsafe_allow_html=True)
+
+        df11 = df1.head(value_gene).copy()
+
+        df11.index = (1 / df11.index) * 10000
+        sizes = df11.index.tolist()
+
+        df11.set_index('Genes', inplace=True)
+
+        df4 = df1.copy()
+        # print(df4.head(10))
+        df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_gene)["SIMILARITY"].round(2).astype(str)
+        df4.reset_index(inplace=True)
+        # df4 = df4.rename(columns={'Protein': 'symbol2'})
+        # print(df4)
+        # # Use df.query to get a subset of df1 based on ids in df2
+        # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+        # # Use merge to join the two DataFrames on id
+        # result = pd.merge(subset, df2b, on='symbol2')
+        # print(result)
+        if value_gene <= df_len:
+            # Define the `text` column for labels and `href` column for links
+            df11['text'] = df11.index
+            df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
+            df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
+            assert isinstance(df11, object)
+            df11['database'] = database_name
+
+            # df11['name'] = [c for c in result['Approved name']]
+
+            # Create the treemap using `px.treemap`
+            fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                             hover_name=(df4.head(value_gene)['SIMILARITY']))
+
+            fig.update(layout_coloraxis_showscale=False)
+            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+            fig.update_annotations(visible=False)
+            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                           "<a href='%{customdata[0]}'>PubMed"
+                                           "</a><br><br><a href='%{customdata[2]}'>GeneCard"
+                                           "</span></a>")
+            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
+            # # display the treemap in Streamlit
+            # with treemap2:
+
+            # st.pyplot(fig2)
+            st.plotly_chart(fig, use_container_width=True)
+
+            # st.caption(
+            #     "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
+            # st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
+            st.caption(
+                "Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
+            st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
+            st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
+
+            csv = df1.head(value_gene).to_csv().encode('utf-8')
+            st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
+                               file_name=f'{database_name}_genes.csv', mime='text/csv')
+
+        else:
+            st.warning(
+                f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
+        st.markdown("---")
+        # print()
+        # print("Human genes similar to " + str(query))
+        df1 = table.copy()
+        df2 = pd.read_csv('kegg_drug_list_lowercase.csv')
+        m = df1.Word.isin(df2.drugs)
+        df1 = df1[m]
+        df1.rename(columns={'Word': 'Drugs'}, inplace=True)
+        df_len = len(df1)
+        # print(len(df1))
+        # df1["Human Gene"] = df1["Human Gene"].str.upper()
+        # print(df1.head(50))
+        # print()
+        # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+        # time.sleep(2)
+        # Create the slider with increments of 5 up to 100
+
+        # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+        value_drug = min(df1.shape[0], 100)
+
+        # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+        #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+        #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+        #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+        #             unsafe_allow_html=True)
+
+        st.markdown(
+            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_drug} "
+            f"</span>Drugs contextually and semantically similar to "
+            f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
+            unsafe_allow_html=True)
+
+        df13 = df1.head(value_drug).copy()
+
+        df13.index = (1 / df13.index) * 10000
+        sizes = df13.index.tolist()
+
         df13.set_index('Drugs', inplace=True)
 
+        df6 = df1.copy()
+        # print(df4.head(10))
+        df6["SIMILARITY"] = 'Similarity Score ' + df6.head(value_drug)["SIMILARITY"].round(2).astype(str)
+        df6.reset_index(inplace=True)
+        # df4 = df4.rename(columns={'Protein': 'symbol2'})
+        # print(df4)
+        # # Use df.query to get a subset of df1 based on ids in df2
+        # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+        # # Use merge to join the two DataFrames on id
+        # result = pd.merge(subset, df2b, on='symbol2')
+        # print(result)
+        if value_drug <= df_len:
+            # Define the `text` column for labels and `href` column for links
+            # Reset the index
+            df13.reset_index(inplace=True)
+
+            # Replace hyphens with spaces in the 'text' column
+            df13['Drugs'] = df13['Drugs'].str.replace('-', ' ')
+
+            # Set the 'text' column back as the index
+            df13.set_index('Drugs', inplace=True)
+            df13['text'] = df13.index
+            df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
+            df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
+            assert isinstance(df13, object)
+            df13['database'] = database_name
+
+            # df11['name'] = [c for c in result['Approved name']]
+
+            # Create the treemap using `px.treemap`
+            fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                             hover_name=(df6.head(value_drug)['SIMILARITY']))
+
+            fig.update(layout_coloraxis_showscale=False)
+            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+            fig.update_annotations(visible=False)
+            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                           "<a href='%{customdata[0]}'>PubMed"
+                                           "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+                                           "</span></a>")
+            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
+            # # display the treemap in Streamlit
+            # with treemap2:
+
+            # st.pyplot(fig2)
+            st.plotly_chart(fig, use_container_width=True)
+
+            st.caption("Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
+
+            csv = df1.head(value_drug).to_csv().encode('utf-8')
+            st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
+                               file_name=f'{database_name}_drugs.csv', mime='text/csv')
+
+        else:
+            st.warning(
+                f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
+        st.markdown("---")
+        #
+        # st.markdown("---")
+        # # print()
+        # # print("Human genes similar to " + str(query))
+        # df1 = table.copy()
+        # df2 = pd.read_csv('diseasesKegg.csv')
+        # m = df1.Word.isin(df2.disease)
+        # df1 = df1[m]
+        # df1.rename(columns={'Word': 'Disease'}, inplace=True)
+        # df_len = len(df1)
+        # # print(len(df1))
+        # # df1["Human Gene"] = df1["Human Gene"].str.upper()
+        # # print(df1.head(50))
+        # # print()
+        # # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+        # # time.sleep(2)
+        # # Create the slider with increments of 5 up to 100
+        #
+        # # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+        # value_disease = min(df1.shape[0], 100)
+        #
+        # # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+        # #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+        # #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+        # #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+        # #             unsafe_allow_html=True)
+        #
+        # st.markdown(
+        #     f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_disease} "
+        #     f"</span>Diseases contextually and semantically similar to "
+        #     f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
+        #     unsafe_allow_html=True)
+        #
+        # df14 = df1.head(value_disease).copy()
+        #
+        # df14.index = (1 / df14.index) * 10000
+        # sizes = df14.index.tolist()
+        #
+        # df14.set_index('Disease', inplace=True)
+        #
+        # df7 = df1.copy()
+        # # print(df4.head(10))
+        # df7["SIMILARITY"] = 'Similarity Score ' + df7.head(value_disease)["SIMILARITY"].round(2).astype(str)
+        # df7.reset_index(inplace=True)
+        # # df4 = df4.rename(columns={'Protein': 'symbol2'})
+        # # print(df4)
+        # # # Use df.query to get a subset of df1 based on ids in df2
+        # # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+        # # # Use merge to join the two DataFrames on id
+        # # result = pd.merge(subset, df2b, on='symbol2')
+        # # print(result)
+        # if value_disease <= df_len:
+        #     # Define the `text` column for labels and `href` column for links
+        #     # Reset the index
+        #     df14.reset_index(inplace=True)
+        #
+        #     # Replace hyphens with spaces in the 'text' column
+        #     df14['Disease'] = df14['Disease'].str.replace('-', ' ')
+        #
+        #     # Set the 'text' column back as the index
+        #     df14.set_index('Disease', inplace=True)
+        #     df14['text'] = df14.index
+        #     df14['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+        #                     '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df14['text']]
+        #     df14['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df14['text']]
+        #     assert isinstance(df14, object)
+        #     df14['database'] = database_name
+        #
+        #     # df11['name'] = [c for c in result['Approved name']]
+        #
+        #     # Create the treemap using `px.treemap`
+        #     fig = px.treemap(df14, path=[df14['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+        #                      hover_name=(df7.head(value_disease)['SIMILARITY']))
+        #
+        #     fig.update(layout_coloraxis_showscale=False)
+        #     fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        #     fig.update_annotations(visible=False)
+        #     fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+        #                       hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+        #                       texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+        #                                    "<a href='%{customdata[0]}'>PubMed"
+        #                                    "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+        #                                    "</span></a>")
+        #     fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["PaleGoldenRod"])
+        #     # # display the treemap in Streamlit
+        #     # # with treemap2:
+        #
+        #     # st.pyplot(fig2)
+        #     st.plotly_chart(fig, use_container_width=True)
+        #
+        #     st.caption("Disease designation and database provided by KEGG: https://www.genome.jp/kegg/disease/")
+        #
+        #     csv = df1.head(value_disease).to_csv().encode('utf-8')
+        #     st.download_button(label=f"download top {value_disease} diseases (csv)", data=csv,
+        #                        file_name=f'{database_name}_disease.csv', mime='text/csv')
+        #
+        #
+        # else:
+        #     st.warning(
+        #         f"This selection exceeds the number of similar diseases related to {query} within the {database_name} corpus, please choose a lower number")
+        # st.markdown("---")
-        [about 100 lines removed here; the page rendering preserved no content for them]
+
+        # st.markdown("---")
+        # # print()
+        # # print("Human genes similar to " + str(query))
+        # df1 = table.copy()
+        # df2 = pd.read_csv('pathwaysKegg.csv')
+        # m = df1.Word.isin(df2.pathway)
+        # df1 = df1[m]
+        # df1.rename(columns={'Word': 'Pathway'}, inplace=True)
+        # df_len = len(df1)
+        # # print(len(df1))
+        # # df1["Human Gene"] = df1["Human Gene"].str.upper()
+        # # print(df1.head(50))
+        # # print()
+        # # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+        # # time.sleep(2)
+        # # Create the slider with increments of 5 up to 100
+        #
+        # # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+        # value_pathway = min(df1.shape[0], 100)
+        #
+        # # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+        # #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+        # #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+        # #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+        # #             unsafe_allow_html=True)
+        #
+        # st.markdown(
+        #     f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_pathway} "
+        #     f"</span>Pathways contextually and semantically similar to "
+        #     f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
+        #     unsafe_allow_html=True)
+        #
+        # df16 = df1.head(value_pathway).copy()
+        #
+        # df16.index = (1 / df16.index) * 10000
+        # sizes = df16.index.tolist()
+        #
+        # df16.set_index('Pathway', inplace=True)
+        #
+        # df9 = df1.copy()
+        # # print(df4.head(10))
+        # df9["SIMILARITY"] = 'Similarity Score ' + df9.head(value_pathway)["SIMILARITY"].round(2).astype(str)
+        # df9.reset_index(inplace=True)
+        # # df4 = df4.rename(columns={'Protein': 'symbol2'})
+        # # print(df4)
+        # # # Use df.query to get a subset of df1 based on ids in df2
+        # # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+        # # # Use merge to join the two DataFrames on id
+        # # result = pd.merge(subset, df2b, on='symbol2')
+        # # print(result)
+        # if value_pathway <= df_len:
+        #     # Define the `text` column for labels and `href` column for links
+        #     # Reset the index
+        #     df16.reset_index(inplace=True)
+        #
+        #     # Replace hyphens with spaces in the 'text' column
+        #     df16['Pathway'] = df16['Pathway'].str.replace('-', ' ')
+        #
+        #     # Set the 'text' column back as the index
+        #     df16.set_index('Pathway', inplace=True)
+        #     df16['text'] = df16.index
+        #     df16['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+        #                     '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df16['text']]
+        #     df16['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df16['text']]
+        #     assert isinstance(df16, object)
+        #     df16['database'] = database_name
+        #
+        #     # df11['name'] = [c for c in result['Approved name']]
+        #
+        #     # Create the treemap using `px.treemap`
+        #     fig = px.treemap(df16, path=[df16['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+        #                      hover_name=(df9.head(value_pathway)['SIMILARITY']))
+        #
+        #     fig.update(layout_coloraxis_showscale=False)
+        #     fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        #     fig.update_annotations(visible=False)
+        #     fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+        #                       hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+        #                       texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+        #                                    "<a href='%{customdata[0]}'>PubMed"
+        #                                    "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+        #                                    "</span></a>")
+        #     fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["FloralWhite"])
+        #     # # display the treemap in Streamlit
+        #     # # with treemap2:
+        #
+        #     # st.pyplot(fig2)
+        #     st.plotly_chart(fig, use_container_width=True)
+        #
+        #     st.caption("Pathway designation and database provided by KEGG: https://www.genome.jp/kegg/pathway.html")
+        #
+        #     csv = df1.head(value_pathway).to_csv().encode('utf-8')
+        #     st.download_button(label=f"download top {value_pathway} pathways (csv)", data=csv,
+        #                        file_name=f'{database_name}_pathways.csv', mime='text/csv')
+        #
+        #
+        # else:
+        #     st.warning(
+        #         f"This selection exceeds the number of similar pathways related to {query} within the {database_name} corpus, please choose a lower number")
+        # st.markdown("---")
+
+        st.markdown("---")
+        # print()
+        # print("Human genes similar to " + str(query))
+        df1 = table.copy()
+        df2 = pd.read_csv('phytochemicals.csv')
+        m = df1.Word.isin(df2.phyto)
+        df1 = df1[m]
+        df1.rename(columns={'Word': 'Phytochemical'}, inplace=True)
+        df_len = len(df1)
+        # print(len(df1))
+        # df1["Human Gene"] = df1["Human Gene"].str.upper()
+        # print(df1.head(50))
+        # print()
+        # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+        # time.sleep(2)
+        # Create the slider with increments of 5 up to 100
+
+        # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+        value_phyto = min(df1.shape[0], 100)
+
+        # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+        #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+        #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+        #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+        #             unsafe_allow_html=True)
+
+        st.markdown(
+            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_phyto} "
+            f"</span>Phytochemicals contextually and semantically similar to "
+            f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
+            f"Click on the squares to expand and also the Pubmed and Wikipedia links for more compound information</span></p></b>",
+            unsafe_allow_html=True)
+
+        df15 = df1.head(value_phyto).copy()
+
+        df15.index = (1 / df15.index) * 10000
+        sizes = df15.index.tolist()
 
         df15.set_index('Phytochemical', inplace=True)
 
+        df8 = df1.copy()
+        # print(df4.head(10))
+        df8["SIMILARITY"] = 'Similarity Score ' + df8.head(value_phyto)["SIMILARITY"].round(2).astype(str)
+        df8.reset_index(inplace=True)
+        # df4 = df4.rename(columns={'Protein': 'symbol2'})
+        # print(df4)
+        # # Use df.query to get a subset of df1 based on ids in df2
+        # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+        # # Use merge to join the two DataFrames on id
+        # result = pd.merge(subset, df2b, on='symbol2')
+        # print(result)
+        if value_phyto <= df_len:
+            # Define the `text` column for labels and `href` column for links
+            # Reset the index
+            df15.reset_index(inplace=True)
+
+            # Replace hyphens with spaces in the 'text' column
+            df15['Phytochemical'] = df15['Phytochemical'].str.replace('-', ' ')
+
+            # Set the 'text' column back as the index
+            df15.set_index('Phytochemical', inplace=True)
+            df15['text'] = df15.index
+            df15['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df15['text']]
+            df15['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df15['text']]
+            assert isinstance(df15, object)
+            df15['database'] = database_name
+
+            # df11['name'] = [c for c in result['Approved name']]
+
+            # Create the treemap using `px.treemap`
+            fig = px.treemap(df15, path=[df15['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                             hover_name=(df8.head(value_phyto)['SIMILARITY']))
+
+            fig.update(layout_coloraxis_showscale=False)
+            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+            fig.update_annotations(visible=False)
+            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                           "<a href='%{customdata[0]}'>PubMed"
+                                           "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+                                           "</span></a>")
+            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightSeaGreen"])
+            # # display the treemap in Streamlit
+            # with treemap2:
+
+            # st.pyplot(fig2)
+            st.plotly_chart(fig, use_container_width=True)
+
+            st.caption("Phytochemical designation and database provided by PhytoHub: https://phytohub.eu/")
+
+            csv = df1.head(value_phyto).to_csv().encode('utf-8')
+            st.download_button(label=f"download top {value_phyto} phytochemicals (csv)", data=csv,
+                               file_name=f'{database_name}_phytochemicals.csv', mime='text/csv')
+
+
+        else:
+            st.warning(
+                f"This selection exceeds the number of similar pythochemicals related to {query} within the {database_name} corpus, please choose a lower number")
+        st.markdown("---")
+
+        # print()
+        # print("Human genes similar to " + str(query))
+        df1 = table.copy()
+        df2 = pd.read_csv('kegg_compounds_lowercase.csv')
+        m = df1.Word.isin(df2.compound)
+        df1 = df1[m]
+        df1.rename(columns={'Word': 'Compounds'}, inplace=True)
+        df_len = len(df1)
+        # df1["Human Gene"] = df1["Human Gene"].str.upper()
+        # print(df1.head(50))
+        # print()
+        # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+        # time.sleep(2)
+        # Create the slider with increments of 5 up to 100
+
+        # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+        value_compound = min(df1.shape[0], 100)
+
+        # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+        #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+        #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+        #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+        #             unsafe_allow_html=True)
+
+        st.markdown(
+            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_compound} "
+            f"</span>Compounds contextually and semantically similar to "
+            f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
+            f"Click on the squares to expand and the Pubmed, Wikipedia, and KEGG links for more compound information (may take time to load)</span></p></b>",
+            unsafe_allow_html=True)
+
+        df12 = df1.head(value_compound).copy()
+
+        df12.index = (1 / df12.index) * 10000
+        sizes = df12.index.tolist()
+
+        df12.set_index('Compounds', inplace=True)
+
+        df5 = df1.copy()
+        # print(df4.head(10))
+        df5["SIMILARITY"] = 'Similarity Score ' + df5.head(value_compound)["SIMILARITY"].round(2).astype(str)
+        df5.reset_index(inplace=True)
+        # df4 = df4.rename(columns={'Protein': 'symbol2'})
+        # print(df4)
+        # # Use df.query to get a subset of df1 based on ids in df2
+        # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+        # # Use merge to join the two DataFrames on id
+        # result = pd.merge(subset, df2b, on='symbol2')
+        # print(result)
+
+        if value_compound <= df_len:
+            # Define the `text` column for labels and `href` column for links
+            # Reset the index
+            df12.reset_index(inplace=True)
+
+            # Replace hyphens with spaces in the 'text' column
+            df12['Compounds'] = df12['Compounds'].str.replace('-', ' ')
+
+            # Set the 'text' column back as the index
+            df12.set_index('Compounds', inplace=True)
+            df12['text'] = df12.index
+            df12['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df12['text']]
+            df12['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df12['text']]
+            df12['href3'] = [f'https://www.genome.jp/entry/{compound_id}' for compound_id in get_compound_ids(df12['text'])]
+            assert isinstance(df12, object)
+            df12['database'] = database_name
+
+            # df11['name'] = [c for c in result['Approved name']]
+
+            # Create the treemap using `px.treemap`
+            fig = px.treemap(df12, path=[df12['text']], values=sizes,
+                             custom_data=['href', 'database', 'href2', 'text', 'href3'],
+                             hover_name=(df5.head(value_compound)['SIMILARITY']))
+
+            fig.update(layout_coloraxis_showscale=False)
+            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+            fig.update_annotations(visible=False)
+            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                           "<a href='%{customdata[0]}'>PubMed"
+                                           "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+                                           "</a><br><br><a href='%{customdata[4]}'>KEGG Compound Page"
+                                           "</span></a>")
+
+            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightYellow"])
+            # # display the treemap in Streamlit
+            # with treemap2:
+
+            # st.pyplot(fig2)
+            st.plotly_chart(fig, use_container_width=True)
+
+            st.caption("Compound designation and database provided by KEGG: https://www.kegg.jp/kegg/compound/")
+
+            csv = df1.head(value_compound).to_csv().encode('utf-8')
+            st.download_button(label=f"download top {value_compound} compounds (csv)", data=csv,
+                               file_name=f'{database_name}_compounds.csv', mime='text/csv')
+
+
+        else:
+            st.warning(
+                f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
+        st.markdown("---")
+
+        # import os
+
+        # from datasets import Dataset
+
+        # # Check if the comments directory exists
+        # if os.path.exists('comments'):
+        #     # Load the dataset from disk
+        #     dataset = Dataset.load_from_disk('comments')
+        # else:
+        #     # Create a new dataset
+        #     dataset = Dataset.from_dict({'id': [], 'text': []})
+
+        # def save_comment(comment):
+        #     # Check if the dataset exists
+        #     if os.path.exists('comments'):
+        #         dataset = Dataset.load_from_disk('comments')
+        #     else:
+        #         dataset = Dataset.from_dict({'id': [], 'text': []})
 
+        #     dataset.save_to_disk('comments')
+
+        #     print('Comment saved to dataset.')
+
+        # st.title("Abstractalytics Web App")
+        # st.write("We appreciate your feedback!")
+
+        # user_comment = st.text_area("Please send us your anonymous remarks/suggestions about the Abstractalytics Web App: "
+        #                             "(app will pause while we save your comments)")
+
+        # if st.button("Submit"):
+        #     if user_comment:
+        #         save_comment(user_comment)
+        #         st.success("Your comment has been saved. Thank you for your feedback!")
+        #     else:
+        #         st.warning("Please enter a comment before submitting.")
+
+        # # Load the comments dataset from disk
+        # if os.path.exists('comments'):
+        #     dataset = Dataset.load_from_disk('comments')
+        # else:
+        #     dataset = Dataset.from_dict({'id': [], 'text': []})
+
+        # # Access the text column of the dataset
+        # comments = dataset['text']
+
+        # # Define the password
+        # PASSWORD = 'ram100pass'
+
+        # # Prompt the user for the password
+        # password = st.text_input('Password:', type='password')
+
+        # # Display the comments if the password is correct
+        # if password == PASSWORD:
+        #     st.title('Comments')
+        #     for comment in comments:
+        #         st.write(comment)
+        # else:
+        #     st.warning('Incorrect password')
 
 
 st.subheader("Cancer-related videos")
 if query:
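For reference, the `get_compound_id` helper added in this commit leans on the plain-text format of the KEGG REST `find` endpoint: every match comes back as one line whose tab-separated first field is the compound ID. Below is a standalone sketch of the same lookup with a request timeout added; the example compound and the exact ID shown are illustrative.

import requests

def get_compound_id(compound_name):
    # KEGG returns zero or more lines like "cpd:C00389\tQuercetin; ..."
    # The ID is everything before the first tab on the first line.
    url = f"https://rest.kegg.jp/find/compound/{compound_name}"
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        first_line = response.text.split('\n')[0]
        if first_line:
            return first_line.split('\t')[0]
    return None

print(get_compound_id("quercetin"))  # e.g. "cpd:C00389"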
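The repeated `df.index = (1 / df.index) * 10000` lines are what size the treemap tiles: each table arrives ranked by similarity, and the reciprocal of the rank turns positions 1, 2, 3, ... into weights that shrink as 1, 1/2, 1/3, ..., so the best match gets the biggest tile. A toy run with invented words and scores:

import pandas as pd

# Toy stand-in for the ranked similarity table the app builds.
table = pd.DataFrame({"Word": ["tumor", "biopsy", "lesion"],
                      "SIMILARITY": [0.91, 0.84, 0.80]})
table.index += 1                         # ranks 1, 2, 3
table.index = (1 / table.index) * 10000  # weights 10000, 5000, 3333.3...
sizes = table.index.tolist()
print(sizes)  # relative areas handed to px.treemap(values=sizes)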
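The long `href` expressions are a percent-encoded PubMed query: `%5Bmh%5D` is `[mh]` (MeSH heading), `%5Bpt%5D` is `[pt]` (publication type), `%5Bla%5D` is `[la]` (language), and `%5Bdp%5D` is `[dp]` (publication date), so every tile links to non-review English abstracts from 1990-2022 indexed under the corpus disease plus the tile's own term. A sketch that builds the same kind of URL with `urllib.parse.quote` doing the encoding (assumed equivalent to the hand-encoded version, not taken from the commit):

from urllib.parse import quote

def pubmed_url(database_name, term):
    # Same query the app assembles by hand, with the brackets encoded for us.
    query = (f"{database_name}[mh] NOT review[pt] AND english[la] "
             f"AND hasabstract AND 1990:2022[dp] AND {term}")
    return "https://pubmed.ncbi.nlm.nih.gov/?term=" + quote(query)

print(pubmed_url("Skin_cancer", "melanoma"))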
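Finally, the lookup feeding all of these treemaps is gensim's `most_similar_cosmul`, which ranks the whole vocabulary by multiplicative cosine similarity to the query and raises `KeyError` for out-of-vocabulary queries, the case the new `except KeyError` message is written for. A self-contained toy run:

from gensim.models import Word2Vec

# Tiny throwaway corpus; min_count=1 so every token enters the vocabulary.
sentences = [["tumor", "growth", "inhibited"],
             ["tumor", "cells", "proliferate"],
             ["growth", "factor", "signaling"]]
model = Word2Vec(sentences=sentences, vector_size=16, min_count=1, seed=1)

print(model.wv.most_similar_cosmul("tumor", topn=2))  # ranked (word, score) pairs
try:
    model.wv.most_similar_cosmul("misspelled")
except KeyError as err:
    print("out-of-vocabulary query:", err)  # what the app's new warning catches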
76 |
# # If the password is correct, show the app content
|
77 |
# if authenticate(password):
|
78 |
opt = st.sidebar.radio("Select a PubMed Corpus", options=('Breast Cancer corpus', 'Lung Cancer corpus',
|
79 |
+
'Skin Cancer corpus', 'Colorectal Cancer corpus',
|
80 |
+
'Prostate Cancer corpus'))
|
81 |
# if opt == "Clotting corpus":
|
82 |
# model_used = ("pubmed_model_clotting")
|
83 |
# num_abstracts = 45493
|
|
|
106 |
model_used = ("prostate_cancer_pubmed_model")
|
107 |
num_abstracts = 89782
|
108 |
database_name = "Prostate_cancer"
|
109 |
+
if opt == "Skin Cancer corpus":
|
110 |
+
model_used = ("skin_cancer_pubmed_model")
|
111 |
+
num_abstracts = 176568
|
112 |
+
database_name = "Skin_cancer"
|
113 |
|
114 |
st.header(f":blue[{database_name} Pubmed corpus.]")
|
115 |
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
|
|
|
131 |
bar.progress((i + 1) * 10)
|
132 |
time.sleep(.1)
|
133 |
|
134 |
+
try:
|
135 |
+
model = Word2Vec.load(f"{model_used}") # you can continue training with the loaded model!
|
136 |
+
words = list(model.wv.key_to_index)
|
137 |
+
X = model.wv[model.wv.key_to_index]
|
138 |
+
# print(model.wv['bfgf'])
|
139 |
+
model2 = model.wv[query]
|
140 |
+
# print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
|
141 |
+
df = pd.DataFrame(X)
|
142 |
|
143 |
|
144 |
+
def get_compound_ids(compound_names):
|
145 |
+
with concurrent.futures.ThreadPoolExecutor() as executor:
|
146 |
+
compound_ids = list(executor.map(get_compound_id, compound_names))
|
147 |
+
return compound_ids
|
148 |
|
149 |
|
150 |
+
import requests
|
151 |
|
152 |
|
153 |
+
def get_compound_id(compound_name):
|
154 |
+
url = f"http://rest.kegg.jp/find/compound/{compound_name}"
|
155 |
+
response = requests.get(url)
|
156 |
+
if response.status_code == 200:
|
157 |
+
result = response.text.split('\n')
|
158 |
+
if result[0]:
|
159 |
+
compound_id = result[0].split('\t')[0]
|
160 |
+
return compound_id
|
161 |
+
return None
|
162 |
|
163 |
|
164 |
# except:
|
165 |
# st.error("Term occurrence is too low - please try another term")
|
166 |
# st.stop()
|
167 |
+
st.markdown("---")
|
168 |
|
169 |
+
try:
|
170 |
+
table = model.wv.most_similar_cosmul(query, topn=10000)
|
171 |
+
table = (pd.DataFrame(table))
|
172 |
+
table.index.name = 'Rank'
|
173 |
+
table.columns = ['Word', 'SIMILARITY']
|
174 |
|
175 |
+
pd.set_option('display.max_rows', None)
|
176 |
+
table2 = table.copy()
|
177 |
|
|
|
|
|
|
|
|
|
|
|
178 |
|
|
|
|
|
179 |
|
180 |
+
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
|
181 |
+
# f"<span style='color:red; font-style: italic;'>words</span> contextually "
|
182 |
+
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
|
183 |
+
# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
|
184 |
+
# unsafe_allow_html=True)
|
|
|
185 |
|
186 |
+
# Set the max number of words to display
|
187 |
+
value_word = min(100, len(table2))
|
|
|
|
|
188 |
|
189 |
+
st.markdown(
|
190 |
+
f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
|
191 |
+
f"</span>words contextually and semantically similar to "
|
192 |
+
f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
|
193 |
+
f"Click on the squares to expand and also the PubMed and Wikipedia links for more word information</span></p></b>",
|
194 |
+
unsafe_allow_html=True)
|
195 |
|
196 |
+
short_table = table2.head(value_word).round(2)
|
197 |
+
short_table.index += 1
|
198 |
+
short_table.index = (1 / short_table.index) * 10
|
199 |
+
sizes = short_table.index.tolist()
|
200 |
+
|
201 |
+
short_table.set_index('Word', inplace=True)
|
202 |
+
table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
|
203 |
+
rank_num = list(short_table.index.tolist())
|
204 |
+
|
205 |
+
df = short_table
|
206 |
+
|
207 |
+
|
208 |
+
df['text'] = short_table.index
|
209 |
+
df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
|
210 |
'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
|
211 |
+
df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
|
212 |
|
213 |
+
df.loc[:, 'database'] = database_name
|
214 |
|
215 |
+
fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
|
216 |
+
hover_name=(table2.head(value_word)['SIMILARITY']))
|
217 |
|
218 |
+
fig.update(layout_coloraxis_showscale=False)
|
219 |
+
fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
220 |
+
fig.update_annotations(visible=False)
|
221 |
+
fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
|
222 |
hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="<br><span "
|
223 |
"style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
|
224 |
"<a href='%{customdata[0]}'>PubMed"
|
225 |
"</a><br><br><a href='%{customdata[3]}'>Wikipedia"
|
226 |
"</span></a>")
|
227 |
+
fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
|
228 |
|
229 |
+
# st.pyplot(fig2)
|
230 |
+
st.plotly_chart(fig, use_container_width=True)
|
231 |
|
232 |
+
# st.caption(
|
233 |
+
# "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
|
234 |
+
# st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
|
235 |
|
236 |
+
csv = table2.head(value_word).to_csv().encode('utf-8')
|
237 |
+
st.download_button(label=f"download top {value_word} words (csv)", data=csv,
|
238 |
file_name=f'{database_name}_words.csv', mime='text/csv')
|
|
|
|
|
|
|
239 |
|
240 |
+
except:
|
241 |
+
st.warning(
|
242 |
+
f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
|
243 |
+
except KeyError:
|
244 |
+
st.warning(
|
245 |
+
"This word is not found in the corpus, it could be because it is not spelled correctly or could be that it does not have enough representation within the corpus, please try again")
|
246 |
# st.markdown("---")
|
247 |
# # st.write(short_table)
|
248 |
# #
|
|
|
346 |
|
347 |
st.markdown("---")
|
348 |
|
349 |
+
try:
|
350 |
+
df1 = table.copy()
|
351 |
+
df2 = pd.read_csv('Human Genes.csv')
|
352 |
+
m = df1.Word.isin(df2.symbol)
|
353 |
+
df1 = df1[m]
|
354 |
+
df1.rename(columns={'Word': 'Genes'}, inplace=True)
|
355 |
+
df_len = len(df1)
|
356 |
+
print(len(df1))
|
357 |
+
|
358 |
+
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
|
359 |
+
# f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
|
360 |
+
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
|
361 |
+
# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
|
362 |
+
# unsafe_allow_html=True)
|
363 |
+
|
364 |
+
# Set the number of proteins to display
|
365 |
+
value_gene = min(df_len, 100)
|
366 |
+
|
367 |
+
st.markdown(
|
368 |
+
f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
|
369 |
+
f"</span>human genes contextually and semantically similar to "
|
370 |
+
f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name} </span>corpus. Click on the squares to expand and also the Pubmed and GeneCard links for more gene information</span></p></b>",
|
371 |
+
unsafe_allow_html=True)
|
372 |
+
|
373 |
+
df11 = df1.head(value_gene).copy()
|
374 |
+
|
375 |
+
df11.index = (1 / df11.index) * 10000
|
376 |
+
sizes = df11.index.tolist()
|
377 |
+
|
378 |
+
df11.set_index('Genes', inplace=True)
|
379 |
+
|
380 |
+
df4 = df1.copy()
|
381 |
+
# print(df4.head(10))
|
382 |
+
df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_gene)["SIMILARITY"].round(2).astype(str)
|
383 |
+
df4.reset_index(inplace=True)
|
384 |
+
# df4 = df4.rename(columns={'Protein': 'symbol2'})
|
385 |
+
# print(df4)
|
386 |
+
# # Use df.query to get a subset of df1 based on ids in df2
|
387 |
+
# subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
|
388 |
+
# # Use merge to join the two DataFrames on id
|
389 |
+
# result = pd.merge(subset, df2b, on='symbol2')
|
390 |
+
# print(result)
|
391 |
+
if value_gene <= df_len:
|
392 |
+
# Define the `text` column for labels and `href` column for links
|
393 |
+
df11['text'] = df11.index
|
394 |
+
df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
|
395 |
+
'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
|
396 |
+
df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
|
397 |
+
assert isinstance(df11, object)
|
398 |
+
df11['database'] = database_name
|
399 |
+
|
400 |
+
# df11['name'] = [c for c in result['Approved name']]
|
401 |
+
|
402 |
+
# Create the treemap using `px.treemap`
|
403 |
+
fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
|
404 |
+
hover_name=(df4.head(value_gene)['SIMILARITY']))
|
405 |
+
|
406 |
+
fig.update(layout_coloraxis_showscale=False)
|
407 |
+
fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
408 |
+
fig.update_annotations(visible=False)
|
409 |
+
fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
|
410 |
+
hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
|
411 |
+
texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
|
412 |
+
"<a href='%{customdata[0]}'>PubMed"
|
413 |
+
"</a><br><br><a href='%{customdata[2]}'>GeneCard"
|
414 |
+
"</span></a>")
|
415 |
+
fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
|
416 |
+
# # display the treemap in Streamlit
|
417 |
+
# with treemap2:
|
418 |
+
|
419 |
+
# st.pyplot(fig2)
|
420 |
+
st.plotly_chart(fig, use_container_width=True)
|
421 |
+
|
422 |
+
# st.caption(
|
423 |
+
# "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
|
424 |
+
# st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
|
425 |
+
st.caption(
|
426 |
+
"Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
|
427 |
+
st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
|
428 |
+
st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
|
429 |
+
|
430 |
+
csv = df1.head(value_gene).to_csv().encode('utf-8')
|
431 |
+
st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
|
432 |
+
file_name=f'{database_name}_genes.csv', mime='text/csv')
|
433 |
+
|
434 |
+
|
435 |
+
else:
|
436 |
+
st.warning(
|
437 |
+
f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
|
438 |
+
st.markdown("---")
|
439 |
+
# print()
|
440 |
+
# print("Human genes similar to " + str(query))
|
441 |
+
df1 = table.copy()
|
442 |
+
df2 = pd.read_csv('kegg_drug_list_lowercase.csv')
|
443 |
+
m = df1.Word.isin(df2.drugs)
|
444 |
+
df1 = df1[m]
|
445 |
+
df1.rename(columns={'Word': 'Drugs'}, inplace=True)
|
446 |
+
df_len = len(df1)
|
447 |
+
# print(len(df1))
|
448 |
+
# df1["Human Gene"] = df1["Human Gene"].str.upper()
|
449 |
+
# print(df1.head(50))
|
450 |
+
# print()
|
451 |
+
# df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
|
452 |
+
# time.sleep(2)
|
453 |
+
# Create the slider with increments of 5 up to 100
|
454 |
+
|
455 |
+
# Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
|
456 |
+
value_drug = min(df1.shape[0], 100)
|
457 |
+
|
458 |
+
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
|
459 |
+
# f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
|
460 |
+
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
|
461 |
+
# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
|
462 |
+
# unsafe_allow_html=True)
|
463 |
+
|
464 |
+
st.markdown(
|
465 |
+
f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_drug} "
|
466 |
+
f"</span>Drugs contextually and semantically similar to "
|
467 |
+
f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
|
468 |
+
unsafe_allow_html=True)
|
469 |
+
|
470 |
+
df13 = df1.head(value_drug).copy()
|
471 |
+
|
472 |
+
df13.index = (1 / df13.index) * 10000
|
473 |
+
sizes = df13.index.tolist()
|
474 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
475 |
df13.set_index('Drugs', inplace=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
476 |
|
477 |
+
df6 = df1.copy()
|
478 |
+
# print(df4.head(10))
|
479 |
+
df6["SIMILARITY"] = 'Similarity Score ' + df6.head(value_drug)["SIMILARITY"].round(2).astype(str)
|
480 |
+
df6.reset_index(inplace=True)
|
481 |
+
# df4 = df4.rename(columns={'Protein': 'symbol2'})
|
482 |
+
# print(df4)
|
483 |
+
# # Use df.query to get a subset of df1 based on ids in df2
|
484 |
+
# subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
|
485 |
+
# # Use merge to join the two DataFrames on id
|
486 |
+
# result = pd.merge(subset, df2b, on='symbol2')
|
487 |
+
# print(result)
|
488 |
+
if value_drug <= df_len:
|
489 |
+
# Define the `text` column for labels and `href` column for links
|
490 |
+
# Reset the index
|
491 |
+
df13.reset_index(inplace=True)
|
492 |
+
|
493 |
+
# Replace hyphens with spaces in the 'text' column
|
494 |
+
df13['Drugs'] = df13['Drugs'].str.replace('-', ' ')
|
495 |
+
|
496 |
+
# Set the 'text' column back as the index
|
497 |
+
df13.set_index('Drugs', inplace=True)
|
498 |
+
df13['text'] = df13.index
|
499 |
+
df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
|
500 |
+
'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
|
501 |
+
df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
|
502 |
+
assert isinstance(df13, object)
|
503 |
+
df13['database'] = database_name
|
504 |
+
|
505 |
+
# df11['name'] = [c for c in result['Approved name']]
|
506 |
+
|
507 |
+
# Create the treemap using `px.treemap`
|
508 |
+
fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
|
509 |
+
hover_name=(df6.head(value_drug)['SIMILARITY']))
|
510 |
+
|
511 |
+
fig.update(layout_coloraxis_showscale=False)
|
512 |
+
fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
513 |
+
fig.update_annotations(visible=False)
|
514 |
+
fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
|
515 |
+
hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
|
516 |
+
texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
|
517 |
+
"<a href='%{customdata[0]}'>PubMed"
|
518 |
+
"</a><br><br><a href='%{customdata[2]}'>Wikipedia"
|
519 |
+
"</span></a>")
|
520 |
+
fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
|
521 |
+
# # display the treemap in Streamlit
|
522 |
+
# with treemap2:
|
523 |
+
|
524 |
+
# st.pyplot(fig2)
|
525 |
+
st.plotly_chart(fig, use_container_width=True)
|
526 |
+
|
527 |
+
st.caption("Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
|
528 |
+
|
529 |
+
csv = df1.head(value_drug).to_csv().encode('utf-8')
|
530 |
+
st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
|
531 |
+
file_name=f'{database_name}_drugs.csv', mime='text/csv')
|
532 |
+
|
533 |
+
|
534 |
+
else:
|
535 |
+
st.warning(
|
536 |
+
f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
|
537 |
+
st.markdown("---")
|
538 |
+
#
|
539 |
+
# st.markdown("---")
|
540 |
+
# # print()
|
541 |
+
# # print("Human genes similar to " + str(query))
|
542 |
+
# df1 = table.copy()
|
543 |
+
# df2 = pd.read_csv('diseasesKegg.csv')
|
544 |
+
# m = df1.Word.isin(df2.disease)
|
545 |
+
# df1 = df1[m]
|
546 |
+
# df1.rename(columns={'Word': 'Disease'}, inplace=True)
|
547 |
+
# df_len = len(df1)
|
548 |
+
# # print(len(df1))
|
549 |
+
# # df1["Human Gene"] = df1["Human Gene"].str.upper()
|
550 |
+
# # print(df1.head(50))
|
551 |
+
# # print()
|
552 |
+
# # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
|
553 |
+
# # time.sleep(2)
|
554 |
+
# # Create the slider with increments of 5 up to 100
|
555 |
+
#
|
556 |
+
# # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
|
557 |
+
# value_disease = min(df1.shape[0], 100)
|
558 |
+
#
|
559 |
+
# # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
|
560 |
+
# # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
|
561 |
+
# # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
|
562 |
+
# # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
|
563 |
+
# # unsafe_allow_html=True)
|
564 |
+
#
|
565 |
+
# st.markdown(
|
566 |
+
# f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_disease} "
|
567 |
+
# f"</span>Diseases contextually and semantically similar to "
|
568 |
+
# f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
|
569 |
+
# unsafe_allow_html=True)
|
570 |
+
#
|
571 |
+
# df14 = df1.head(value_disease).copy()
|
572 |
+
#
|
573 |
+
# df14.index = (1 / df14.index) * 10000
|
574 |
+
# sizes = df14.index.tolist()
|
575 |
+
#
|
576 |
+
# df14.set_index('Disease', inplace=True)
|
577 |
+
#
|
578 |
+
# df7 = df1.copy()
|
579 |
+
# # print(df4.head(10))
|
580 |
+
# df7["SIMILARITY"] = 'Similarity Score ' + df7.head(value_disease)["SIMILARITY"].round(2).astype(str)
|
581 |
+
# df7.reset_index(inplace=True)
|
582 |
+
# # df4 = df4.rename(columns={'Protein': 'symbol2'})
|
583 |
+
# # print(df4)
|
584 |
+
# # # Use df.query to get a subset of df1 based on ids in df2
|
585 |
+
# # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
|
586 |
+
# # # Use merge to join the two DataFrames on id
|
587 |
+
# # result = pd.merge(subset, df2b, on='symbol2')
|
588 |
+
# # print(result)
|
589 |
+
# if value_disease <= df_len:
|
590 |
+
# # Define the `text` column for labels and `href` column for links
|
591 |
+
# # Reset the index
|
592 |
+
# df14.reset_index(inplace=True)
|
593 |
+
#
|
594 |
+
# # Replace hyphens with spaces in the 'text' column
|
595 |
+
# df14['Disease'] = df14['Disease'].str.replace('-', ' ')
|
596 |
+
#
|
597 |
+
# # Set the 'text' column back as the index
|
598 |
+
# df14.set_index('Disease', inplace=True)
|
599 |
+
# df14['text'] = df14.index
|
600 |
+
# df14['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
|
601 |
+
# '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df14['text']]
|
602 |
+
# df14['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df14['text']]
|
603 |
+
# assert isinstance(df14, object)
|
604 |
+
# df14['database'] = database_name
|
605 |
+
#
|
606 |
+
# # df11['name'] = [c for c in result['Approved name']]
|
607 |
+
#
|
608 |
+
# # Create the treemap using `px.treemap`
|
609 |
+
# fig = px.treemap(df14, path=[df14['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
|
610 |
+
# hover_name=(df7.head(value_disease)['SIMILARITY']))
|
611 |
+
#
|
612 |
+
# fig.update(layout_coloraxis_showscale=False)
|
613 |
+
# fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
614 |
+
# fig.update_annotations(visible=False)
|
615 |
+
# fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
|
616 |
+
# hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
|
617 |
+
# texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
|
618 |
+
# "<a href='%{customdata[0]}'>PubMed"
|
619 |
+
# "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
|
620 |
+
# "</span></a>")
|
621 |
+
# fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["PaleGoldenRod"])
|
622 |
+
# # # display the treemap in Streamlit
|
623 |
+
# # with treemap2:
|
624 |
+
#
|
625 |
+
# # st.pyplot(fig2)
|
626 |
+
# st.plotly_chart(fig, use_container_width=True)
|
627 |
+
#
|
628 |
+
# st.caption("Disease designation and database provided by KEGG: https://www.genome.jp/kegg/disease/")
|
629 |
+
#
|
630 |
+
# csv = df1.head(value_disease).to_csv().encode('utf-8')
|
631 |
+
# st.download_button(label=f"download top {value_disease} diseases (csv)", data=csv,
|
632 |
+
# file_name=f'{database_name}_disease.csv', mime='text/csv')
|
633 |
+
#
|
634 |
+
#
|
635 |
+
# else:
|
636 |
+
# st.warning(
|
637 |
+
# f"This selection exceeds the number of similar diseases related to {query} within the {database_name} corpus, please choose a lower number")
|
638 |
+
# st.markdown("---")
|
639 |
+
|
640 |
+
# st.markdown("---")
|
641 |
+
# # print()
|
642 |
+
# # print("Human genes similar to " + str(query))
|
643 |
+
# df1 = table.copy()
|
644 |
+
# df2 = pd.read_csv('pathwaysKegg.csv')
|
645 |
+
# m = df1.Word.isin(df2.pathway)
|
646 |
+
# df1 = df1[m]
|
647 |
+
# df1.rename(columns={'Word': 'Pathway'}, inplace=True)
|
648 |
+
# df_len = len(df1)
|
649 |
+
# # print(len(df1))
|
650 |
+
# # df1["Human Gene"] = df1["Human Gene"].str.upper()
|
651 |
+
# # print(df1.head(50))
|
652 |
+
# # print()
|
653 |
+
# # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
|
654 |
+
# # time.sleep(2)
|
655 |
+
# # Create the slider with increments of 5 up to 100
|
656 |
+
#
|
657 |
+
# # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
|
658 |
+
# value_pathway = min(df1.shape[0], 100)
|
659 |
+
#
|
660 |
+
# # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
|
661 |
+
# # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
|
662 |
+
# # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
|
663 |
+
# # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
|
664 |
+
# # unsafe_allow_html=True)
|
665 |
+
#
|
666 |
+
# st.markdown(
|
667 |
+
# f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_pathway} "
|
668 |
+
# f"</span>Pathways contextually and semantically similar to "
|
669 |
+
# f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
|
670 |
+
# unsafe_allow_html=True)
|
671 |
+
#
|
672 |
+
# df16 = df1.head(value_pathway).copy()
|
673 |
+
#
|
674 |
+
# df16.index = (1 / df16.index) * 10000
|
675 |
+
# sizes = df16.index.tolist()
|
676 |
+
#
|
677 |
+
# df16.set_index('Pathway', inplace=True)
|
678 |
+
#
|
679 |
+
# df9 = df1.copy()
|
680 |
+
# # print(df4.head(10))
|
681 |
+
# df9["SIMILARITY"] = 'Similarity Score ' + df9.head(value_pathway)["SIMILARITY"].round(2).astype(str)
|
682 |
+
# df9.reset_index(inplace=True)
|
683 |
+
# # df4 = df4.rename(columns={'Protein': 'symbol2'})
|
684 |
+
# # print(df4)
|
685 |
+
# # # Use df.query to get a subset of df1 based on ids in df2
|
686 |
+
# # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
|
687 |
+
# # # Use merge to join the two DataFrames on id
|
688 |
+
# # result = pd.merge(subset, df2b, on='symbol2')
|
689 |
+
# # print(result)
|
690 |
+
# if value_pathway <= df_len:
|
691 |
+
# # Define the `text` column for labels and `href` column for links
|
692 |
+
# # Reset the index
|
693 |
+
# df16.reset_index(inplace=True)
|
694 |
+
#
|
695 |
+
# # Replace hyphens with spaces in the 'text' column
|
696 |
+
# df16['Pathway'] = df16['Pathway'].str.replace('-', ' ')
|
697 |
+
#
|
698 |
+
# # Set the 'text' column back as the index
|
699 |
+
# df16.set_index('Pathway', inplace=True)
|
700 |
+
# df16['text'] = df16.index
|
701 |
+
# df16['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
|
702 |
+
# '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df16['text']]
|
703 |
+
# df16['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df16['text']]
|
704 |
+
# assert isinstance(df16, object)
|
705 |
+
# df16['database'] = database_name
|
706 |
+
#
|
707 |
+
# # df11['name'] = [c for c in result['Approved name']]
|
708 |
+
#
|
709 |
+
# # Create the treemap using `px.treemap`
|
710 |
+
# fig = px.treemap(df16, path=[df16['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
|
711 |
+
# hover_name=(df9.head(value_pathway)['SIMILARITY']))
|
712 |
+
#
|
713 |
+
# fig.update(layout_coloraxis_showscale=False)
|
714 |
+
# fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
|
715 |
+
# fig.update_annotations(visible=False)
|
716 |
+
# fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
|
717 |
+
# hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
|
718 |
+
# texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
|
719 |
+
# "<a href='%{customdata[0]}'>PubMed"
|
720 |
+
# "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
|
721 |
+
# "</span></a>")
|
722 |
+
# fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["FloralWhite"])
|
723 |
+
# # # display the treemap in Streamlit
|
724 |
+
# # with treemap2:
|
725 |
+
#
|
726 |
+
# # st.pyplot(fig2)
|
727 |
+
# st.plotly_chart(fig, use_container_width=True)
|
728 |
+
#
|
729 |
+
# st.caption("Pathway designation and database provided by KEGG: https://www.genome.jp/kegg/pathway.html")
|
730 |
+
#
|
731 |
+
# csv = df1.head(value_pathway).to_csv().encode('utf-8')
|
732 |
+
# st.download_button(label=f"download top {value_pathway} pathways (csv)", data=csv,
|
733 |
+
# file_name=f'{database_name}_pathways.csv', mime='text/csv')
|
734 |
+
#
|
735 |
+
#
|
736 |
+
# else:
|
737 |
+
# st.warning(
|
738 |
+
# f"This selection exceeds the number of similar pathways related to {query} within the {database_name} corpus, please choose a lower number")
|
739 |
+
# st.markdown("---")
|
740 |
+
|
741 |
+
st.markdown("---")
|
+        # print()
+        # print("Human genes similar to " + str(query))
+        df1 = table.copy()
+        df2 = pd.read_csv('phytochemicals.csv')
+        m = df1.Word.isin(df2.phyto)
+        df1 = df1[m]
+        df1.rename(columns={'Word': 'Phytochemical'}, inplace=True)
+        df_len = len(df1)
+        # print(len(df1))
+        # df1["Human Gene"] = df1["Human Gene"].str.upper()
+        # print(df1.head(50))
+        # print()
+        # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+        # time.sleep(2)
+        # Create the slider with increments of 5 up to 100
+
+        # Remove the slider and set value_phyto to the minimum of the number of rows in the dataframe and 100
+        value_phyto = min(df1.shape[0], 100)
+
+        # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+        #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+        #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+        #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+        #             unsafe_allow_html=True)
+
+        st.markdown(
+            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_phyto} "
+            f"</span>Phytochemicals contextually and semantically similar to "
+            f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
+            f"Click on the squares to expand them and use the PubMed and Wikipedia links for more phytochemical information</span></p></b>",
+            unsafe_allow_html=True)
+
+        df15 = df1.head(value_phyto).copy()
+
+        df15.index = (1 / df15.index) * 10000
+        sizes = df15.index.tolist()
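+        # NOTE: this assumes the index carried over from `table` is the 1-based
+        # similarity rank, so (1 / rank) * 10000 gives each treemap tile an area
+        # that shrinks as the term becomes less similar; the constant only scales
+        # the relative sizes.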
+        df15.set_index('Phytochemical', inplace=True)
+
+        df8 = df1.copy()
+        # print(df4.head(10))
+        df8["SIMILARITY"] = 'Similarity Score ' + df8.head(value_phyto)["SIMILARITY"].round(2).astype(str)
+        df8.reset_index(inplace=True)
+        # df4 = df4.rename(columns={'Protein': 'symbol2'})
+        # print(df4)
+        # # Use df.query to get a subset of df1 based on ids in df2
+        # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+        # # Use merge to join the two DataFrames on id
+        # result = pd.merge(subset, df2b, on='symbol2')
+        # print(result)
+        if value_phyto <= df_len:
+            # Define the `text` column for labels and `href` column for links
+            # Reset the index
+            df15.reset_index(inplace=True)
+
+            # Replace hyphens with spaces in the 'text' column
+            df15['Phytochemical'] = df15['Phytochemical'].str.replace('-', ' ')
+
+            # Set the 'text' column back as the index
+            df15.set_index('Phytochemical', inplace=True)
+            df15['text'] = df15.index
+            df15['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
+                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df15['text']]
+            df15['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in df15['text']]
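+            # The generated PubMed query pins each link to this corpus: the disease
+            # as a MeSH term ([mh]), excluding reviews (NOT review[pt]), restricted
+            # to English-language records with abstracts published 1990-2022 ([dp]),
+            # with the phytochemical name appended as a free-text term.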
+            assert isinstance(df15, object)
+            df15['database'] = database_name
+
+            # df11['name'] = [c for c in result['Approved name']]
+
+            # Create the treemap using `px.treemap`
+            fig = px.treemap(df15, path=[df15['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                             hover_name=(df8.head(value_phyto)['SIMILARITY']))
+
+            fig.update(layout_coloraxis_showscale=False)
+            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+            fig.update_annotations(visible=False)
+            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                           "<a href='%{customdata[0]}'>PubMed"
+                                           "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+                                           "</span></a>")
+            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightSeaGreen"])
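+            # customdata indices in the texttemplate follow the order passed to
+            # custom_data above: [0] PubMed href, [1] corpus name, [2] Wikipedia
+            # href, [3] display text, so every tile renders its term plus two
+            # clickable links.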
+            # # display the treemap in Streamlit
+            # with treemap2:
+
+            # st.pyplot(fig2)
+            st.plotly_chart(fig, use_container_width=True)
+
+            st.caption("Phytochemical designation and database provided by PhytoHub: https://phytohub.eu/")
+
+            csv = df1.head(value_phyto).to_csv().encode('utf-8')
+            st.download_button(label=f"download top {value_phyto} phytochemicals (csv)", data=csv,
+                               file_name=f'{database_name}_phytochemicals.csv', mime='text/csv')
+
+
+        else:
+            st.warning(
+                f"This selection exceeds the number of similar phytochemicals related to {query} within the {database_name} corpus, please choose a lower number")
+        st.markdown("---")
+
+        # print()
+        # print("Human genes similar to " + str(query))
+        df1 = table.copy()
+        df2 = pd.read_csv('kegg_compounds_lowercase.csv')
+        m = df1.Word.isin(df2.compound)
+        df1 = df1[m]
+        df1.rename(columns={'Word': 'Compounds'}, inplace=True)
+        df_len = len(df1)
+        # df1["Human Gene"] = df1["Human Gene"].str.upper()
+        # print(df1.head(50))
+        # print()
+        # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+        # time.sleep(2)
+        # Create the slider with increments of 5 up to 100
+
+        # Remove the slider and set value_compound to the minimum of the number of rows in the dataframe and 100
+        value_compound = min(df1.shape[0], 100)
+
+        # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+        #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+        #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+        #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+        #             unsafe_allow_html=True)
+
+        st.markdown(
+            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_compound} "
+            f"</span>Compounds contextually and semantically similar to "
+            f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
+            f"Click on the squares to expand them and use the PubMed, Wikipedia, and KEGG links for more compound information (may take time to load)</span></p></b>",
+            unsafe_allow_html=True)
+
+        df12 = df1.head(value_compound).copy()
+
+        df12.index = (1 / df12.index) * 10000
+        sizes = df12.index.tolist()
+
+        df12.set_index('Compounds', inplace=True)
+
+        df5 = df1.copy()
+        # print(df4.head(10))
+        df5["SIMILARITY"] = 'Similarity Score ' + df5.head(value_compound)["SIMILARITY"].round(2).astype(str)
+        df5.reset_index(inplace=True)
+        # df4 = df4.rename(columns={'Protein': 'symbol2'})
+        # print(df4)
+        # # Use df.query to get a subset of df1 based on ids in df2
+        # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+        # # Use merge to join the two DataFrames on id
+        # result = pd.merge(subset, df2b, on='symbol2')
+        # print(result)
+
+        if value_compound <= df_len:
+            # Define the `text` column for labels and `href` column for links
+            # Reset the index
+            df12.reset_index(inplace=True)
+
+            # Replace hyphens with spaces in the 'text' column
+            df12['Compounds'] = df12['Compounds'].str.replace('-', ' ')
+
+            # Set the 'text' column back as the index
+            df12.set_index('Compounds', inplace=True)
+            df12['text'] = df12.index
+            df12['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
+                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df12['text']]
+            df12['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in df12['text']]
+            df12['href3'] = [f'https://www.genome.jp/entry/{compound_id}' for compound_id in
+                             get_compound_ids(df12['text'])]
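+            # NOTE: `get_compound_ids` is assumed to be a helper defined earlier in
+            # app.py that resolves each compound name to its KEGG identifier, which
+            # is what the https://www.genome.jp/entry/ URL expects.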
+            assert isinstance(df12, object)
+            df12['database'] = database_name
+
+            # df11['name'] = [c for c in result['Approved name']]
+
+            # Create the treemap using `px.treemap`
+            fig = px.treemap(df12, path=[df12['text']], values=sizes,
+                             custom_data=['href', 'database', 'href2', 'text', 'href3'],
+                             hover_name=(df5.head(value_compound)['SIMILARITY']))
+
+            fig.update(layout_coloraxis_showscale=False)
+            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+            fig.update_annotations(visible=False)
+            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                           "<a href='%{customdata[0]}'>PubMed"
+                                           "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+                                           "</a><br><br><a href='%{customdata[4]}'>KEGG Compound Page"
+                                           "</span></a>")
+
+            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightYellow"])
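+            # Same customdata layout as the phytochemical treemap, with [4] added
+            # for the KEGG entry URL, so each compound tile carries a third link.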
+            # # display the treemap in Streamlit
+            # with treemap2:
+
+            # st.pyplot(fig2)
+            st.plotly_chart(fig, use_container_width=True)
+
+            st.caption("Compound designation and database provided by KEGG: https://www.kegg.jp/kegg/compound/")
+
+            csv = df1.head(value_compound).to_csv().encode('utf-8')
+            st.download_button(label=f"download top {value_compound} compounds (csv)", data=csv,
+                               file_name=f'{database_name}_compounds.csv', mime='text/csv')
+
+
+        else:
+            st.warning(
+                f"This selection exceeds the number of similar compounds related to {query} within the {database_name} corpus, please choose a lower number")
+        st.markdown("---")
+
+        # import os
+
+        # from datasets import Dataset
+
+        # # Check if the comments directory exists
+        # if os.path.exists('comments'):
+        #     # Load the dataset from disk
+        #     dataset = Dataset.load_from_disk('comments')
+        # else:
+        #     # Create a new dataset
+        #     dataset = Dataset.from_dict({'id': [], 'text': []})
+
+        # def save_comment(comment):
+        #     # Check if the dataset exists
+        #     if os.path.exists('comments'):
+        #         dataset = Dataset.load_from_disk('comments')
+        #     else:
+        #         dataset = Dataset.from_dict({'id': [], 'text': []})
+
+        #     # Append the new comment to the dataset
+        #     new_comment = {'id': len(dataset), 'text': comment}
+        #     dataset = dataset.concatenate(Dataset.from_dict(new_comment))
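+        #     NOTE: `Dataset` has no .concatenate() method, and Dataset.from_dict
+        #     expects list-valued columns. If this feedback code is revived,
+        #     something like
+        #     datasets.concatenate_datasets([dataset, Dataset.from_dict(
+        #         {'id': [len(dataset)], 'text': [comment]})])
+        #     would be needed instead.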
+
+        #     # Save the dataset to disk
+        #     dataset.save_to_disk('comments')
+
+        #     print('Comment saved to dataset.')
+
+        # st.title("Abstractalytics Web App")
+        # st.write("We appreciate your feedback!")
+
+        # user_comment = st.text_area("Please send us your anonymous remarks/suggestions about the Abstractalytics Web App: "
+        #                             "(app will pause while we save your comments)")
+
+        # if st.button("Submit"):
+        #     if user_comment:
+        #         save_comment(user_comment)
+        #         st.success("Your comment has been saved. Thank you for your feedback!")
+        #     else:
+        #         st.warning("Please enter a comment before submitting.")
+
+        # # Load the comments dataset from disk
+        # if os.path.exists('comments'):
+        #     dataset = Dataset.load_from_disk('comments')
+        # else:
+        #     dataset = Dataset.from_dict({'id': [], 'text': []})
+
+        # # Access the text column of the dataset
+        # comments = dataset['text']
+
+        # # Define the password
+        # PASSWORD = 'ram100pass'
+
+        # # Prompt the user for the password
+        # password = st.text_input('Password:', type='password')
+
+        # # Display the comments if the password is correct
+        # if password == PASSWORD:
+        #     st.title('Comments')
+        #     for comment in comments:
+        #         st.write(comment)
+        # else:
+        #     st.warning('Incorrect password')
+
+        st.markdown("---")
+    except:
+        st.warning("")
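+        # NOTE: this bare `except:` suppresses every error raised in the query
+        # pipeline above and shows only an empty warning box; narrowing it to
+        # `except Exception` and surfacing a message would make failures visible.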

st.subheader("Cancer-related videos")
if query: