Spaces:
Sleeping
Sleeping
adapt ToxoCEN-Network here
Browse files
app.py
ADDED
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import datasets
|
5 |
+
import streamlit as st
|
6 |
+
from streamlit_cytoscapejs import st_cytoscapejs
|
7 |
+
import networkx as nx
|
8 |
+
|
9 |
+
# Use Streamlit's wide page layout.
st.set_page_config(layout="wide")

# Seed the input widgets from the URL query string so that a particular view
# of this page can be linked to directly. Every value is kept as a string
# here; the numeric ones are parsed after the user has had a chance to edit
# them.
query_params = st.query_params

# Gene IDs are comma-separated in the URL; use "\n" as the separator instead
# so they show correctly (one per line) in the text area.
input_gene_ids = query_params.get("gene_ids", "CNAG_04365").replace(",", "\n")

coexp_score_threshold = query_params.get("coexp_score_threshold", "0.85")

max_per_gene = query_params.get("max_per_gene", "25")
|
30 |
+
|
31 |
+
# Page header: what the network is, how to cite it, where the code and data
# live, and how to use the gene-ID input below.
st.markdown("""
# CryptoCEN Top50 co-expressed partners
**CryptoCEN** is a co-expression network for *Cryptococcus neoformans* built on 1,524 RNA-seq runs across 34 studies.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes.
To Cite:
O'Meara MJ, Rapala JR, Nichols CB, Alexandre C, Billmyre RB, Steenwyk JL, A Alspaugh JA, O'Meara TR
CryptoCEN: A Co-Expression Network for Cryptococcus neoformans reveals novel proteins involved in DNA damage repair.
PLoS Genet 20(2): e1011158. (2024) https://doi.org/10.1371/journal.pgen.1011158
* Code available at https://github.com/maomlab/CalCEN/tree/master/vignettes/CryptoCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/CryptoCEN
## Plot a network for a set of genes
Put a ``CNAG_#####`` gene_id, one on each row to seed the network
""")
|
45 |
+
|
46 |
+
|
47 |
+
@st.cache_data
def _load_cryptocen_table(name):
    """Load ``<name>.tsv`` from the maomlab/CryptoCEN dataset as a DataFrame.

    Streamlit re-executes this script on every widget interaction, so the
    result is cached with ``st.cache_data`` instead of being re-loaded and
    re-converted to pandas on each rerun.

    Args:
        name: Table name; used both as the split key and as the TSV file stem.

    Returns:
        The table converted to a ``pandas.DataFrame``.
    """
    tables = datasets.load_dataset(
        path="maomlab/CryptoCEN",
        data_files={name: f"{name}.tsv"})
    return tables[name].to_pandas()


# Per-gene annotations; later code reads gene names and descriptions from it.
transcript_annotations = _load_cryptocen_table("transcript_annotations")

# Co-expression hits; later code filters it on gene_id_1 and coexp_score.
top_coexp_hits = _load_cryptocen_table("top_coexp_hits")
|
56 |
+
|
57 |
+
|
58 |
+
# Three side-by-side panels with relative widths 0.3 / 0.2 / 0.5.
col1, col2, col3 = st.columns(spec=[0.3, 0.2, 0.5])

# Left panel: free-text entry of the seed gene IDs, pre-filled with the
# current value of input_gene_ids.
with col1:
    input_gene_ids = st.text_area(
        label="Gene IDs",
        value=str(input_gene_ids),
        height=130,
        help="CNAG Gene ID e.g. CNAG_04365",
    )
|
65 |
+
|
66 |
+
|
67 |
+
# Middle panel: numeric options. Both are entered as text and validated here.
# On invalid input an error is shown and the script run is halted with
# st.stop(), so nothing downstream ever sees an unparsed string or an
# out-of-range number.
with col2:
    coexp_score_threshold = st.text_input(
        label="Co-expression threshold [0-1]",
        value=f"{coexp_score_threshold}",
        help="Default: 0.85")

    try:
        parsed_threshold = float(coexp_score_threshold)
    except ValueError:
        parsed_threshold = None
    # `not (0 <= x <= 1)` is also true for NaN, which `x < 0 or 1 < x` misses.
    if parsed_threshold is None or not (0 <= parsed_threshold <= 1):
        st.error(f"Co-expression threshold should be a number between 0 and 1, instead it is '{coexp_score_threshold}'")
        st.stop()
    coexp_score_threshold = parsed_threshold

    max_per_gene = st.text_input(
        label="Max per gene",
        value=f"{max_per_gene}",
        help="Default: 25")

    try:
        parsed_max_per_gene = int(max_per_gene)
    except ValueError:
        parsed_max_per_gene = None
    if parsed_max_per_gene is None or parsed_max_per_gene <= 0:
        st.error(f"Max per gene should be a number greater than 0, instead it is '{max_per_gene}'")
        st.stop()
    max_per_gene = parsed_max_per_gene
|
91 |
+
|
92 |
+
|
93 |
+
##################################
# Parse and check the user input #
##################################

# One gene ID per line: strip surrounding whitespace and skip blank lines.
# dict.fromkeys drops repeated IDs while keeping first-seen order; a repeated
# seed would otherwise become a duplicate node row, which multiplies edges in
# the later merges and yields duplicate node ids in the rendered graph.
seed_gene_ids = list(dict.fromkeys(
    candidate
    for candidate in (line.strip() for line in input_gene_ids.split("\n"))
    if candidate))
|
104 |
+
|
105 |
+
# Collect, for each seed gene, its hits above the score threshold, keeping at
# most max_per_gene rows per seed in table order.
# NOTE(review): presumably top_coexp_hits is ordered by descending score so
# that "first rows" means "best hits" -- confirm against the dataset.
hits_per_seed = []
for seed_gene_id in seed_gene_ids:
    hits = top_coexp_hits[
        (top_coexp_hits.gene_id_1 == seed_gene_id)
        & (top_coexp_hits.coexp_score > coexp_score_threshold)]
    hits_per_seed.append(hits.head(max_per_gene))

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame with the same columns when no seed gene was entered.
if hits_per_seed:
    neighbors = pd.concat(hits_per_seed)
else:
    neighbors = top_coexp_hits.iloc[0:0]

# A seed can also be a hit of another seed; keep it as a seed only, so that
# every gene appears exactly once in gene_ids. dict.fromkeys de-duplicates
# while keeping a deterministic first-seen order.
seed_gene_id_set = set(seed_gene_ids)
neighbor_gene_ids = [
    hit_gene_id
    for hit_gene_id in dict.fromkeys(neighbors.gene_id_2)
    if hit_gene_id not in seed_gene_id_set]

# gene_ids and gene_types are parallel lists: seeds first, then neighbors.
gene_ids = seed_gene_ids + neighbor_gene_ids
gene_types = ['seed'] * len(seed_gene_ids) + ['neighbor'] * len(neighbor_gene_ids)
|
118 |
+
|
119 |
+
# Look up the annotation of every node (seed or neighbor). These three lists
# are parallel to gene_ids; a gene that cannot be found gets None in each.
cnag_ids = []
gene_names = []
descriptions = []

for gene_id in gene_ids:
    try:
        # NOTE(review): the cnag_id lookup matches on the "cnag_id" column
        # while the name/description lookups match on "gene_id" -- confirm
        # that both columns exist in transcript_annotations and that this
        # asymmetry is intended.
        cnag_id = transcript_annotations.loc[
            transcript_annotations["cnag_id"] == gene_id, "cnag_id"].values[0]
        annotation = transcript_annotations.loc[
            transcript_annotations["gene_id"] == gene_id]
        gene_name = annotation["gene_name"].values[0]
        description = annotation["description"].values[0]
    except (KeyError, IndexError):
        # IndexError: no row matched; KeyError: the column is missing.
        st.error(f"Unable to locate cnag_id for Gene ID: {gene_id}, it should be of the form 'CNAG_#####'")
        cnag_id = None
        gene_name = None
        description = None

    cnag_ids.append(cnag_id)
    gene_names.append(gene_name)
    descriptions.append(description)

# One row per node; gene_index is the integer node key used for the layout.
node_info = pd.DataFrame({
    "gene_index": range(len(gene_ids)),
    "gene_id": gene_ids,
    "gene_type": gene_types,
    "cnag_id": cnag_ids,
    "gene_name": gene_names,
    # The per-gene list, not the loop's last scalar `description`, which
    # would be broadcast to every row.
    "description": descriptions})
|
145 |
+
|
146 |
+
# Attach the node attributes to both endpoints of every edge. The first merge
# joins node_info on gene_id_1; the second joins it again on gene_id_2, and
# the columns that now collide are suffixed "_a" (first endpoint) and "_b"
# (second endpoint), e.g. gene_index_a / gene_index_b.
neighbors = (
    neighbors
    .merge(right=node_info, left_on="gene_id_1", right_on="gene_id")
    .merge(
        right=node_info,
        left_on="gene_id_2",
        right_on="gene_id",
        suffixes=("_a", "_b"))
)
|
156 |
+
|
157 |
+
################################
# Use NetworkX to layout graph #
################################
# CytoscapeJS can probably lay out graphs itself, but it is unclear how to
# request that through the streamlit-cytoscapejs interface, so node positions
# are computed here instead.

# Show the edge table on the page.
st.write(neighbors)

# Undirected graph keyed by gene_index, with the co-expression score stored
# as the edge weight; spring_layout returns a position for each graph node.
G = nx.Graph()
G.add_weighted_edges_from(
    zip(
        neighbors["gene_index_a"],
        neighbors["gene_index_b"],
        neighbors["coexp_score"]))
layout = nx.spring_layout(G)
|
174 |
+
|
175 |
+
|
176 |
+
|
177 |
+
|
178 |
+
# Fill colour per node type.
node_color_lut = {
    "seed": "#4866F0",      # blue
    "neighbor": "#F0C547",  # gold
}

# Build one Cytoscape node element per row of node_info.
elements = []
singleton_index = 0
for node in node_info.itertuples(index=False):
    position = layout.get(node.gene_index)
    if position is not None:
        # Scale the spring-layout coordinates and shift them by 750.
        layout_x = position[0] * 600 + 1500/2
        layout_y = position[1] * 600 + 1500/2
    else:
        # Nodes without any edge are absent from the layout; place them on a
        # grid, eight per row.
        layout_x = (singleton_index % 8) * 150 + 100
        layout_y = np.floor(singleton_index / 8) * 50 + 30
        singleton_index += 1

    # Label with the gene name when there is one, otherwise the gene ID.
    label = node.gene_name if node.gene_name is not None else node.gene_id
    elements.append({
        "data": {
            "id": node.gene_id,
            "label": label,
            "color": node_color_lut[node.gene_type]},
        "position": {
            "x": layout_x,
            "y": layout_y}})
|
205 |
+
|
206 |
+
def _edge_width(coexp_score):
    """Return the line width for an edge; higher scores are drawn thicker."""
    for cutoff, width in ((0.98, 20), (0.93, 15), (0.90, 10), (0.88, 8)):
        if coexp_score > cutoff:
            return width
    return 5


# Append one Cytoscape edge element per row of the neighbors table.
elements.extend(
    {
        "data": {
            "source": source_gene_id,
            "target": target_gene_id,
            "width": _edge_width(score)}}
    for source_gene_id, target_gene_id, score in zip(
        neighbors["gene_id_1"],
        neighbors["gene_id_2"],
        neighbors["coexp_score"]))
|
218 |
+
|
219 |
+
# Right panel: download the edge table (with node annotations) as a
# tab-separated file.
with col3:
    st.text('')  # help alignment with input box
    st.download_button(
        label="Download as TSV",
        data=neighbors.to_csv(sep='\t').encode('utf-8'),
        file_name="CryptoCEN_network.tsv",
        # The payload is tab-separated, so advertise the TSV media type
        # rather than text/csv.
        mime="text/tab-separated-values")
|
226 |
+
|
227 |
+
##########################################################
|
228 |
+
|
229 |
+
# Node appearance: label and fill colour are read from each element's data.
node_style = {
    "width": 140,
    "height": 30,
    "shape": "rectangle",
    "label": "data(label)",
    "labelFontSize": 100,
    "background-color": "data(color)",
    "text-halign": "center",
    "text-valign": "center",
}

# Edge appearance: line width is read from each element's data.
edge_style = {
    "width": "data(width)",
}

stylesheet = [
    {"selector": "node", "style": node_style},
    {"selector": "edge", "style": edge_style},
]

# Render the network below its title.
st.title("ToxoCEN Network")
clicked_elements = st_cytoscapejs(
    elements=elements,
    stylesheet=stylesheet,
    width=1000,
    height=1000,
    key="1",
)
|