maom committed on
Commit
7270996
·
verified ·
1 Parent(s): 7783410

adapt ToxoCEN-Network here

Browse files
Files changed (1) hide show
  1. app.py +251 -0
app.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import numpy as np
3
+ import pandas as pd
4
+ import datasets
5
+ import streamlit as st
6
+ from streamlit_cytoscapejs import st_cytoscapejs
7
+ import networkx as nx
8
+
st.set_page_config(layout='wide')

# Defaults for the input widgets are read from the URL query string so
# that it is possible to link directly to a pre-filled page.
query_params = st.query_params

# In the URL the gene ids are comma separated; use "\n" as the separator
# instead so they show correctly in the text area.
input_gene_ids = (
    query_params["gene_ids"] if "gene_ids" in query_params else "CNAG_04365"
).replace(",", "\n")

# Both numeric settings are kept as strings here; they are only used as
# the initial text of the corresponding input boxes.
coexp_score_threshold = (
    query_params["coexp_score_threshold"]
    if "coexp_score_threshold" in query_params
    else "0.85")

max_per_gene = (
    query_params["max_per_gene"] if "max_per_gene" in query_params else "25")
30
+
# Page header: what CryptoCEN is, how to cite it, where the code and data
# live, and the instructions for the gene id input below.
st.markdown("""
# CryptoCEN Top50 co-expressed partners
**CryptoCEN** is a co-expression network for *Cryptococcus neoformans* built on 1,524 RNA-seq runs across 34 studies.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes.
To Cite:
O'Meara MJ, Rapala JR, Nichols CB, Alexandre C, Billmyre RB, Steenwyk JL, A Alspaugh JA, O'Meara TR
CryptoCEN: A Co-Expression Network for Cryptococcus neoformans reveals novel proteins involved in DNA damage repair.
PLoS Genet 20(2): e1011158. (2024) https://doi.org/10.1371/journal.pgen.1011158
* Code available at https://github.com/maomlab/CalCEN/tree/master/vignettes/CryptoCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/CryptoCEN
## Plot a network for a set of genes
Put a ``CNAG_#####`` gene_id, one on each row to seed the network
""")
45
+
46
+
@st.cache_data
def _load_cryptocen_table(table_name):
    """Load ``<table_name>.tsv`` from the maomlab/CryptoCEN dataset as a DataFrame.

    Streamlit re-executes the whole script on every widget interaction, so
    the result is cached with ``st.cache_data`` to avoid re-loading the
    table on each rerun.
    """
    dataset = datasets.load_dataset(
        path = "maomlab/CryptoCEN",
        data_files = {table_name: f"{table_name}.tsv"})
    return dataset[table_name].to_pandas()


# per-gene annotations, used below to look up names and descriptions
transcript_annotations = _load_cryptocen_table("transcript_annotations")

# co-expression hits, used below via gene_id_1, gene_id_2 and coexp_score
top_coexp_hits = _load_cryptocen_table("top_coexp_hits")
56
+
57
+
# Input widgets: gene ids on the left, numeric settings in the middle;
# the right-hand column is used further down in the script.
col1, col2, col3 = st.columns(spec = [0.3, 0.2, 0.5])
with col1:
    input_gene_ids = st.text_area(
        label = "Gene IDs",
        value = f"{input_gene_ids}",
        height = 130,
        help = "CNAG Gene ID e.g. CNAG_04365")


with col2:
    coexp_score_threshold = st.text_input(
        label = "Co-expression threshold [0-1]",
        value = f"{coexp_score_threshold}",
        help = "Default: 0.85")

    # Halt the run on invalid input: continuing would compare the raw text
    # against numbers below and fail with a TypeError.
    try:
        coexp_score_threshold = float(coexp_score_threshold)
    except ValueError:
        st.error(f"Co-expression threshold should be a number between 0 and 1, instead it is '{coexp_score_threshold}'")
        st.stop()
    # written as a chained comparison so that NaN is rejected as well
    if not 0 <= coexp_score_threshold <= 1:
        st.error(f"Co-expression threshold should be a number between 0 and 1, instead it is '{coexp_score_threshold}'")
        st.stop()

    max_per_gene = st.text_input(
        label = "Max per gene",
        value = f"{max_per_gene}",
        help = "Default: 25")

    try:
        max_per_gene = int(max_per_gene)
    except ValueError:
        st.error(f"Max per gene should be a number greater than 0, instead it is '{max_per_gene}'")
        st.stop()
    if max_per_gene <= 0:
        st.error(f"Max per gene should be a number greater than 0, instead it is '{max_per_gene}'")
        st.stop()
91
+
92
+
##################################
# Parse and check the user input #
##################################

# One gene id per row; blank rows are skipped and repeated ids are kept
# only once (first occurrence wins) so each seed becomes a single node.
seed_gene_ids = list(dict.fromkeys(
    gene_id
    for gene_id in (row.strip() for row in input_gene_ids.split("\n"))
    if gene_id))

if not seed_gene_ids:
    # pd.concat() raises on an empty list, so stop with a message instead
    st.error("Please enter at least one Gene ID to seed the network")
    st.stop()

# For every seed keep the first `max_per_gene` hits scoring above the threshold.
# NOTE(review): this takes the rows in table order, so it presumably relies on
# top_coexp_hits being sorted by decreasing score within each gene — confirm.
neighbors = pd.concat([
    top_coexp_hits[
        (top_coexp_hits.gene_id_1 == seed_gene_id)
        & (top_coexp_hits.coexp_score > coexp_score_threshold)
    ].head(max_per_gene)
    for seed_gene_id in seed_gene_ids])

# A hit that is itself a seed must not be listed a second time as a
# neighbor: duplicate gene ids would create duplicate nodes and would
# multiply the edges when the node table is merged onto the hits.
# dict.fromkeys() (unlike set()) keeps a stable, first-seen order.
seed_gene_id_set = set(seed_gene_ids)
neighbor_gene_ids = [
    gene_id
    for gene_id in dict.fromkeys(neighbors.gene_id_2)
    if gene_id not in seed_gene_id_set]
gene_ids = seed_gene_ids + neighbor_gene_ids
gene_types = ['seed'] * len(seed_gene_ids) + ['neighbor'] * len(neighbor_gene_ids)
118
+
# Look up the annotation of every node (seeds first, then neighbors).
cnag_ids = []
gene_names = []
descriptions = []

for gene_id in gene_ids:
    try:
        # NOTE(review): the cnag_id lookup matches on the "cnag_id" column
        # while the name/description lookups match on "gene_id" — confirm
        # that both columns exist and that this difference is intended.
        cnag_id = transcript_annotations.loc[transcript_annotations["cnag_id"] == gene_id]["cnag_id"].values[0]
        annotation = transcript_annotations.loc[transcript_annotations["gene_id"] == gene_id]
        gene_name = annotation["gene_name"].values[0]
        description = annotation["description"].values[0]
    except (KeyError, IndexError):
        # IndexError: no annotation row matched; KeyError: column missing.
        # The gene is still kept as a node, just without annotation.
        st.error(f"Unable to locate cnag_id for Gene ID: {gene_id}, it should be of the form 'CNAG_#####'")
        cnag_id = None
        gene_name = None
        description = None

    cnag_ids.append(cnag_id)
    gene_names.append(gene_name)
    descriptions.append(description)

# One row per node; gene_index is the node's position in gene_ids and is
# used as the node key when laying out the graph.
node_info = pd.DataFrame({
    "gene_index": range(len(gene_ids)),
    "gene_id" : gene_ids,
    "gene_type" : gene_types,
    "cnag_id": cnag_ids,
    "gene_name": gene_names,
    # the per-node list, not the scalar left over from the last iteration
    "description": descriptions})
145
+
# Attach the node table to each edge twice: once for the gene_id_1 end and
# once for the gene_id_2 end. After the first merge the node columns are
# still unsuffixed, so in the second merge they clash with the incoming
# ones and become "<column>_a" (gene_id_1 end) and "<column>_b"
# (gene_id_2 end).
neighbors = pd.merge(
    neighbors,
    node_info,
    left_on = "gene_id_1",
    right_on = "gene_id")

neighbors = pd.merge(
    neighbors,
    node_info,
    left_on = "gene_id_2",
    right_on = "gene_id",
    suffixes = ("_a", "_b"))
156
+
################################
# Use NetworkX to layout graph #
################################
# note I think CytoscapeJS can layout graphs
# but I'm unsure how to do it through the streamlit-cytoscapejs interface :(

# show the annotated edge table on the page
st.write(neighbors)


# Nodes are keyed by gene_index; every edge carries its co-expression
# score as the "weight" attribute, which spring_layout takes into account.
G = nx.Graph()
G.add_weighted_edges_from(
    zip(
        neighbors["gene_index_a"],
        neighbors["gene_index_b"],
        neighbors["coexp_score"]))
layout = nx.spring_layout(G)
174
+
175
+
176
+
177
+
node_color_lut = {
    "seed" : "#4866F0", # blue
    "neighbor" : "#F0C547" # gold
}

# (score cutoff, edge width) pairs, checked from the strongest cutoff down;
# an edge gets the width of the first cutoff its score exceeds.
edge_width_lut = [(0.98, 20), (0.93, 15), (0.90, 10), (0.88, 8)]
default_edge_width = 5

# Build the CytoscapeJS element list: first one entry per node, then one per edge.
elements = []
singleton_index = 0
for gene_index, gene_id, gene_type, gene_name in zip(
        node_info["gene_index"],
        node_info["gene_id"],
        node_info["gene_type"],
        node_info["gene_name"]):

    if gene_index in layout:
        # scale and shift the spring-layout coordinates for the canvas
        layout_x = layout[gene_index][0] * 600 + 1500/2
        layout_y = layout[gene_index][1] * 600 + 1500/2
    else:
        # nodes without any edge are not in the layout; place them on a
        # grid, 8 per row
        grid_row, grid_column = divmod(singleton_index, 8)
        layout_x = grid_column * 150 + 100
        layout_y = grid_row * 50 + 30
        singleton_index += 1

    elements.append({
        "data": {
            "id": gene_id,
            # fall back to the gene id when the name is missing (None or NaN)
            "label": gene_id if pd.isna(gene_name) else gene_name,
            "color": node_color_lut[gene_type]},
        "position": {
            "x" : float(layout_x),
            "y" : float(layout_y)}})

for source_gene_id, target_gene_id, coexp_score in zip(
        neighbors["gene_id_1"],
        neighbors["gene_id_2"],
        neighbors["coexp_score"]):
    elements.append({
        "data" : {
            "source" : source_gene_id,
            "target" : target_gene_id,
            "width" : next(
                (width for cutoff, width in edge_width_lut if coexp_score > cutoff),
                default_edge_width)}})
218
+
# Offer the annotated edge table as a tab-separated download.
with col3:
    st.text('') # help alignment with input box
    st.download_button(
        label = "Download as TSV",
        data = neighbors.to_csv(sep = '\t').encode('utf-8'),
        file_name = "CryptoCEN_network.tsv",
        # the payload is tab separated, so advertise it as such
        mime = "text/tab-separated-values")
226
+
227
+ ##########################################################
228
+
# Cytoscape stylesheet: nodes are labelled rectangles coloured by node type
# (the "color" data field); edge thickness comes from the "width" data field.
stylesheet = [
    {"selector": "node", "style": {
        "width": 140,
        "height": 30,
        "shape": "rectangle",
        "label" : "data(label)",
        # NOTE(review): "labelFontSize" does not look like a Cytoscape.js
        # style property (the documented one is "font-size") — confirm
        # whether this entry has any effect.
        "labelFontSize": 100,
        'background-color': 'data(color)',
        "text-halign": "center",
        "text-valign": "center",
    }},
    {"selector": "edge", "style": {
        "width": "data(width)"
    }}
]

# This app shows the CryptoCEN network, so the title names it accordingly.
st.title("CryptoCEN Network")
clicked_elements = st_cytoscapejs(
    elements = elements,
    stylesheet = stylesheet,
    width = 1000,
    height= 1000,
    key = "1")