Skjafir committed on
Commit
b91d116
·
verified ·
1 Parent(s): 3470c6c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +358 -0
app.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ data.py
3
+ contains classes for storing all the data objects for sieve analysis and basic input/output routines (e.g. to_csv, to_fasta)
4
+
5
+ objects include:
6
+ baseData - unsieved set of sequences and HLAs for simulations
7
+ sieveData - minimum dataset needed for sieve analysis: insert, breakthroughs, HLAs, treatment
8
+ simData - a sieveData object containing a simulation property with metadata about the simulation params,date,epitopes etc.
9
+ resultsData - a sieveData object containing results from potentially many sieve analysis methods
10
+ metaResults - contains results of analysis of many sieve datasets
11
+ '''
12
+
13
+ __all__ = ['sieveData',
14
+ 'sieveDataMethods']
15
+
16
+ import pandas as pd
17
+ from Bio import SeqIO
18
+ from Bio.Seq import Seq
19
+ from Bio.SeqRecord import SeqRecord
20
+ from Bio.Alphabet import Gapped, IUPAC
21
+ from Bio.SubsMat.MatrixInfo import blosum90, ident
22
+ from StringIO import StringIO
23
+
24
class sieveData(object):
    """Plain attribute container for one sieve-analysis dataset.

    Every field defaults to None (or False for the two flags) at class
    level and is expected to be filled in by whatever loads the data.
    The class itself performs no validation or I/O.
    """

    # Source file names.
    masterFn = None
    lookupFn = None
    hlaFn = None
    seqFn = None
    mapFn = None

    # Lists of unique 2 and 4 digit HLA alleles.
    uHLA4 = None
    uHLA2 = None

    # DataFrame of sequences with index ptid and columns: seq, seqID.
    seqDf = None
    regionInds = None

    # DataFrame of HLAs with index ptid and columns for all HLA alleles
    # (2 and 4).
    hlaDf = None
    hlaFreq = None

    # DataFrame with index ptid and columns: vaccinated, infected, hla.
    ptidDf = None

    # Contains position number as index and hxb2Pos and hxb2aa as columns.
    mapDf = None

    studyName = None
    proteinName = None
    insertName = None

    # Sequence strings of the aligned HXB2 and vaccine insert.
    HXB2 = None
    insertSeq = None

    N = None

    # List of ptids in each group to be used for indexing a df.
    vacPtid = None
    plaPtid = None
    vacInd = None
    plaInd = None

    # Class-level fallback kept for backward compatibility (e.g. subclasses
    # that override __init__ without calling it); instances normally get
    # their own dict in __init__ below.
    temp = {}

    # Signifies to the saving methods that the data may be different than
    # other datasets from the same study.
    HLAsubset = False

    # Indicates whether the sequence and other site indexed objects have
    # already been sliced by regionInds.
    isSliced = False

    def __init__(self):
        # A dict defined only on the class is shared by every instance, so
        # anything stored in one dataset's `temp` would show up in all the
        # others. Give each instance its own scratch dict.
        self.temp = {}
71
+
72
class sieveDataMethods(object):
    """Export and bookkeeping helpers that operate on a sieveData container.

    The wrapped container is available as ``self.data``. Several methods
    call ``getMers``, ``isvalidmer`` and ``isvalidHLA``.
    NOTE(review): those three helpers are not defined or imported in the
    visible part of this file; they must be supplied by the module
    namespace for the corresponding methods to work.
    """
    data = None

    def __init__(self, sievedata=None):
        """Wrap ``sievedata``; create an empty sieveData when none is given."""
        if sievedata is None:
            sievedata = sieveData()
        self.data = sievedata

    def isvalidAnalysis(self, proteinName, insertName):
        """Return True if some entry of ``s.validAnalyses`` matches both names.

        NOTE(review): ``s`` is not defined or imported in the visible part of
        this file, so this raises NameError unless the surrounding module
        provides it. The intended source of ``validAnalyses`` should be
        confirmed before changing the reference.
        """
        return any(va['insertName'] == insertName and va['proteinName'] == proteinName
                   for va in s.validAnalyses)

    def to_nexus(self, fn):
        """Write the alignment to ``fn`` in nexus format, using '_' as the id separator."""
        self.to_fasta(fn, fileformat='nexus', sep='_')

    def to_fasta(self, fn=None, fileformat='fasta', withHLA=False, withTreatment=False, sep='|', returnString=False):
        """Write (or return) the insert, HXB2 and all participant sequences.

        Record ids have the form:
            >reference|PROTEIN|INSERT
            >ptid|A1|A2|B1|B2 or >ptid|treatment
            >HXB2

        fn : output path; defaults to '<study>.<protein>.<insert>.fasta'.
        fileformat : format name handed to SeqIO.write.
        withHLA / withTreatment : append the HLA / treatment label to each id.
        sep : separator used inside record ids.
        returnString : if True, nothing is written to ``fn`` and the
            formatted text is returned instead.
        """
        # basestring only exists on Python 2; fall back to str elsewhere.
        try:
            stringTypes = basestring
        except NameError:
            stringTypes = str

        seqRecP = dict(description='')
        seqP = dict(alphabet=Gapped(IUPAC.protein))

        refId = 'reference%s%s%s%s' % (sep, self.data.proteinName, sep, self.data.insertName)
        outList = [SeqRecord(Seq(self.data.insertSeq, **seqP), id=refId, **seqRecP),
                   SeqRecord(Seq(self.data.HXB2, **seqP), id='HXB2', **seqRecP)]

        tmp = self.data.seqDf.join(self.data.ptidDf)
        for ptid, row in tmp.iterrows():
            treatment = 'vaccine' if row['vaccinated'] else 'placebo'
            idStr = ptid
            if withTreatment:
                idStr += '%s%s' % (sep, treatment)
            # NOTE(review): when 'hla' is a string, sep.join() iterates over
            # its characters. Behaviour is kept as-is; confirm the intended
            # type of the 'hla' column.
            if withHLA and 'hla' in row.index and isinstance(row['hla'], stringTypes):
                idStr += '%s%s' % (sep, sep.join(row['hla']))
            outList.append(SeqRecord(Seq(row['seq'], **seqP), id=idStr, **seqRecP))

        if returnString:
            buf = StringIO()
            SeqIO.write(outList, buf, fileformat)
            return buf.getvalue()

        if fn is None:
            fn = '%s.%s.%s.fasta' % (self.data.studyName, self.data.proteinName, self.data.insertName)
        SeqIO.write(outList, fn, fileformat)

    def to_treatment_csv(self, fn=None, sep='|', returnString=False):
        """Write (or return) a two-column CSV: ptid, treatment.

        One row is produced per row of seqDf; treatment is 'vaccine' when the
        joined ``vaccinated`` value is truthy and 'placebo' otherwise.

        fn : output path; defaults to '<study>.<protein>.<insert>.trt.csv'.
        sep : currently unused; kept for interface compatibility.
        returnString : if True, return the CSV text instead of writing ``fn``.
        """
        tmpDf = self.data.seqDf.join(self.data.ptidDf[['vaccinated']], how='left')
        tmpDf['treatment'] = tmpDf.vaccinated.map(lambda v: 'vaccine' if v else 'placebo')
        tmpDf = tmpDf.reset_index()
        # An unnamed index comes out of reset_index() as a column called
        # 'index'. rename(columns=...) relabels it; rename_axis() with a dict
        # mapper is rejected by current pandas.
        tmpDf = tmpDf.rename(columns={'index': 'ptid'})
        tmpDf = tmpDf[['ptid', 'treatment']]

        if returnString:
            buf = StringIO()
            tmpDf.to_csv(buf, index=False)
            return buf.getvalue()

        if fn is None:
            fn = '%s.%s.%s.trt.csv' % (self.data.studyName, self.data.proteinName, self.data.insertName)
        tmpDf.to_csv(fn, index=False)

    def _uniqueMers(self, nmers):
        """Return the sorted unique k-mers of all ungapped sequences plus the insert."""
        allMers = []
        for seq in self.data.seqDf.seq:
            allMers.extend(getMers(seq.replace('-', ''), nmers=nmers))
        allMers.extend(getMers(self.data.insertSeq.replace('-', ''), nmers=nmers))
        return sorted(set(allMers))

    def to_mers(self, mersFn=None, nmers=None, returnList=False):
        """Write (or return) the unique valid k-mers of the dataset.

        mersFn : output path, one mer per line (used when returnList is False).
        nmers : list of mer lengths passed to getMers; defaults to [9].
        returnList : if True, return the valid mers as a list instead.
        """
        if nmers is None:
            # Fresh list per call instead of a shared mutable default.
            nmers = [9]
        validMers = [m for m in self._uniqueMers(nmers) if isvalidmer(m)]
        if returnList:
            return validMers
        with open(mersFn, 'w') as fh:
            for m in validMers:
                fh.write('%s\n' % m)

    def to_hla(self, hlaFn=None, returnList=False):
        """Write (or return) the valid 4-digit HLA alleles with '_' replaced by '*'.

        hlaFn : output path, one allele per line (used when returnList is False).
        returnList : if True, return the converted alleles as a list instead.
        """
        converted = [h.replace('_', '*') for h in self.data.uHLA4 if isvalidHLA(h)]
        if returnList:
            return converted
        with open(hlaFn, 'w') as fh:
            for h in converted:
                fh.write('%s\n' % h)

    def checkBA(self, ba):
        """Report how many (HLA, 9-mer) predictions in ``ba`` are nan.

        Every valid unique 9-mer of seqDf and insertSeq is paired with every
        valid allele in uHLA4 and looked up as ``ba[(hla, mer)]``; a summary
        line is printed. A pair that is absent from ``ba`` raises whatever
        the lookup raises (KeyError for a plain dict).
        """
        # numpy is not imported at file level; pandas depends on it.
        import numpy as np

        tot = 0
        nantot = 0
        uMers = self._uniqueMers([9])
        validHLAs = [h for h in self.data.uHLA4 if isvalidHLA(h)]
        for m in uMers:
            if not isvalidmer(m):
                continue
            for h in validHLAs:
                tot += 1
                if np.isnan(ba[(h, m)]):
                    nantot += 1
        # Avoid ZeroDivisionError when there is nothing to check.
        pctMissing = 1e2 * nantot / tot if tot else 0.0
        print('Found nan for %d of %d total predictions (%d HLAs, %d mers, %2.0f%% missing)' % (nantot, tot, len(self.data.uHLA4), len(uMers), pctMissing))

    def computeDerivedData(self):
        """Populate the derived fields of ``self.data`` from seqDf/ptidDf/mapDf.

        Sets N, vacPtid, plaPtid, vacInd, plaInd; re-aligns ptidDf and seqDf
        to the rows of seqDf; if regionInds is set and the data have not been
        sliced yet, slices every sequence, insertSeq, HXB2 and mapDf to that
        region and sets isSliced; finally builds hxb22site, a copy of mapDf
        indexed by hxb2Pos with a 'site' column holding the mapDf index.

        Raises ValueError if a sequence has no ``vaccinated`` value after the
        join with ptidDf.
        """
        # numpy is not imported at file level; pandas depends on it.
        import numpy as np

        self.data.N = self.data.seqDf.shape[0]

        # First join ptidDf and seqDf so that plaInd is always a valid
        # boolean index on seqDf.
        df = self.data.seqDf.join(self.data.ptidDf)
        if df.vaccinated.isnull().any():
            missing = list(df.index[df.vaccinated.isnull().values])
            raise ValueError('No treatment assignment for ptids: %s' % missing)

        # Cast to bool once and invert the mask, so 0/1 integer columns are
        # not hit by a bitwise NOT. vacInd/plaInd are ndarrays, not pd.Series.
        vacInd = df.vaccinated.values.astype(bool)
        plaInd = ~vacInd
        self.data.vacPtid = df.index[vacInd]
        self.data.plaPtid = df.index[plaInd]
        self.data.vacInd = vacInd
        self.data.plaInd = plaInd

        # Explicit copies so the column assignment below modifies these
        # frames rather than a view of the joined frame.
        self.data.ptidDf = df[self.data.ptidDf.columns].copy()
        self.data.seqDf = df[self.data.seqDf.columns].copy()

        # Select region of protein based on regionInds.
        if self.data.regionInds is not None and not self.data.isSliced:
            rInds = self.data.regionInds
            indArr = np.asarray(rInds)

            def sliceStr(text):
                # Fancy-index the characters (integer positions or a boolean
                # mask both work) and glue them back into a string.
                return ''.join(np.asarray(list(text))[indArr])

            # Slice seqDf, insertSeq, mapDf, HXB2.
            self.data.seqDf['seq'] = [sliceStr(seq) for seq in self.data.seqDf['seq']]
            self.data.insertSeq = sliceStr(self.data.insertSeq)

            # .loc replaces the removed .ix; reset_index renumbers the
            # selected rows 0..n-1.
            self.data.mapDf = self.data.mapDf.loc[rInds].reset_index(drop=True)

            self.data.HXB2 = sliceStr(self.data.HXB2)
            self.data.isSliced = True

        # Create df for looking up a site num from HXB2 coordinate.
        hxb22site = self.data.mapDf.copy()
        hxb22site['site'] = hxb22site.index
        self.data.hxb22site = hxb22site.set_index('hxb2Pos')
217
+ '''
218
+ TODO: move plotting code to a different file
219
+ def clipXVec(self,hxb2Range = None,vec=None,returnInds=False):
220
+ """Clip seq-axis vector based on an HXB2 coordinate range (eg [70,80])"""
221
+ if hxb2Range is None:
222
+ siteRange = [self.data.mapDf.index[0],self.data.mapDf.index[-1]+1]
223
+ else:
224
+ hxb2Range = [str(c) for c in hxb2Range]
225
+ siteRange = [self.data.mapDf.index[self.data.mapDf.hxb2Pos == hxb2Range[0]],self.data.mapDf.index[self.data.mapDf.hxb2Pos==hxb2Range[1]]+1]
226
+ if returnInds:
227
+ return arange(siteRange[0],siteRange[1])
228
+ else:
229
+ return vec[siteRange[0]:siteRange[1]]
230
+ def plotSeqSpace(self,hxb2Range=None,subst=None,method='tsne',interactive=False,force=False,**kwargs):
231
+ """Plot MDS of sequence space using a substitution matrix. If interactive then returns AnnotationPicker obj"""
232
+ if subst is None:
233
+ subst=blosum90
234
+ seqs=[self.clipXVec(hxb2Range,s) for s in self.data.seqDf.seq]
235
+ df=self.data.ptidDf.join(self.data.seqDf,how='right')
236
+ """uInd has length len(seqs) but indexes into uSeqs"""
237
+ uSeqs,uInd=unique(seqs,return_inverse=True)
238
+
239
+ group=[]
240
+ for uniqi,s in enumerate(uSeqs):
241
+ tmp=df.vaccinated[uInd==uniqi].unique()
242
+ if len(tmp)==2:
243
+ group.append('both')
244
+ else:
245
+ group.append(tmp[0])
246
+ insertSeq=self.clipXVec(hxb2Range,self.data.insertSeq)
247
+ uSeqs=append(uSeqs,insertSeq)
248
+ group.append('insert')
249
+
250
+ recalc=True
251
+ """Recalc if seqMethod doesn't exist or if its different than current method"""
252
+ try:
253
+ if method==self.data.temp['seqMethod']:
254
+ dist=self.data.temp['seqDist']
255
+ xy=self.data.temp['seqXY']
256
+ if xy.shape[0]==len(uSeqs):
257
+ recalc=False
258
+ except:
259
+ pass
260
+
261
+ if recalc or force:
262
+ dist=calcDistanceMatrix(uSeqs,distanceFunc=lambda s1,s2: seq_distance(s1,s2,subst=subst))
263
+ xy=embedDistanceMatrix(dist,method=method)
264
+ self.data.temp['seqDist']=dist
265
+ self.data.temp['seqXY']=xy
266
+ self.data.temp['seqMethod']=method
267
+
268
+ freq=objhist(seqs,keys=uSeqs)
269
+ """Make sure the insert has a count of at least 1"""
270
+ if freq[insertSeq]==0:
271
+ freq[insertSeq]=1
272
+
273
+ if all([f==1 for f in freq.values()]):
274
+ freqVec=[30]*len(freq)
275
+ labels=uSeqs
276
+ else:
277
+ freqVec=scatternorm(array([freq[s] for s in uSeqs]),30,200)
278
+ labels=['%s: %d' % (s,freq[s]) for s in uSeqs]
279
+
280
+ if interactive:
281
+ picker=3
282
+ else:
283
+ picker=None
284
+
285
+ clf()
286
+ scatter(xy[:,0],xy[:,1],s=freqVec,c=[{'insert':'gold','both':'gray',True:'blue',False:'red'}[g] for g in group],picker=picker,**kwargs)
287
+ xticks(())
288
+ yticks(())
289
+ if hxb2Range is None:
290
+ hxb2Range=(self.data.hxb22site.index[0],self.data.hxb22site.index[-1])
291
+ title('MDS Embedding of Sequence space for %s (HXB2 %s-%s)' % (insertSeq,hxb2Range[0],hxb2Range[1]))
292
+ if interactive:
293
+ mp=AnnotationPicker(xy[:, 0], xy[:, 1], labels,weight='bold',color='black',size='x-small')
294
+ return mp
295
+
296
+ def plotHLASpace(self,hxb2Range=None,hlaList=None,ba=None,method='tsne',interactive=False,**kwargs):
297
+ """
298
+ Plot an MDS embedding of HLA space
299
+ Original features were nHLAs x nMers
300
+ """
301
+ seqs=[self.clipXVec(hxb2Range,s) for s in self.data.seqDf.seq]
302
+ df=self.data.ptidDf.join(self.data.seqDf,how='right')
303
+ """uInd has length len(seqs) but indexes into uSeqs"""
304
+ uSeqs,uInd=unique(seqs,return_inverse=True)
305
+
306
+ group=[]
307
+ for uniqi,s in enumerate(uSeqs):
308
+ tmp=df.vaccinated[uInd==uniqi].unique()
309
+ if len(tmp)==2:
310
+ group.append('both')
311
+ else:
312
+ group.append(tmp[0])
313
+ insertSeq=self.clipXVec(hxb2Range,self.data.insertSeq)
314
+ uSeqs=append(uSeqs,insertSeq)
315
+ group.append('insert')
316
+
317
+ mers=getMers(insertSeq,nmers=[9])
318
+ dist=empty((len(uSeqs),len(mers)*len(hlaList)))
319
+ for si,s in enumerate(uSeqs):
320
+ for meri, mer in enumerate(getMers(s,nmers=[9])):
321
+ for hlai,h in enumerate(hlaList):
322
+ pred=ba[(h,mer)]
323
+ if isnan(pred):
324
+ pred=15
325
+ dist[si,int(meri*len(hlaList)+hlai)]=pred
326
+
327
+ xy=embedDistanceMatrix(dist,method=method)
328
+ freq=objhist(seqs,keys=uSeqs)
329
+ """Make sure the insert has a count of at least 1"""
330
+ if freq[insertSeq]==0:
331
+ freq[insertSeq]=1
332
+
333
+ if all([f==1 for f in freq.values()]):
334
+ freqVec=[30]*len(freq)
335
+ labels=uSeqs
336
+ else:
337
+ freqVec=scatternorm(array([freq[s] for s in uSeqs]),30,200)
338
+ labels=['%s: %d' % (s,freq[s]) for s in uSeqs]
339
+
340
+ if interactive:
341
+ picker=3
342
+ else:
343
+ picker=None
344
+
345
+ clf()
346
+ scatter(xy[:,0],xy[:,1],s=freqVec,c=[{'insert':'gold','both':'gray',True:'blue',False:'red'}[g] for g in group],picker=picker,**kwargs)
347
+ xticks(())
348
+ yticks(())
349
+ if hxb2Range is None:
350
+ hxb2Range=(self.data.hxb22site.index[0],self.data.hxb22site.index[-1])
351
+ title('MDS Embedding of HLA binding space for %s (HXB2 %s-%s)' % (insertSeq,hxb2Range[0],hxb2Range[1]))
352
+ if interactive:
353
+ mp=AnnotationPicker(xy[:, 0], xy[:, 1], labels,weight='bold',color='black',size='x-small')
354
+ return mp
355
+ def plotConservation(self,region=None):
356
+ """Plot entropy/conservation site-wise for vaccine and placebo breakthrough sequences"""
357
+ plotSeqEntropy(self.data.seqDf.seq,region=region)
358
+ '''