Dubai / app.py
Skjafir's picture
Create app.py
b91d116 verified
'''
data.py
contains classes for storing all the data objects for sieve analysis and basic input/output routines (e.g. to_csv, to_fasta)
objects include:
baseData - unsieved set of sequences and HLAs for simulations
sieveData - minimum dataset needed for sieve analysis: insert, breakthroughs, HLAs, treatment
simData - a sieveData object containing a simulation property with metadata about the simulation params,date,epitopes etc.
resultsData - a sieveData object containing results from potentially many sieve analysis methods
metaResults - contains results of analysis of many sieve datasets
'''
# Public names exported by `from <module> import *`.
__all__ = ['sieveData', 'sieveDataMethods']
from StringIO import StringIO

import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Alphabet import Gapped, IUPAC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SubsMat.MatrixInfo import blosum90, ident
class sieveData(object):
    """Plain attribute container for one sieve-analysis dataset.

    Every field has a class-level default (None, False or an empty dict) and
    is meant to be assigned by the code that loads or derives the data.
    """

    masterFn = None
    lookupFn = None
    hlaFn = None
    seqFn = None
    mapFn = None

    # Lists of unique 2 and 4 digit HLA alleles
    uHLA4 = None
    uHLA2 = None

    # DataFrame of sequences with index ptid and columns: seq, seqID
    seqDf = None
    regionInds = None

    # DataFrame of HLAs with index ptid and columns for all HLA alleles (2 and 4)
    hlaDf = None
    hlaFreq = None

    # DataFrame with index ptid and columns: vaccinated, infected, hla
    ptidDf = None

    # Contains position number as index and hxb2Pos and hxb2aa as columns
    mapDf = None

    # Copy of mapDf re-indexed by hxb2Pos with an added 'site' column;
    # assigned by sieveDataMethods.computeDerivedData()
    hxb22site = None

    studyName = None
    proteinName = None
    insertName = None

    # Sequence strings of the aligned HXB2 and vaccine insert
    HXB2 = None
    insertSeq = None
    N = None

    # List of ptids in each group to be used for indexing a df
    vacPtid = None
    plaPtid = None
    vacInd = None
    plaInd = None

    # Scratch storage. This class-level dict is only a fallback for objects
    # created without running __init__; __init__ replaces it per instance.
    temp = {}

    # Signifies to the saving methods that the data may be different than
    # other datasets from the same study
    HLAsubset = False

    # Indicates whether the sequence and other site indexed objects have
    # already been sliced by regionInds
    isSliced = False

    def __init__(self):
        """Create an empty dataset with its own private ``temp`` dict."""
        # A mutable class attribute is shared by all instances, so anything
        # stored in temp by one dataset would otherwise leak into every other.
        self.temp = {}
class sieveDataMethods(object):
    """Export and bookkeeping routines operating on a wrapped sieveData object.

    The dataset is stored on ``self.data``; every method reads from (and
    computeDerivedData() writes to) that object.
    """

    data = None

    def __init__(self, sievedata=None):
        """Wrap ``sievedata``, or a new empty sieveData() when none is given."""
        if sievedata is None:
            sievedata = sieveData()
        self.data = sievedata

    def isvalidAnalysis(self, proteinName, insertName):
        """Return True if an entry of ``s.validAnalyses`` matches both names."""
        # NOTE(review): `s` is not defined or imported anywhere in this file,
        # so this raises NameError unless a module-level `s` is provided
        # elsewhere - confirm where validAnalyses is meant to come from.
        return any(va['insertName'] == insertName and va['proteinName'] == proteinName
                   for va in s.validAnalyses)

    def to_nexus(self, fn):
        """Write the alignment to ``fn`` in nexus format, using '_' as id separator."""
        self.to_fasta(fn, fileformat='nexus', sep='_')

    def to_fasta(self, fn=None, fileformat='fasta', withHLA=False, withTreatment=False, sep='|', returnString=False):
        """Write the insert, HXB2 and every sequence in seqDf using Bio.SeqIO.

        Records are emitted in this order, with these ids:
            >reference|PROTEIN|INSERT
            >HXB2
            >ptid            (one per row of seqDf, optionally followed by
                              |treatment and/or the HLA fields)

        Parameters
        ----------
        fn : str or handle, optional
            Output destination passed to SeqIO.write. Defaults to
            '<studyName>.<proteinName>.<insertName>.fasta' (the extension is
            '.fasta' regardless of fileformat). Ignored if returnString.
        fileformat : str
            Format name passed to SeqIO.write.
        withHLA : bool
            Append the row's 'hla' value to the record id (see note below).
        withTreatment : bool
            Append 'vaccine' or 'placebo' to the record id.
        sep : str
            Separator placed between the fields of a record id.
        returnString : bool
            If True, return the formatted text instead of writing to fn.

        Returns
        -------
        str or None
            The formatted records when returnString is True, otherwise None.
        """
        seqRecP = dict(description='')
        seqP = dict(alphabet=Gapped(IUPAC.protein))

        refId = 'reference%s%s%s%s' % (sep, self.data.proteinName, sep, self.data.insertName)
        outList = [SeqRecord(Seq(self.data.insertSeq, **seqP), id=refId, **seqRecP),
                   SeqRecord(Seq(self.data.HXB2, **seqP), id='HXB2', **seqRecP)]

        tmp = self.data.seqDf.join(self.data.ptidDf)
        for ptid, row in tmp.iterrows():
            idStr = ptid
            if withTreatment:
                treatment = 'vaccine' if row['vaccinated'] else 'placebo'
                # %-formatting (rather than +=) also works for non-string ptids
                idStr = '%s%s%s' % (idStr, sep, treatment)
            if withHLA and 'hla' in row.index and isinstance(row['hla'], basestring):
                # NOTE(review): row['hla'] is a string here, so sep.join()
                # interleaves sep between its individual characters. That is
                # probably not the intended ">ptid|A1|A2|B1|B2" layout -
                # confirm the format of ptidDf.hla before changing this.
                idStr = '%s%s%s' % (idStr, sep, sep.join(row['hla']))
            outList.append(SeqRecord(Seq(row['seq'], **seqP), id=idStr, **seqRecP))

        if returnString:
            buf = StringIO()
            SeqIO.write(outList, buf, fileformat)
            return buf.getvalue()

        if fn is None:
            fn = '%s.%s.%s.fasta' % (self.data.studyName, self.data.proteinName, self.data.insertName)
        SeqIO.write(outList, fn, fileformat)

    def to_treatment_csv(self, fn=None, sep='|', returnString=False):
        """Write a two-column CSV (ptid, treatment) for every row of seqDf.

        Parameters
        ----------
        fn : str or handle, optional
            Destination passed to DataFrame.to_csv. Defaults to
            '<studyName>.<proteinName>.<insertName>.trt.csv'. Ignored if
            returnString.
        sep : str
            Unused by the current implementation; kept for interface
            compatibility.
        returnString : bool
            If True, return the CSV text instead of writing to fn.

        Returns
        -------
        str or None
            The CSV text when returnString is True, otherwise None.
        """
        tmpDf = self.data.seqDf.join(self.data.ptidDf[['vaccinated']], how='left')
        tmpDf['treatment'] = tmpDf.vaccinated.map(lambda v: 'vaccine' if v else 'placebo')
        # reset_index() names the column of an unnamed index 'index'.
        # rename(columns=...) relabels it; passing a mapping to rename_axis()
        # is no longer supported by pandas.
        tmpDf = tmpDf.reset_index().rename(columns={'index': 'ptid'})
        tmpDf = tmpDf[['ptid', 'treatment']]

        if returnString:
            buf = StringIO()
            tmpDf.to_csv(buf, index=False)
            return buf.getvalue()

        if fn is None:
            fn = '%s.%s.%s.trt.csv' % (self.data.studyName, self.data.proteinName, self.data.insertName)
        tmpDf.to_csv(fn, index=False)

    def _uniqueMers(self, nmers):
        """Return the sorted unique mers of all seqDf sequences and the insert, gaps removed."""
        allMers = []
        for seq in self.data.seqDf.seq:
            allMers += getMers(seq.replace('-', ''), nmers=nmers)
        allMers += getMers(self.data.insertSeq.replace('-', ''), nmers=nmers)
        return sorted(set(allMers))

    def to_mers(self, mersFn=None, nmers=None, returnList=False):
        """Collect the unique mers that pass isvalidmer(), sorted.

        Parameters
        ----------
        mersFn : str, optional
            File to write, one mer per line. Required unless returnList.
        nmers : list of int, optional
            Mer lengths passed to getMers(). Defaults to [9].
        returnList : bool
            If True, return the mers as a list instead of writing mersFn.

        Returns
        -------
        list or None
        """
        if nmers is None:
            nmers = [9]
        validMers = [m for m in self._uniqueMers(nmers) if isvalidmer(m)]
        if returnList:
            return validMers
        with open(mersFn, 'w') as fh:
            fh.writelines('%s\n' % m for m in validMers)

    def to_hla(self, hlaFn=None, returnList=False):
        """Collect the alleles in uHLA4 that pass isvalidHLA(), with '_' replaced by '*'.

        Parameters
        ----------
        hlaFn : str, optional
            File to write, one allele per line. Required unless returnList.
        returnList : bool
            If True, return the alleles as a list instead of writing hlaFn.

        Returns
        -------
        list or None
        """
        validHLAs = [h.replace('_', '*') for h in self.data.uHLA4 if isvalidHLA(h)]
        if returnList:
            return validHLAs
        with open(hlaFn, 'w') as fh:
            fh.writelines('%s\n' % h for h in validHLAs)

    def checkBA(self, ba):
        """Report how many (HLA, 9mer) predictions in ``ba`` are nan.

        Every valid 9mer from seqDf and insertSeq is paired with every valid
        allele in uHLA4 and looked up as ``ba[(hla, mer)]``; a summary line is
        printed. A missing key is not caught here.
        """
        uMers = self._uniqueMers([9])
        validMers = [m for m in uMers if isvalidmer(m)]
        # Validity of an allele does not depend on the mer, so filter once.
        validHLAs = [h for h in self.data.uHLA4 if isvalidHLA(h)]

        tot = 0
        nantot = 0
        for m in validMers:
            for h in validHLAs:
                tot += 1
                if np.isnan(ba[(h, m)]):
                    nantot += 1

        # Avoid ZeroDivisionError when there is nothing to check.
        pctMissing = 1e2 * nantot / tot if tot else 0.0
        print('Found nan for %d of %d total predictions (%d HLAs, %d mers, %2.0f%% missing)' % (nantot, tot, len(self.data.uHLA4), len(uMers), pctMissing))

    def computeDerivedData(self):
        """Fill in the fields of self.data that are derived from seqDf, ptidDf and mapDf.

        Sets N, vacPtid, plaPtid, vacInd, plaInd and hxb22site, re-aligns
        ptidDf and seqDf to a common row order and, when regionInds is set and
        the data has not been sliced yet, restricts seqDf.seq, insertSeq,
        mapDf and HXB2 to those positions and marks the data as sliced.

        Raises
        ------
        ValueError
            If a sequence has no 'vaccinated' value after joining ptidDf.
        """
        def slicestr(seq, ind):
            # Index the characters of seq with ind, numpy-style
            return ''.join(np.array(list(seq))[np.asarray(ind)])

        data = self.data
        data.N = data.seqDf.shape[0]

        # First join ptidDf and seqDf so that plaInd is always a valid boolean
        # index on seqDf
        df = data.seqDf.join(data.ptidDf)
        if df['vaccinated'].isnull().any():
            raise ValueError('Some sequences in seqDf have no "vaccinated" value in ptidDf')

        # Type of vacInd/plaInd is ndarray (NOT pd.Series)
        vacInd = df['vaccinated'].values.astype(bool)
        plaInd = ~vacInd
        data.vacPtid = df.index[vacInd]
        data.plaPtid = df.index[plaInd]
        data.vacInd = vacInd
        data.plaInd = plaInd

        data.ptidDf = df[data.ptidDf.columns]
        # Copy so that the column assignment below acts on an independent frame
        data.seqDf = df[data.seqDf.columns].copy()

        # Select region of protein based on regionInds
        if data.regionInds is not None and not data.isSliced:
            rInds = data.regionInds
            # Slice seqDf, insertSeq, mapDf, HXB2
            data.seqDf['seq'] = [slicestr(seq, rInds) for seq in data.seqDf['seq']]
            data.insertSeq = slicestr(data.insertSeq, rInds)
            # .ix has been removed from pandas. mapDf is indexed by position
            # number, so label-based .loc picks the same rows.
            data.mapDf = data.mapDf.loc[rInds].reset_index(drop=True)
            data.HXB2 = slicestr(data.HXB2, rInds)
            data.isSliced = True

        # Create df for looking up a site num from HXB2 coordinate
        hxb22site = data.mapDf.copy()
        hxb22site['site'] = hxb22site.index
        data.hxb22site = hxb22site.set_index('hxb2Pos')
'''
TODO: move plotting code to a different file
def clipXVec(self,hxb2Range = None,vec=None,returnInds=False):
"""Clip seq-axis vector based on an HXB2 coordinate range (eg [70,80])"""
if hxb2Range is None:
siteRange = [self.data.mapDf.index[0],self.data.mapDf.index[-1]+1]
else:
hxb2Range = [str(c) for c in hxb2Range]
siteRange = [self.data.mapDf.index[self.data.mapDf.hxb2Pos == hxb2Range[0]],self.data.mapDf.index[self.data.mapDf.hxb2Pos==hxb2Range[1]]+1]
if returnInds:
return arange(siteRange[0],siteRange[1])
else:
return vec[siteRange[0]:siteRange[1]]
def plotSeqSpace(self,hxb2Range=None,subst=None,method='tsne',interactive=False,force=False,**kwargs):
"""Plot MDS of sequence space using a substitution matrix. If interactive then returns AnnotationPicker obj"""
if subst is None:
subst=blosum90
seqs=[self.clipXVec(hxb2Range,s) for s in self.data.seqDf.seq]
df=self.data.ptidDf.join(self.data.seqDf,how='right')
"""uInd has length len(seqs) but indexes into uSeqs"""
uSeqs,uInd=unique(seqs,return_inverse=True)
group=[]
for uniqi,s in enumerate(uSeqs):
tmp=df.vaccinated[uInd==uniqi].unique()
if len(tmp)==2:
group.append('both')
else:
group.append(tmp[0])
insertSeq=self.clipXVec(hxb2Range,self.data.insertSeq)
uSeqs=append(uSeqs,insertSeq)
group.append('insert')
recalc=True
"""Recalc if seqMethod doesn't exist or if its different than current method"""
try:
if method==self.data.temp['seqMethod']:
dist=self.data.temp['seqDist']
xy=self.data.temp['seqXY']
if xy.shape[0]==len(uSeqs):
recalc=False
except:
pass
if recalc or force:
dist=calcDistanceMatrix(uSeqs,distanceFunc=lambda s1,s2: seq_distance(s1,s2,subst=subst))
xy=embedDistanceMatrix(dist,method=method)
self.data.temp['seqDist']=dist
self.data.temp['seqXY']=xy
self.data.temp['seqMethod']=method
freq=objhist(seqs,keys=uSeqs)
"""Make sure the insert has a count of at least 1"""
if freq[insertSeq]==0:
freq[insertSeq]=1
if all([f==1 for f in freq.values()]):
freqVec=[30]*len(freq)
labels=uSeqs
else:
freqVec=scatternorm(array([freq[s] for s in uSeqs]),30,200)
labels=['%s: %d' % (s,freq[s]) for s in uSeqs]
if interactive:
picker=3
else:
picker=None
clf()
scatter(xy[:,0],xy[:,1],s=freqVec,c=[{'insert':'gold','both':'gray',True:'blue',False:'red'}[g] for g in group],picker=picker,**kwargs)
xticks(())
yticks(())
if hxb2Range is None:
hxb2Range=(self.data.hxb22site.index[0],self.data.hxb22site.index[-1])
title('MDS Embedding of Sequence space for %s (HXB2 %s-%s)' % (insertSeq,hxb2Range[0],hxb2Range[1]))
if interactive:
mp=AnnotationPicker(xy[:, 0], xy[:, 1], labels,weight='bold',color='black',size='x-small')
return mp
def plotHLASpace(self,hxb2Range=None,hlaList=None,ba=None,method='tsne',interactive=False,**kwargs):
"""
Plot an MDS embedding of HLA space
Original features were nHLAs x nMers
"""
seqs=[self.clipXVec(hxb2Range,s) for s in self.data.seqDf.seq]
df=self.data.ptidDf.join(self.data.seqDf,how='right')
"""uInd has length len(seqs) but indexes into uSeqs"""
uSeqs,uInd=unique(seqs,return_inverse=True)
group=[]
for uniqi,s in enumerate(uSeqs):
tmp=df.vaccinated[uInd==uniqi].unique()
if len(tmp)==2:
group.append('both')
else:
group.append(tmp[0])
insertSeq=self.clipXVec(hxb2Range,self.data.insertSeq)
uSeqs=append(uSeqs,insertSeq)
group.append('insert')
mers=getMers(insertSeq,nmers=[9])
dist=empty((len(uSeqs),len(mers)*len(hlaList)))
for si,s in enumerate(uSeqs):
for meri, mer in enumerate(getMers(s,nmers=[9])):
for hlai,h in enumerate(hlaList):
pred=ba[(h,mer)]
if isnan(pred):
pred=15
dist[si,int(meri*len(hlaList)+hlai)]=pred
xy=embedDistanceMatrix(dist,method=method)
freq=objhist(seqs,keys=uSeqs)
"""Make sure the insert has a count of at least 1"""
if freq[insertSeq]==0:
freq[insertSeq]=1
if all([f==1 for f in freq.values()]):
freqVec=[30]*len(freq)
labels=uSeqs
else:
freqVec=scatternorm(array([freq[s] for s in uSeqs]),30,200)
labels=['%s: %d' % (s,freq[s]) for s in uSeqs]
if interactive:
picker=3
else:
picker=None
clf()
scatter(xy[:,0],xy[:,1],s=freqVec,c=[{'insert':'gold','both':'gray',True:'blue',False:'red'}[g] for g in group],picker=picker,**kwargs)
xticks(())
yticks(())
if hxb2Range is None:
hxb2Range=(self.data.hxb22site.index[0],self.data.hxb22site.index[-1])
title('MDS Embedding of HLA binding space for %s (HXB2 %s-%s)' % (insertSeq,hxb2Range[0],hxb2Range[1]))
if interactive:
mp=AnnotationPicker(xy[:, 0], xy[:, 1], labels,weight='bold',color='black',size='x-small')
return mp
def plotConservation(self,region=None):
"""Plot entropy/conservation site-wise for vaccine and placebo breakthrough sequences"""
plotSeqEntropy(self.data.seqDf.seq,region=region)
'''