'''
data.py contains classes for storing all the data objects for sieve analysis
and basic input/output routines (e.g. to_csv, to_fasta)

Objects include:
    baseData - unsieved set of sequences and HLAs for simulations
    sieveData - minimum dataset needed for sieve analysis: insert, breakthroughs, HLAs, treatment
    simData - a sieveData object containing a simulation property with metadata
              about the simulation params, date, epitopes etc.
    resultsData - a sieveData object containing results from potentially many
                  sieve analysis methods
    metaResults - contains results of analysis of many sieve datasets
'''

__all__ = ['sieveData', 'sieveDataMethods']

try:
    from StringIO import StringIO
except ImportError:
    # Python 3: the StringIO module was folded into io
    from io import StringIO

import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import Gapped, IUPAC
from Bio.SubsMat.MatrixInfo import blosum90, ident

try:
    basestring
except NameError:
    # Python 3 has no basestring; str is the text type there
    basestring = str

# NOTE(review): getMers, isvalidmer and isvalidHLA are called below but are
# neither defined nor imported in this file. They have to be supplied by the
# environment that loads this module - confirm the source and import them
# explicitly.


class sieveData(object):
    """Plain container for the fields of one sieve-analysis dataset.

    Every field has a class-level default (None, False or an empty dict);
    this file does not show where the fields get populated.
    """

    # File names (presumably the inputs the dataset was built from - TODO confirm)
    masterFn = None
    lookupFn = None
    hlaFn = None
    seqFn = None
    mapFn = None

    # lists of unique 2 and 4 digit HLA alleles
    uHLA4 = None
    uHLA2 = None

    # DataFrame of sequences with index ptid and columns: seq, seqID
    seqDf = None
    regionInds = None

    # DataFrame of HLAs with index ptid and columns for all HLA alleles (2 and 4)
    hlaDf = None
    hlaFreq = None

    # DataFrame with index ptid and columns: vaccinated, infected, hla
    ptidDf = None

    # contains position number as index and hxb2Pos and hxb2aa as columns
    mapDf = None

    studyName = None
    proteinName = None
    insertName = None

    # sequence strings of the aligned HXB2 and vaccine insert
    HXB2 = None
    insertSeq = None

    N = None

    # List of ptids in each group to be used for indexing a df
    vacPtid = None
    plaPtid = None
    vacInd = None
    plaInd = None

    # Scratch space. This class-level dict is shared by every object that does
    # not own a private one, so __init__ hands each new instance its own.
    temp = {}

    # Signifies to the saving methods that the data may be different than
    # other datasets from the same study
    HLAsubset = False

    # Indicates whether the sequence and other site indexed objects have
    # already been sliced by regionInds
    isSliced = False

    def __init__(self):
        # Without this, writing to obj.temp[...] would mutate the single dict
        # stored on the class and leak values between datasets.
        self.temp = {}


class sieveDataMethods(object):
    """Export and housekeeping routines that operate on a sieveData object.

    The wrapped dataset is available as self.data.
    """

    data = None

    def __init__(self, sievedata=None):
        """Wrap sievedata, or a fresh empty sieveData when none is given."""
        if sievedata is None:
            sievedata = sieveData()
        self.data = sievedata

    def isvalidAnalysis(self, proteinName, insertName):
        """Return True if an entry of self.data.validAnalyses matches both names.

        Entries are indexed with the keys 'insertName' and 'proteinName'.
        """
        # The original read "s.validAnalyses", but no name "s" exists in this
        # module, so the call could only raise NameError.
        # NOTE(review): self.data is assumed to be the intended object, as in
        # every other method; validAnalyses is not declared on sieveData in
        # this file, so it must be attached elsewhere - confirm.
        return any(va['insertName'] == insertName and va['proteinName'] == proteinName
                   for va in self.data.validAnalyses)

    def to_nexus(self, fn):
        """Write the alignment to fn in nexus format, using '_' as the ID separator."""
        self.to_fasta(fn, fileformat='nexus', sep='_')

    def to_fasta(self, fn=None, fileformat='fasta', withHLA=False, withTreatment=False, sep='|', returnString=False):
        """Write the insert, HXB2 and all participant sequences with SeqIO.

        Records are emitted in this order, with IDs built using sep:
            >reference|PROTEIN|INSERT
            >HXB2
            >ptid            (one per row of seqDf; optionally followed by
                              |treatment and/or |hla, see below)

        Parameters
        ----------
        fn : str or None
            Output file name. Defaults to '<study>.<protein>.<insert>.fasta'.
            Ignored when returnString is True.
        fileformat : str
            Format name handed to SeqIO.write.
        withHLA : bool
            Append the row's 'hla' value to the ID when that column exists
            and the value is a string.
        withTreatment : bool
            Append 'vaccine' or 'placebo' (from the 'vaccinated' column).
        sep : str
            Separator between the parts of a record ID.
        returnString : bool
            Return the formatted text instead of writing a file.

        Returns
        -------
        str when returnString is True, otherwise None.
        """
        seqRecP = dict(description='')
        seqP = dict(alphabet=Gapped(IUPAC.protein))

        refId = 'reference%s%s%s%s' % (sep, self.data.proteinName, sep, self.data.insertName)
        outList = [SeqRecord(Seq(self.data.insertSeq, **seqP), id=refId, **seqRecP),
                   SeqRecord(Seq(self.data.HXB2, **seqP), id='HXB2', **seqRecP)]

        tmp = self.data.seqDf.join(self.data.ptidDf)
        for ptid, row in tmp.iterrows():
            idStr = ptid
            if withTreatment:
                idStr += '%s%s' % (sep, 'vaccine' if row['vaccinated'] else 'placebo')
            if withHLA and 'hla' in row.index and isinstance(row['hla'], basestring):
                # NOTE(review): row['hla'] is a string here, so sep.join()
                # puts sep between every *character*. Kept as in the original
                # because the stored HLA format is not visible in this file -
                # confirm whether a list of alleles was intended.
                idStr += '%s%s' % (sep, sep.join(row['hla']))
            outList.append(SeqRecord(Seq(row['seq'], **seqP), id=idStr, **seqRecP))

        if returnString:
            buf = StringIO()
            SeqIO.write(outList, buf, fileformat)
            return buf.getvalue()

        if fn is None:
            fn = '%s.%s.%s.fasta' % (self.data.studyName, self.data.proteinName, self.data.insertName)
        SeqIO.write(outList, fn, fileformat)

    def to_treatment_csv(self, fn=None, sep='|', returnString=False):
        """Write a two-column CSV (ptid, treatment) with one row per sequence.

        treatment is 'vaccine' when the participant's 'vaccinated' value is
        truthy and 'placebo' otherwise.

        Parameters
        ----------
        fn : str or None
            Output file name. Defaults to '<study>.<protein>.<insert>.trt.csv'.
            Ignored when returnString is True.
        sep : str
            Only referenced by the disabled reference-row code below.
        returnString : bool
            Return the CSV text instead of writing a file.

        Returns
        -------
        str when returnString is True, otherwise None.
        """
        joined = self.data.seqDf.join(self.data.ptidDf[['vaccinated']], how='left')

        # Build the output columns directly. The original relied on
        # reset_index() + rename_axis({'index': 'ptid'}, axis=1), which only
        # produced a 'ptid' column for an unnamed index and uses a mapper form
        # of rename_axis that later pandas versions dropped.
        tmpDf = pd.DataFrame({'ptid': joined.index.values,
                              'treatment': ['vaccine' if v else 'placebo' for v in joined['vaccinated']]},
                             columns=['ptid', 'treatment'])

        # Disabled in the original (adds a row for the reference sequence):
        # refPtid = 'reference%s%s%s%s' % (sep,self.data.proteinName,sep,self.data.insertName)
        # tmpDf = tmpDf.append({'ptid':refPtid, 'treatment':'reference'}, ignore_index = True)

        if returnString:
            buf = StringIO()
            tmpDf.to_csv(buf, index=False)
            return buf.getvalue()

        if fn is None:
            fn = '%s.%s.%s.trt.csv' % (self.data.studyName, self.data.proteinName, self.data.insertName)
        tmpDf.to_csv(fn, index=False)

    def _uniqueMers(self, nmers):
        """Return the sorted unique mers of every gap-stripped sequence and the insert."""
        allMers = []
        for seq in self.data.seqDf.seq:
            allMers += getMers(seq.replace('-', ''), nmers=nmers)
        allMers += getMers(self.data.insertSeq.replace('-', ''), nmers=nmers)
        return sorted(set(allMers))

    def to_mers(self, mersFn=None, nmers=None, returnList=False):
        """Export the unique valid mers found in the sequences and the insert.

        Parameters
        ----------
        mersFn : str
            File to write, one mer per line. Used when returnList is False.
        nmers : list or None
            Mer lengths passed to getMers. None means [9].
        returnList : bool
            Return the mers as a list instead of writing a file.

        Returns
        -------
        list when returnList is True, otherwise None.
        """
        if nmers is None:
            nmers = [9]
        validMers = [m for m in self._uniqueMers(nmers) if isvalidmer(m)]
        if returnList:
            return validMers
        with open(mersFn, 'w') as fh:
            for m in validMers:
                fh.write('%s\n' % m)

    def to_hla(self, hlaFn=None, returnList=False):
        """Export the valid 4-digit HLA alleles with '_' replaced by '*'.

        Parameters
        ----------
        hlaFn : str
            File to write, one allele per line. Used when returnList is False.
        returnList : bool
            Return the alleles as a list instead of writing a file.

        Returns
        -------
        list when returnList is True, otherwise None.
        """
        hlas = [h.replace('_', '*') for h in self.data.uHLA4 if isvalidHLA(h)]
        if returnList:
            return hlas
        with open(hlaFn, 'w') as fh:
            for h in hlas:
                fh.write('%s\n' % h)

    def checkBA(self, ba):
        """Check that all kmers in seqDf and insertSeq are present in the
        binding affinities dict ba, paired with every HLA in uHLA4.

        ba is indexed with (hla, mer) tuples; a missing key raises KeyError.
        Prints how many of the looked-up predictions are nan.
        """
        tot = 0
        nantot = 0
        uMers = self._uniqueMers([9])
        validHLAs = [h for h in self.data.uHLA4 if isvalidHLA(h)]
        for m in uMers:
            if not isvalidmer(m):
                continue
            for h in validHLAs:
                tot += 1
                if np.isnan(ba[(h, m)]):
                    nantot += 1
        # No valid (HLA, mer) pair means nothing is missing; avoid dividing by zero
        pctMissing = 1e2 * nantot / tot if tot else 0.
        print('Found nan for %d of %d total predictions (%d HLAs, %d mers, %2.0f%% missing)' % (nantot, tot, len(self.data.uHLA4), len(uMers), pctMissing))

    def computeDerivedData(self):
        """Fill in the fields of self.data that are derived from seqDf, ptidDf and mapDf.

        Sets N, vacPtid, plaPtid, vacInd and plaInd, re-indexes ptidDf and
        seqDf to the joined frame, slices the site-indexed fields by
        regionInds (once, guarded by isSliced) and builds hxb22site.
        """
        def slicestr(seq, inds):
            """Return the characters of seq selected by inds, as a string."""
            return ''.join(np.array(list(seq))[np.asarray(inds)])

        data = self.data
        data.N = data.seqDf.shape[0]

        # First join ptidDf and seqDf so that plaInd is always a valid boolean index on seqDf
        df = data.seqDf.join(data.ptidDf)

        # Convert to a bool ndarray once and invert that. Applying ~ to the
        # raw column before the cast turns 0/1 or object values into non-zero
        # integers, i.e. an all-True placebo mask.
        # NOTE(review): ptids absent from ptidDf become NaN, which casts to True.
        vacMask = df.vaccinated.values.astype(bool)
        data.vacPtid = df.index[vacMask]
        data.plaPtid = df.index[~vacMask]

        # Type of plaInd is ndarray (NOT pd.Series)
        data.vacInd = vacMask
        data.plaInd = ~vacMask

        data.ptidDf = df[data.ptidDf.columns]
        # Explicit copy: the seq column may be overwritten below
        data.seqDf = df[data.seqDf.columns].copy()

        # Select region of protein based on regionInds
        if data.regionInds is not None and not data.isSliced:
            rInds = data.regionInds

            # Slice seqDf,insertSeq,mapDf,HXB2
            # Whole-column assignment instead of per-ptid chained assignment,
            # which writes to a temporary object in some pandas versions.
            data.seqDf['seq'] = [slicestr(seq, rInds) for seq in data.seqDf['seq']]
            data.insertSeq = slicestr(data.insertSeq, rInds)

            # Rows are taken by position so they stay aligned with the
            # positionally sliced strings. This equals the removed .ix lookup
            # whenever mapDf is indexed 0..L-1 (assumed - TODO confirm).
            data.mapDf = data.mapDf.iloc[np.asarray(rInds)]
            data.mapDf = data.mapDf.set_index(np.arange(len(rInds)))
            data.HXB2 = slicestr(data.HXB2, rInds)
            data.isSliced = True

        # Create df for looking up a site num from HXB2 coordinate
        hxb22site = data.mapDf.copy()
        hxb22site['site'] = hxb22site.index
        data.hxb22site = hxb22site.set_index('hxb2Pos')

    '''
    TODO: move plotting code to a different file
    def clipXVec(self,hxb2Range = None,vec=None,returnInds=False):
        """Clip seq-axis vector based on an HXB2 coordinate range (eg [70,80])"""
        if hxb2Range is None:
            siteRange = [self.data.mapDf.index[0],self.data.mapDf.index[-1]+1]
        else:
            hxb2Range = [str(c) for c in hxb2Range]
            siteRange = [self.data.mapDf.index[self.data.mapDf.hxb2Pos == hxb2Range[0]],self.data.mapDf.index[self.data.mapDf.hxb2Pos==hxb2Range[1]]+1]
        if returnInds:
            return arange(siteRange[0],siteRange[1])
        else:
            return vec[siteRange[0]:siteRange[1]]

    def plotSeqSpace(self,hxb2Range=None,subst=None,method='tsne',interactive=False,force=False,**kwargs):
        """Plot MDS of sequence space using a substitution matrix.
        If interactive then returns AnnotationPicker obj"""
        if subst is None:
            subst=blosum90
        seqs=[self.clipXVec(hxb2Range,s) for s in self.data.seqDf.seq]
        df=self.data.ptidDf.join(self.data.seqDf,how='right')
        """uInd has length len(seqs) but indexes into uSeqs"""
        uSeqs,uInd=unique(seqs,return_inverse=True)
        group=[]
        for uniqi,s in enumerate(uSeqs):
            tmp=df.vaccinated[uInd==uniqi].unique()
            if len(tmp)==2:
                group.append('both')
            else:
                group.append(tmp[0])
        insertSeq=self.clipXVec(hxb2Range,self.data.insertSeq)
        uSeqs=append(uSeqs,insertSeq)
        group.append('insert')

        recalc=True
        """Recalc if seqMethod doesn't exist or if its different than current method"""
        try:
            if method==self.data.temp['seqMethod']:
                dist=self.data.temp['seqDist']
                xy=self.data.temp['seqXY']
                if xy.shape[0]==len(uSeqs):
                    recalc=False
        except:
            pass
        if recalc or force:
            dist=calcDistanceMatrix(uSeqs,distanceFunc=lambda s1,s2: seq_distance(s1,s2,subst=subst))
            xy=embedDistanceMatrix(dist,method=method)
            self.data.temp['seqDist']=dist
            self.data.temp['seqXY']=xy
            self.data.temp['seqMethod']=method
        freq=objhist(seqs,keys=uSeqs)
        """Make sure the insert has a count of at least 1"""
        if freq[insertSeq]==0:
            freq[insertSeq]=1
        if all([f==1 for f in freq.values()]):
            freqVec=[30]*len(freq)
            labels=uSeqs
        else:
            freqVec=scatternorm(array([freq[s] for s in uSeqs]),30,200)
            labels=['%s: %d' % (s,freq[s]) for s in uSeqs]
        if interactive:
            picker=3
        else:
            picker=None
        clf()
        scatter(xy[:,0],xy[:,1],s=freqVec,c=[{'insert':'gold','both':'gray',True:'blue',False:'red'}[g] for g in group],picker=picker,**kwargs)
        xticks(())
        yticks(())
        if hxb2Range is None:
            hxb2Range=(self.data.hxb22site.index[0],self.data.hxb22site.index[-1])
        title('MDS Embedding of Sequence space for %s (HXB2 %s-%s)' % (insertSeq,hxb2Range[0],hxb2Range[1]))
        if interactive:
            mp=AnnotationPicker(xy[:, 0], xy[:, 1], labels,weight='bold',color='black',size='x-small')
            return mp

    def plotHLASpace(self,hxb2Range=None,hlaList=None,ba=None,method='tsne',interactive=False,**kwargs):
        """
        Plot an MDS embedding of HLA space
        Original features were nHLAs x nMers
        """
        seqs=[self.clipXVec(hxb2Range,s) for s in self.data.seqDf.seq]
        df=self.data.ptidDf.join(self.data.seqDf,how='right')
        """uInd has length len(seqs) but indexes into uSeqs"""
        uSeqs,uInd=unique(seqs,return_inverse=True)
        group=[]
        for uniqi,s in enumerate(uSeqs):
            tmp=df.vaccinated[uInd==uniqi].unique()
            if len(tmp)==2:
                group.append('both')
            else:
                group.append(tmp[0])
        insertSeq=self.clipXVec(hxb2Range,self.data.insertSeq)
        uSeqs=append(uSeqs,insertSeq)
        group.append('insert')

        mers=getMers(insertSeq,nmers=[9])
        dist=empty((len(uSeqs),len(mers)*len(hlaList)))
        for si,s in enumerate(uSeqs):
            for meri, mer in enumerate(getMers(s,nmers=[9])):
                for hlai,h in enumerate(hlaList):
                    pred=ba[(h,mer)]
                    if isnan(pred):
                        pred=15
                    dist[si,int(meri*len(hlaList)+hlai)]=pred
        xy=embedDistanceMatrix(dist,method=method)
        freq=objhist(seqs,keys=uSeqs)
        """Make sure the insert has a count of at least 1"""
        if freq[insertSeq]==0:
            freq[insertSeq]=1
        if all([f==1 for f in freq.values()]):
            freqVec=[30]*len(freq)
            labels=uSeqs
        else:
            freqVec=scatternorm(array([freq[s] for s in uSeqs]),30,200)
            labels=['%s: %d' % (s,freq[s]) for s in uSeqs]
        if interactive:
            picker=3
        else:
            picker=None
        clf()
        scatter(xy[:,0],xy[:,1],s=freqVec,c=[{'insert':'gold','both':'gray',True:'blue',False:'red'}[g] for g in group],picker=picker,**kwargs)
        xticks(())
        yticks(())
        if hxb2Range is None:
            hxb2Range=(self.data.hxb22site.index[0],self.data.hxb22site.index[-1])
        title('MDS Embedding of HLA binding space for %s (HXB2 %s-%s)' % (insertSeq,hxb2Range[0],hxb2Range[1]))
        if interactive:
            mp=AnnotationPicker(xy[:, 0], xy[:, 1], labels,weight='bold',color='black',size='x-small')
            return mp

    def plotConservation(self,region=None):
        """Plot entropy/conservation site-wise for vaccine and placebo breakthrough sequences"""
        plotSeqEntropy(self.data.seqDf.seq,region=region)
    '''