Spaces:

Skjafir
/

Dubai

Runtime error

App Files Files Community

Skjafir commited on Oct 18, 2024

Commit

b91d116

verified ·

1 Parent(s): 3470c6c

Create app.py

Browse files

Files changed (1) hide show

app.py +358 -0

app.py ADDED Viewed

	@@ -0,0 +1,358 @@

+'''
+data.py
+contains classes for storing all the data objects for sieve analysis and basic input out routines (e.g. to_csv, to_fasta)
+objects include:
+    baseData - unsieved set of sequences and HLAs for simulations
+    sieveData - minimum dataset needed for sieve analysis: insert, breakthroughs, HLAs, treatment
+    simData - a sieveData object containing a simulation property with metadata about the simulation params,date,epitopes etc.
+    resultsData - a sieveData object containing results from potentially many sieve analysis methods
+    metaResults - contains results of analysis of many sieve datasets
+'''
+__all__ = ['sieveData',
+           'sieveDataMethods']
+import pandas as pd
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Alphabet import Gapped, IUPAC
+from Bio.SubsMat.MatrixInfo import blosum90, ident
+from StringIO import StringIO
+class sieveData(object):
+    masterFn = None
+    lookupFn = None
+    hlaFn = None
+    seqFn = None
+    mapFn = None
+    """lists of unique 2 and 4 digit HLA alleles"""
+    uHLA4 = None
+    uHLA2 = None
+    """DataFrame of sequences with index ptid and columns: seq, seqID"""
+    seqDf = None
+    regionInds = None
+    """DataFrame of HLAs with index ptid and columns for all HLA alleles (2 and 4)"""
+    hlaDf = None
+    hlaFreq = None
+    """DataFrame with index ptid and columns: vaccinated, infected, hla"""
+    ptidDf = None
+    """contains position number as index and hxb2Pos and hxb2aa as columns"""
+    mapDf = None
+    studyName = None
+    proteinName = None
+    insertName = None
+    """sequence strings of the aligned HXB2 and vaccine insert"""
+    HXB2 = None
+    insertSeq = None
+    N = None
+    """List of ptids in each group to be used for indexing a df"""
+    vacPtid = None
+    plaPtid = None
+    vacInd = None
+    plaInd = None
+    temp = {}
+    """Signifies to the saving methods that the data may be different than other datasets from the same study"""
+    HLAsubset = False
+    """Indicates whether the sequence and other site indexed objects have already been sliced by regionInds"""
+    isSliced = False
+class sieveDataMethods(object):
+    data = None
+    def __init__(self,sievedata=None):
+        if sievedata is None:
+            sievedata = sieveData()
+        self.data = sievedata
+    def isvalidAnalysis(self, proteinName, insertName):
+        res = [va for va in s.validAnalyses if va['insertName']==insertName and va['proteinName']==proteinName]
+        return len(res) > 0
+    def to_nexus(self,fn):
+        self.to_fasta(fn,fileformat='nexus',sep='_')
+    def to_fasta(self, fn=None, fileformat='fasta', withHLA=False, withTreatment=False, sep='|', returnString=False):
+        """
+        >reference|PROTEIN|INSERT
+        >ptid|A1|A2|B1|B2 or >ptid|treatment
+        >HXB2
+        """
+        if fn is None:
+            fn = '%s.%s.%s.fasta' % (self.data.studyName, self.data.proteinName, self.data.insertName)
+        seqRecP = dict(description = '')
+        seqP = dict(alphabet = Gapped(IUPAC.protein))
+        outList = [SeqRecord(Seq(self.data.insertSeq, **seqP), id = 'reference%s%s%s%s' % (sep,self.data.proteinName,sep,self.data.insertName), **seqRecP),
+                   SeqRecord(Seq(self.data.HXB2, **seqP), id = 'HXB2', **seqRecP)]
+        tmp = self.data.seqDf.join(self.data.ptidDf)
+        for ptid,row in tmp.iterrows():
+            treatment = 'vaccine' if row['vaccinated'] else 'placebo'
+            idStr = ptid
+            if withTreatment:
+                idStr += '%s%s' % (sep,treatment)
+            if withHLA and 'hla' in row.index and isinstance(row['hla'],basestring):
+                idStr += '%s%s' % (sep,sep.join(row['hla']))
+            rec = SeqRecord(Seq(row['seq'], **seqP), id = idStr, **seqRecP)
+            outList.append(rec)
+        if returnString:
+            fn = StringIO()
+            SeqIO.write(outList, fn, fileformat)
+            fn.seek(0)
+            return fn.read()
+        else:
+            SeqIO.write(outList, fn, fileformat)
+    def to_treatment_csv(self, fn=None, sep='|', returnString=False):
+        if fn is None:
+            fn = '%s.%s.%s.trt.csv' % (self.data.studyName, self.data.proteinName, self.data.insertName)
+        tmpDf = self.data.seqDf.join(self.data.ptidDf[['vaccinated']], how='left')
+        tmpDf['treatment'] = tmpDf.vaccinated.map(lambda s: 'vaccine' if s else 'placebo')
+        tmpDf = tmpDf.reset_index()
+        tmpDf = tmpDf.rename_axis({'index':'ptid'}, axis=1)
+        tmpDf = tmpDf[['ptid','treatment']]
+        """refPtid = 'reference%s%s%s%s' % (sep,self.data.proteinName,sep,self.data.insertName)
+        tmpDf = tmpDf.append({'ptid':refPtid, 'treatment':'reference'}, ignore_index = True)"""
+        if returnString:
+            fn = StringIO()
+            tmpDf.to_csv(fn, index=False)
+            fn.seek(0)
+            return fn.read()
+        else:
+            tmpDf.to_csv(fn, index=False)
+    def to_mers(self, mersFn=None, nmers=[9], returnList=False):
+        allMers = []
+        for seq in self.data.seqDf.seq:
+            allMers += getMers(seq.replace('-',''), nmers = nmers)
+        allMers += getMers(self.data.insertSeq.replace('-',''), nmers = nmers)
+        uMers = sorted(list(set(allMers)))
+        if returnList:
+            return filter(isvalidmer, uMers)
+        else:
+            with open(mersFn, 'w') as fh:
+                for m in uMers:
+                    if isvalidmer(m):
+                        fh.write('%s\n' % m)
+    def to_hla(self, hlaFn = None, returnList = False):
+        convert = lambda h: h.replace('_','*')
+        if returnList:
+            return map(convert,filter(isvalidHLA,self.data.uHLA4))
+        else:
+            with open(hlaFn,'w') as fh:
+                for h in self.data.uHLA4:
+                    if isvalidHLA(h):
+                        fh.write('%s\n' % convert(h))
+    def checkBA(self,ba):
+        """Check that all kmers in seqDf and insertSeq are
+        present in the binding affinities dict ba, paired with every HLA in hlaDf"""
+        tot = 0
+        nantot=0
+        allMers = []
+        for seq in self.data.seqDf.seq:
+            allMers += getMers(seq.replace('-',''),nmers=[9])
+        allMers += getMers(self.data.insertSeq.replace('-',''),nmers=[9])
+        uMers = sorted(list(set(allMers)))
+        for m in uMers:
+            if isvalidmer(m):
+                for h in self.data.uHLA4:
+                    if isvalidHLA(h):
+                        tot += 1
+                        if isnan(ba[(h,m)]):
+                            nantot += 1
+        print 'Found nan for %d of %d total predictions (%d HLAs, %d mers, %2.0f%% missing)' % (nantot,tot,len(self.data.uHLA4),len(uMers),1e2*nantot/tot)
+    def computeDerivedData(self):
+        slicestr = lambda yo,ind: ''.join(array([c for c in yo])[array(ind)])
+        self.data.N = self.data.seqDf.shape[0]
+        """First join ptidDf and seqDf so that plaInd is always a valid boolean index on seqDf"""
+        df = self.data.seqDf.join(self.data.ptidDf)
+        self.data.vacPtid = df.index[df.vaccinated]
+        self.data.plaPtid = df.index[~df.vaccinated]
+        """Type of plaInd is ndarray (NOT pd.Series)"""
+        self.data.vacInd = df.vaccinated.values.astype(bool)
+        self.data.plaInd = (~df.vaccinated).values.astype(bool)
+        self.data.ptidDf = df[self.data.ptidDf.columns]
+        self.data.seqDf = df[self.data.seqDf.columns]
+        """Select region of protein based on regionInds"""
+        if not self.data.regionInds is None and not self.data.isSliced:
+            rInds = self.data.regionInds
+            """Slice seqDf,insertSeq,mapDf,HXB2"""
+            for ptid in self.data.seqDf.index:
+                seq = self.data.seqDf.seq[ptid]
+                self.data.seqDf.seq[ptid] = slicestr(seq,rInds)
+            self.data.insertSeq = slicestr(self.data.insertSeq,rInds)
+            self.data.mapDf = self.data.mapDf.ix[rInds]
+            self.data.mapDf = self.data.mapDf.set_index(arange(len(rInds)))
+            self.data.HXB2 = slicestr(self.data.HXB2,rInds)
+            self.data.isSliced = True
+        """Create df for looking up a site num from HXB2 coordinate"""
+        self.data.hxb22site = self.data.mapDf.copy()
+        self.data.hxb22site['site'] = self.data.hxb22site.index
+        self.data.hxb22site = self.data.hxb22site.set_index('hxb2Pos')
+    '''
+    TODO: move plotting code to a different file
+    def clipXVec(self,hxb2Range = None,vec=None,returnInds=False):
+        """Clip seq-axis vector based on an HXB2 coordinate range (eg [70,80])"""
+        if hxb2Range is None:
+            siteRange = [self.data.mapDf.index[0],self.data.mapDf.index[-1]+1]
+        else:
+            hxb2Range = [str(c) for c in hxb2Range]
+            siteRange = [self.data.mapDf.index[self.data.mapDf.hxb2Pos == hxb2Range[0]],self.data.mapDf.index[self.data.mapDf.hxb2Pos==hxb2Range[1]]+1]
+        if returnInds:
+            return arange(siteRange[0],siteRange[1])
+        else:
+            return vec[siteRange[0]:siteRange[1]]
+    def plotSeqSpace(self,hxb2Range=None,subst=None,method='tsne',interactive=False,force=False,**kwargs):
+        """Plot MDS of sequence space using a substitution matrix. If interactive then returns AnnotationPicker obj"""
+        if subst is None:
+            subst=blosum90
+        seqs=[self.clipXVec(hxb2Range,s) for s in self.data.seqDf.seq]
+        df=self.data.ptidDf.join(self.data.seqDf,how='right')
+        """uInd has length len(seqs) but indexes into uSeqs"""
+        uSeqs,uInd=unique(seqs,return_inverse=True)
+        group=[]
+        for uniqi,s in enumerate(uSeqs):
+            tmp=df.vaccinated[uInd==uniqi].unique()
+            if len(tmp)==2:
+                group.append('both')
+            else:
+                group.append(tmp[0])
+        insertSeq=self.clipXVec(hxb2Range,self.data.insertSeq)
+        uSeqs=append(uSeqs,insertSeq)
+        group.append('insert')
+        recalc=True
+        """Recalc if seqMethod doesn't exist or if its different than current method"""
+        try:
+            if method==self.data.temp['seqMethod']:
+                dist=self.data.temp['seqDist']
+                xy=self.data.temp['seqXY']
+                if xy.shape[0]==len(uSeqs):
+                    recalc=False
+        except:
+            pass
+        if recalc or force:
+            dist=calcDistanceMatrix(uSeqs,distanceFunc=lambda s1,s2: seq_distance(s1,s2,subst=subst))
+            xy=embedDistanceMatrix(dist,method=method)
+            self.data.temp['seqDist']=dist
+            self.data.temp['seqXY']=xy
+            self.data.temp['seqMethod']=method
+        freq=objhist(seqs,keys=uSeqs)
+        """Make sure the insert has a count of at least 1"""
+        if freq[insertSeq]==0:
+            freq[insertSeq]=1
+        if all([f==1 for f in freq.values()]):
+            freqVec=[30]*len(freq)
+            labels=uSeqs
+        else:
+            freqVec=scatternorm(array([freq[s] for s in uSeqs]),30,200)
+            labels=['%s: %d' % (s,freq[s]) for s in uSeqs]
+        if interactive:
+            picker=3
+        else:
+            picker=None
+        clf()
+        scatter(xy[:,0],xy[:,1],s=freqVec,c=[{'insert':'gold','both':'gray',True:'blue',False:'red'}[g] for g in group],picker=picker,**kwargs)
+        xticks(())
+        yticks(())
+        if hxb2Range is None:
+            hxb2Range=(self.data.hxb22site.index[0],self.data.hxb22site.index[-1])
+        title('MDS Embedding of Sequence space for %s (HXB2 %s-%s)' % (insertSeq,hxb2Range[0],hxb2Range[1]))
+        if interactive:
+            mp=AnnotationPicker(xy[:, 0], xy[:, 1], labels,weight='bold',color='black',size='x-small')
+            return mp
+    def plotHLASpace(self,hxb2Range=None,hlaList=None,ba=None,method='tsne',interactive=False,**kwargs):
+        """
+        Plot an MDS embedding of HLA space
+        Original features were nHLAs x nMers
+        """
+        seqs=[self.clipXVec(hxb2Range,s) for s in self.data.seqDf.seq]
+        df=self.data.ptidDf.join(self.data.seqDf,how='right')
+        """uInd has length len(seqs) but indexes into uSeqs"""
+        uSeqs,uInd=unique(seqs,return_inverse=True)
+        group=[]
+        for uniqi,s in enumerate(uSeqs):
+            tmp=df.vaccinated[uInd==uniqi].unique()
+            if len(tmp)==2:
+                group.append('both')
+            else:
+                group.append(tmp[0])
+        insertSeq=self.clipXVec(hxb2Range,self.data.insertSeq)
+        uSeqs=append(uSeqs,insertSeq)
+        group.append('insert')
+        mers=getMers(insertSeq,nmers=[9])
+        dist=empty((len(uSeqs),len(mers)*len(hlaList)))
+        for si,s in enumerate(uSeqs):
+            for meri, mer in enumerate(getMers(s,nmers=[9])):
+                for hlai,h in enumerate(hlaList):
+                    pred=ba[(h,mer)]
+                    if isnan(pred):
+                        pred=15
+                    dist[si,int(meri*len(hlaList)+hlai)]=pred
+        xy=embedDistanceMatrix(dist,method=method)
+        freq=objhist(seqs,keys=uSeqs)
+        """Make sure the insert has a count of at least 1"""
+        if freq[insertSeq]==0:
+            freq[insertSeq]=1
+        if all([f==1 for f in freq.values()]):
+            freqVec=[30]*len(freq)
+            labels=uSeqs
+        else:
+            freqVec=scatternorm(array([freq[s] for s in uSeqs]),30,200)
+            labels=['%s: %d' % (s,freq[s]) for s in uSeqs]
+        if interactive:
+            picker=3
+        else:
+            picker=None
+        clf()
+        scatter(xy[:,0],xy[:,1],s=freqVec,c=[{'insert':'gold','both':'gray',True:'blue',False:'red'}[g] for g in group],picker=picker,**kwargs)
+        xticks(())
+        yticks(())
+        if hxb2Range is None:
+            hxb2Range=(self.data.hxb22site.index[0],self.data.hxb22site.index[-1])
+        title('MDS Embedding of HLA binding space for %s (HXB2 %s-%s)' % (insertSeq,hxb2Range[0],hxb2Range[1]))
+        if interactive:
+            mp=AnnotationPicker(xy[:, 0], xy[:, 1], labels,weight='bold',color='black',size='x-small')
+            return mp
+    def plotConservation(self,region=None):
+        """Plot entropy/conservation site-wise for vaccine and placebo breakthrough sequences"""
+        plotSeqEntropy(self.data.seqDf.seq,region=region)
+    '''