Spaces:

Skjafir
/

Dubai

Runtime error

App Files Files Community

Dubai / app.py

Skjafir

Create app.py

b91d116 verified 5 months ago

raw

history blame contribute delete

14.4 kB

	'''
	data.py
	contains classes for storing all the data objects for sieve analysis and basic input out routines (e.g. to_csv, to_fasta)

	objects include:
	baseData - unsieved set of sequences and HLAs for simulations
	sieveData - minimum dataset needed for sieve analysis: insert, breakthroughs, HLAs, treatment
	simData - a sieveData object containing a simulation property with metadata about the simulation params,date,epitopes etc.
	resultsData - a sieveData object containing results from potentially many sieve analysis methods
	metaResults - contains results of analysis of many sieve datasets
	'''

	__all__ = ['sieveData',
	'sieveDataMethods']

	import pandas as pd
	from Bio import SeqIO
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord
	from Bio.Alphabet import Gapped, IUPAC
	from Bio.SubsMat.MatrixInfo import blosum90, ident
	from StringIO import StringIO

	class sieveData(object):
	masterFn = None
	lookupFn = None
	hlaFn = None
	seqFn = None
	mapFn = None

	"""lists of unique 2 and 4 digit HLA alleles"""
	uHLA4 = None
	uHLA2 = None

	"""DataFrame of sequences with index ptid and columns: seq, seqID"""
	seqDf = None
	regionInds = None

	"""DataFrame of HLAs with index ptid and columns for all HLA alleles (2 and 4)"""
	hlaDf = None
	hlaFreq = None

	"""DataFrame with index ptid and columns: vaccinated, infected, hla"""
	ptidDf = None

	"""contains position number as index and hxb2Pos and hxb2aa as columns"""
	mapDf = None

	studyName = None
	proteinName = None
	insertName = None

	"""sequence strings of the aligned HXB2 and vaccine insert"""
	HXB2 = None
	insertSeq = None

	N = None

	"""List of ptids in each group to be used for indexing a df"""
	vacPtid = None
	plaPtid = None
	vacInd = None
	plaInd = None

	temp = {}
	"""Signifies to the saving methods that the data may be different than other datasets from the same study"""
	HLAsubset = False

	"""Indicates whether the sequence and other site indexed objects have already been sliced by regionInds"""
	isSliced = False

	class sieveDataMethods(object):
	data = None
	def __init__(self,sievedata=None):
	if sievedata is None:
	sievedata = sieveData()
	self.data = sievedata

	def isvalidAnalysis(self, proteinName, insertName):
	res = [va for va in s.validAnalyses if va['insertName']==insertName and va['proteinName']==proteinName]
	return len(res) > 0

	def to_nexus(self,fn):
	self.to_fasta(fn,fileformat='nexus',sep='_')

	def to_fasta(self, fn=None, fileformat='fasta', withHLA=False, withTreatment=False, sep='\|', returnString=False):
	"""
	>reference\|PROTEIN\|INSERT
	>ptid\|A1\|A2\|B1\|B2 or >ptid\|treatment
	>HXB2
	"""
	if fn is None:
	fn = '%s.%s.%s.fasta' % (self.data.studyName, self.data.proteinName, self.data.insertName)

	seqRecP = dict(description = '')
	seqP = dict(alphabet = Gapped(IUPAC.protein))

	outList = [SeqRecord(Seq(self.data.insertSeq, seqP), id = 'reference%s%s%s%s' % (sep,self.data.proteinName,sep,self.data.insertName), seqRecP),
	SeqRecord(Seq(self.data.HXB2, seqP), id = 'HXB2', seqRecP)]
	tmp = self.data.seqDf.join(self.data.ptidDf)
	for ptid,row in tmp.iterrows():
	treatment = 'vaccine' if row['vaccinated'] else 'placebo'
	idStr = ptid
	if withTreatment:
	idStr += '%s%s' % (sep,treatment)
	if withHLA and 'hla' in row.index and isinstance(row['hla'],basestring):
	idStr += '%s%s' % (sep,sep.join(row['hla']))

	rec = SeqRecord(Seq(row['seq'], seqP), id = idStr, seqRecP)
	outList.append(rec)

	if returnString:
	fn = StringIO()
	SeqIO.write(outList, fn, fileformat)
	fn.seek(0)
	return fn.read()
	else:
	SeqIO.write(outList, fn, fileformat)

	def to_treatment_csv(self, fn=None, sep='\|', returnString=False):
	if fn is None:
	fn = '%s.%s.%s.trt.csv' % (self.data.studyName, self.data.proteinName, self.data.insertName)

	tmpDf = self.data.seqDf.join(self.data.ptidDf[['vaccinated']], how='left')
	tmpDf['treatment'] = tmpDf.vaccinated.map(lambda s: 'vaccine' if s else 'placebo')
	tmpDf = tmpDf.reset_index()
	tmpDf = tmpDf.rename_axis({'index':'ptid'}, axis=1)
	tmpDf = tmpDf[['ptid','treatment']]

	"""refPtid = 'reference%s%s%s%s' % (sep,self.data.proteinName,sep,self.data.insertName)
	tmpDf = tmpDf.append({'ptid':refPtid, 'treatment':'reference'}, ignore_index = True)"""
	if returnString:
	fn = StringIO()
	tmpDf.to_csv(fn, index=False)
	fn.seek(0)
	return fn.read()
	else:
	tmpDf.to_csv(fn, index=False)

	def to_mers(self, mersFn=None, nmers=[9], returnList=False):
	allMers = []
	for seq in self.data.seqDf.seq:
	allMers += getMers(seq.replace('-',''), nmers = nmers)
	allMers += getMers(self.data.insertSeq.replace('-',''), nmers = nmers)
	uMers = sorted(list(set(allMers)))
	if returnList:
	return filter(isvalidmer, uMers)
	else:
	with open(mersFn, 'w') as fh:
	for m in uMers:
	if isvalidmer(m):
	fh.write('%s\n' % m)
	def to_hla(self, hlaFn = None, returnList = False):
	convert = lambda h: h.replace('_','*')
	if returnList:
	return map(convert,filter(isvalidHLA,self.data.uHLA4))
	else:
	with open(hlaFn,'w') as fh:
	for h in self.data.uHLA4:
	if isvalidHLA(h):
	fh.write('%s\n' % convert(h))
	def checkBA(self,ba):
	"""Check that all kmers in seqDf and insertSeq are
	present in the binding affinities dict ba, paired with every HLA in hlaDf"""
	tot = 0
	nantot=0

	allMers = []
	for seq in self.data.seqDf.seq:
	allMers += getMers(seq.replace('-',''),nmers=[9])
	allMers += getMers(self.data.insertSeq.replace('-',''),nmers=[9])
	uMers = sorted(list(set(allMers)))
	for m in uMers:
	if isvalidmer(m):
	for h in self.data.uHLA4:
	if isvalidHLA(h):
	tot += 1
	if isnan(ba[(h,m)]):
	nantot += 1
	print 'Found nan for %d of %d total predictions (%d HLAs, %d mers, %2.0f%% missing)' % (nantot,tot,len(self.data.uHLA4),len(uMers),1e2*nantot/tot)

	def computeDerivedData(self):
	slicestr = lambda yo,ind: ''.join(array([c for c in yo])[array(ind)])

	self.data.N = self.data.seqDf.shape[0]

	"""First join ptidDf and seqDf so that plaInd is always a valid boolean index on seqDf"""
	df = self.data.seqDf.join(self.data.ptidDf)
	self.data.vacPtid = df.index[df.vaccinated]
	self.data.plaPtid = df.index[~df.vaccinated]
	"""Type of plaInd is ndarray (NOT pd.Series)"""
	self.data.vacInd = df.vaccinated.values.astype(bool)
	self.data.plaInd = (~df.vaccinated).values.astype(bool)

	self.data.ptidDf = df[self.data.ptidDf.columns]
	self.data.seqDf = df[self.data.seqDf.columns]

	"""Select region of protein based on regionInds"""
	if not self.data.regionInds is None and not self.data.isSliced:
	rInds = self.data.regionInds
	"""Slice seqDf,insertSeq,mapDf,HXB2"""
	for ptid in self.data.seqDf.index:
	seq = self.data.seqDf.seq[ptid]
	self.data.seqDf.seq[ptid] = slicestr(seq,rInds)
	self.data.insertSeq = slicestr(self.data.insertSeq,rInds)

	self.data.mapDf = self.data.mapDf.ix[rInds]
	self.data.mapDf = self.data.mapDf.set_index(arange(len(rInds)))

	self.data.HXB2 = slicestr(self.data.HXB2,rInds)
	self.data.isSliced = True

	"""Create df for looking up a site num from HXB2 coordinate"""
	self.data.hxb22site = self.data.mapDf.copy()
	self.data.hxb22site['site'] = self.data.hxb22site.index
	self.data.hxb22site = self.data.hxb22site.set_index('hxb2Pos')
	'''
	TODO: move plotting code to a different file
	def clipXVec(self,hxb2Range = None,vec=None,returnInds=False):
	"""Clip seq-axis vector based on an HXB2 coordinate range (eg [70,80])"""
	if hxb2Range is None:
	siteRange = [self.data.mapDf.index[0],self.data.mapDf.index[-1]+1]
	else:
	hxb2Range = [str(c) for c in hxb2Range]
	siteRange = [self.data.mapDf.index[self.data.mapDf.hxb2Pos == hxb2Range[0]],self.data.mapDf.index[self.data.mapDf.hxb2Pos==hxb2Range[1]]+1]
	if returnInds:
	return arange(siteRange[0],siteRange[1])
	else:
	return vec[siteRange[0]:siteRange[1]]
	def plotSeqSpace(self,hxb2Range=None,subst=None,method='tsne',interactive=False,force=False,**kwargs):
	"""Plot MDS of sequence space using a substitution matrix. If interactive then returns AnnotationPicker obj"""
	if subst is None:
	subst=blosum90
	seqs=[self.clipXVec(hxb2Range,s) for s in self.data.seqDf.seq]
	df=self.data.ptidDf.join(self.data.seqDf,how='right')
	"""uInd has length len(seqs) but indexes into uSeqs"""
	uSeqs,uInd=unique(seqs,return_inverse=True)

	group=[]
	for uniqi,s in enumerate(uSeqs):
	tmp=df.vaccinated[uInd==uniqi].unique()
	if len(tmp)==2:
	group.append('both')
	else:
	group.append(tmp[0])
	insertSeq=self.clipXVec(hxb2Range,self.data.insertSeq)
	uSeqs=append(uSeqs,insertSeq)
	group.append('insert')

	recalc=True
	"""Recalc if seqMethod doesn't exist or if its different than current method"""
	try:
	if method==self.data.temp['seqMethod']:
	dist=self.data.temp['seqDist']
	xy=self.data.temp['seqXY']
	if xy.shape[0]==len(uSeqs):
	recalc=False
	except:
	pass

	if recalc or force:
	dist=calcDistanceMatrix(uSeqs,distanceFunc=lambda s1,s2: seq_distance(s1,s2,subst=subst))
	xy=embedDistanceMatrix(dist,method=method)
	self.data.temp['seqDist']=dist
	self.data.temp['seqXY']=xy
	self.data.temp['seqMethod']=method

	freq=objhist(seqs,keys=uSeqs)
	"""Make sure the insert has a count of at least 1"""
	if freq[insertSeq]==0:
	freq[insertSeq]=1

	if all([f==1 for f in freq.values()]):
	freqVec=[30]*len(freq)
	labels=uSeqs
	else:
	freqVec=scatternorm(array([freq[s] for s in uSeqs]),30,200)
	labels=['%s: %d' % (s,freq[s]) for s in uSeqs]

	if interactive:
	picker=3
	else:
	picker=None

	clf()
	scatter(xy[:,0],xy[:,1],s=freqVec,c=[{'insert':'gold','both':'gray',True:'blue',False:'red'}[g] for g in group],picker=picker,**kwargs)
	xticks(())
	yticks(())
	if hxb2Range is None:
	hxb2Range=(self.data.hxb22site.index[0],self.data.hxb22site.index[-1])
	title('MDS Embedding of Sequence space for %s (HXB2 %s-%s)' % (insertSeq,hxb2Range[0],hxb2Range[1]))
	if interactive:
	mp=AnnotationPicker(xy[:, 0], xy[:, 1], labels,weight='bold',color='black',size='x-small')
	return mp

	def plotHLASpace(self,hxb2Range=None,hlaList=None,ba=None,method='tsne',interactive=False,**kwargs):
	"""
	Plot an MDS embedding of HLA space
	Original features were nHLAs x nMers
	"""
	seqs=[self.clipXVec(hxb2Range,s) for s in self.data.seqDf.seq]
	df=self.data.ptidDf.join(self.data.seqDf,how='right')
	"""uInd has length len(seqs) but indexes into uSeqs"""
	uSeqs,uInd=unique(seqs,return_inverse=True)

	group=[]
	for uniqi,s in enumerate(uSeqs):
	tmp=df.vaccinated[uInd==uniqi].unique()
	if len(tmp)==2:
	group.append('both')
	else:
	group.append(tmp[0])
	insertSeq=self.clipXVec(hxb2Range,self.data.insertSeq)
	uSeqs=append(uSeqs,insertSeq)
	group.append('insert')

	mers=getMers(insertSeq,nmers=[9])
	dist=empty((len(uSeqs),len(mers)*len(hlaList)))
	for si,s in enumerate(uSeqs):
	for meri, mer in enumerate(getMers(s,nmers=[9])):
	for hlai,h in enumerate(hlaList):
	pred=ba[(h,mer)]
	if isnan(pred):
	pred=15
	dist[si,int(meri*len(hlaList)+hlai)]=pred

	xy=embedDistanceMatrix(dist,method=method)
	freq=objhist(seqs,keys=uSeqs)
	"""Make sure the insert has a count of at least 1"""
	if freq[insertSeq]==0:
	freq[insertSeq]=1

	if all([f==1 for f in freq.values()]):
	freqVec=[30]*len(freq)
	labels=uSeqs
	else:
	freqVec=scatternorm(array([freq[s] for s in uSeqs]),30,200)
	labels=['%s: %d' % (s,freq[s]) for s in uSeqs]

	if interactive:
	picker=3
	else:
	picker=None

	clf()
	scatter(xy[:,0],xy[:,1],s=freqVec,c=[{'insert':'gold','both':'gray',True:'blue',False:'red'}[g] for g in group],picker=picker,**kwargs)
	xticks(())
	yticks(())
	if hxb2Range is None:
	hxb2Range=(self.data.hxb22site.index[0],self.data.hxb22site.index[-1])
	title('MDS Embedding of HLA binding space for %s (HXB2 %s-%s)' % (insertSeq,hxb2Range[0],hxb2Range[1]))
	if interactive:
	mp=AnnotationPicker(xy[:, 0], xy[:, 1], labels,weight='bold',color='black',size='x-small')
	return mp
	def plotConservation(self,region=None):
	"""Plot entropy/conservation site-wise for vaccine and placebo breakthrough sequences"""
	plotSeqEntropy(self.data.seqDf.seq,region=region)
	'''