#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import print_function
from future.utils import iteritems
from past.builtins import xrange, range, reduce

import os
import sys
import importlib as imp
import re
import gzip
import tarfile
import zipfile
import struct
import itertools
import time

import bs4
from lxml import etree

try:
    import cPickle as pickle
except:
    import pickle

import pypath.utils.mapping as mapping
import pypath.utils.reflists as reflists
import pypath.share.common as common
import pypath_common._constants as _const
import pypath.share.progress as progress
import pypath.share.curl as curl
import pypath.internals.intera as intera
import pypath.internals.refs as refs
import pypath.inputs.uniprot as uniprot_input
import pypath.resources.urls as urls
import pypath.utils.seq as seq
import pypath.share.session as session_mod
import pypath.share.cache as cache
class BioPaxReader(session_mod.Logger):
    """
    This class parses a BioPAX file and exposes its content easily
    accessible for further processing. First it opens the file, if
    necessary it extracts from the archive. Then an
    `lxml.etree.iterparse` object is created, so the iteration is
    efficient and memory requirements are minimal. The iterparse object
    is iterated then, and for each tag included in the
    `BioPaxReader.methods` dict, the appropriate method is called.
    These methods extract information from the BioPAX entity, and store
    it in arbitrary data structures: strings, lists or dicts. These are
    stored in dicts where keys are the original IDs of the tags,
    prefixed with the unique ID of the parser object. This is necessary
    to give a way to merge later the result of parsing more BioPAX
    files. For example, `id42` may identify EGFR in one file, but AKT1
    in the other. Then, the parser of the first file has a unique ID of
    a 5 letter random string, the second parser a different one, and
    the molecules with the same ID can be distinguished at merging,
    e.g. EGFR will be `ffjh2@id42` and AKT1 will be `tr9gy@id42`.
    The methods and the resulted dicts are named after the BioPAX
    elements, sometimes abbreviated. For example,
    `BioPaxReader.protein()` processes the `<bp:Protein>` elements, and
    stores the results in `BioPaxReader.proteins`.

    In its current state, this class does not parse every information
    and all BioPax entities. For example, nucleic acid related entities
    and interactions are omitted. But these easily can be added with
    minor modifications.
    """

    def __init__(
            self,
            biopax,
            source,
            cleanup_period = 800,
            file_from_archive = None,
            silent = False,
        ):
        """
        :param str,FileOpener biopax: either a filename, or a
            FileOpener object; if string is supplied, the FileOpener
            will be created internally
        :param str source: the name of the data source, e.g. *Reactome*
        :param int cleanup_period: the number of last elements stored
            during the iteration of lxml.etree.iterparse; lower number
            results lower memory usage, but might risk that an element
            is deleted before it has been processed. Default is 800,
            which is a safe option.
        :param str file_from_archive: in case of processing an archive
            which may contain multiple files (tar.gz or zip), the path
            of the file to be processed needs to be supplied.
            E.g. *BioPax/Homo_sapiens.owl*.
        :param bool silent: whether print status messages and progress
            bars during processing. If you process large number of
            small files, better to set False, in case of one large
            file, True. The default is *False*.
        """

        session_mod.Logger.__init__(self, name = 'biopax')

        self.biopax = biopax
        self.source = source
        self.file_from_archive = file_from_archive
        self.cleanup_period = cleanup_period
        self.biopax_tmp_file = None
        self.cachedir = cache.get_cachedir()
        # random prefix making IDs unique across parser instances
        self.parser_id = common.random_string()
        self.silent = silent

        # string constants: the XML namespaces and the fully qualified
        # tag/attribute names used while iterating the BioPAX XML
        self.bppref = '{http://www.biopax.org/release/biopax-level3.owl#}'
        self.rdfpref = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
        self.rdfid = '%sID' % self.rdfpref
        self.rdfab = '%sabout' % self.rdfpref
        self.rdfres = '%sresource' % self.rdfpref
        self.bpprot = '%sProtein' % self.bppref
        self.bpcplx = '%sComplex' % self.bppref
        self.bpprre = '%sProteinReference' % self.bppref
        self.bpreac = '%sBiochemicalReaction' % self.bppref
        self.bpcata = '%sCatalysis' % self.bppref
        self.bpctrl = '%sControl' % self.bppref
        self.bpmodu = '%sModulation' % self.bppref
        self.bpcoma = '%sComplexAssembly' % self.bppref
        self.bptran = '%sTransport' % self.bppref
        self.bptrbr = '%sTransportWithBiochemicalReaction' % self.bppref
        self.bpmoli = '%sMolecularInteraction' % self.bppref
        self.bppstp = '%sPathwayStep' % self.bppref
        self.bpuxrf = '%sUnificationXref' % self.bppref
        self.bpstoi = '%sStoichiometry' % self.bppref
        self.bppubr = '%sPublicationXref' % self.bppref
        self.bppath = '%sPathway' % self.bppref
        self.bpfrfe = '%sFragmentFeature' % self.bppref
        self.bpseqi = '%sSequenceInterval' % self.bppref
        self.bpseqs = '%sSequenceSite' % self.bppref
        self.bpmodf = '%sModificationFeature' % self.bppref
        self.bpmodv = '%sSequenceModificationVocabulary' % self.bppref
        self.bpmphe = '%smemberPhysicalEntity' % self.bppref
        self.bpmerf = '%smemberEntityReference' % self.bppref
        self.bperef = '%sentityReference' % self.bppref
        self.bpxref = '%sxref' % self.bppref
        self.bpdinm = '%sdisplayName' % self.bppref
        self.bprelr = '%sRelationshipXref' % self.bppref
        self.bpcsto = '%scomponentStoichiometry' % self.bppref
        self.bpcomp = '%scomponent' % self.bppref
        self.bpstoc = '%sstoichiometricCoefficient' % self.bppref
        self.bpphye = '%sphysicalEntity' % self.bppref
        self.bpcted = '%scontrolled' % self.bppref
        self.bpcter = '%scontroller' % self.bppref
        self.bpctyp = '%scontrolType' % self.bppref
        self.bppart = '%sparticipant' % self.bppref
        self.bpleft = '%sleft' % self.bppref
        self.bprgth = '%sright' % self.bppref
        self.bpsprc = '%sstepProcess' % self.bppref
        self.bpfeat = '%sfeature' % self.bppref
        self.bpfelo = '%sfeatureLocation' % self.bppref
        self.bpibeg = '%ssequenceIntervalBegin' % self.bppref
        self.bpiend = '%ssequenceIntervalEnd' % self.bppref
        self.bpseqp = '%ssequencePosition' % self.bppref
        self.bpmoty = '%smodificationType' % self.bppref
        self.bppcom = '%spathwayComponent' % self.bppref
        self.bpterm = '%sterm' % self.bppref
        self.bpdb = '%sdb' % self.bppref
        self.bpid = '%sid' % self.bppref
        self.upStr = 'UniProt'

        # the containers the tag processing methods write into;
        # all keyed by `<parser_id>@<original id>`
        self.proteins = {}
        self.pfamilies = {}
        self.complexes = {}
        self.cvariations = {}
        self.prefs = {}
        self.ids = {}
        self.reactions = {}
        self.cassemblies = {}
        self.interactions = {}
        self.transports = {}
        self.transwreas = {}
        self.stoichiometries = {}
        self.catalyses = {}
        self.controls = {}
        self.pwsteps = {}
        self.pubrefs = {}
        self.fragfeas = {}
        self.seqints = {}
        self.seqsites = {}
        self.modfeas = {}
        self.seqmodvocs = {}
        self.pathways = {}
        # interactions with a participant count other than 2
        self.interactions_not2 = {}

        # dispatch table: BioPAX tag -> name of the processing method
        self.methods = {
            self.bpprot: 'protein',
            self.bpcplx: 'cplex',
            self.bpprre: 'pref',
            self.bpuxrf: 'uxref',
            self.bprelr: 'rxref',
            self.bpstoi: 'stoichiometry',
            self.bpreac: 'reaction',
            self.bpcoma: 'cassembly',
            self.bptrbr: 'reaction',
            self.bptran: 'reaction',
            self.bpmoli: 'interaction',
            self.bpcata: 'catalysis',
            self.bpmodu: 'catalysis',
            self.bpctrl: 'control',
            self.bppstp: 'pwstep',
            self.bppubr: 'pubref',
            self.bpfrfe: 'fragfea',
            self.bpseqi: 'seqint',
            self.bpseqs: 'seqsite',
            self.bpmodf: 'modfea',
            self.bpmodv: 'seqmodvoc',
            self.bppath: 'pathway',
        }
def process(self, silent = False):
    """
    This method executes the total workflow of BioPax processing.

    :param bool silent: whether to print status messages and progress
        bars.
    """

    self.silent = silent
    self.open_biopax()
    self.biopax_size()
    self.extract()
    self.set_progress()
    self.init_etree()

    # `self.bp` is None if `init_etree` hit an XML syntax error
    if self.bp is not None:

        self.iterate()
        self.close_biopax()

        if len(self.interactions_not2):

            self._log(
                '%u interactions have not exactly 2 '
                'participants (%s).' % (
                    len(self.interactions_not2),
                    self.source,
                )
            )

    else:

        self._log(
            'XML syntax error or empty file encountered. '
            'Skipping to next file or resource.'
        )
def open_biopax(self):
    """
    Opens the BioPax file. This method should not be called directly,
    ``BioPaxReader.process()`` calls it.
    """

    # a plain file gets opened binary; an archive needs the list of
    # members to extract
    opener_args = (
        {'default_mode': 'rb'}
            if self.file_from_archive is None else
        {'files_needed': [self.file_from_archive]}
    )

    if type(self.biopax) is curl.FileOpener:

        self.opener = self.biopax

    else:

        self.opener = curl.FileOpener(self.biopax, **opener_args)

    if type(self.opener.result) is dict:

        # multi-file archive: fall back to the first member if the
        # requested one is missing
        if (
            self.file_from_archive is None or
            self.file_from_archive not in self.opener.result
        ):

            self.file_from_archive = sorted(self.opener.result.keys())[0]

        self._biopax = self.opener.result[self.file_from_archive]

    elif self.opener.type == 'gz':

        self._biopax = self.opener.gzfile

    else:

        self._biopax = self.opener.fileobj
def biopax_size(self):
    """
    Gets the uncompressed size of the BioPax XML. This is needed in
    order to have a progress bar. This method should not be called
    directly, ``BioPaxReader.process()`` calls it.
    """

    # multi-file openers expose a `sizes` dict, single-file ones a
    # scalar `size`
    if hasattr(self.opener, 'sizes'):

        self.bp_filesize = self.opener.sizes[self.file_from_archive]

    else:

        self.bp_filesize = self.opener.size
def extract(self):
    """
    Extracts the BioPax file from compressed archive. Creates a
    temporary file. This is needed to trace the progress of
    processing, which is useful in case of large files. This method
    should not be called directly, ``BioPaxReader.process()`` calls
    it.
    """

    if self.opener.type != 'plain':

        self.biopax_tmp_file = os.path.join(
            self.cachedir,
            'biopax.processing.tmp.owl',
        )
        self._log(
            'Extracting %s from %s compressed file.' % (
                self.file_from_archive,
                self.opener.type,
            )
        )

        if not self.silent:

            prg = progress.Progress(
                self.bp_filesize,
                'Extracting %s from %s compressed file' % (
                    self.file_from_archive,
                    self.opener.type,
                ),
                1000000,
            )

        with open(self.biopax_tmp_file, 'wb') as tmpf:

            # copy the archive member chunk-wise into the temp file
            while True:

                chunk = self._biopax.read(100000)

                if not self.silent:

                    prg.step(len(chunk))

                if not len(chunk):

                    break

                # gz members may yield str; the temp file is binary
                if hasattr(chunk, 'encode'):

                    chunk = chunk.encode('utf8')

                tmpf.write(chunk)

        # continue processing from the uncompressed temp file
        self._biopax = open(self.biopax_tmp_file, 'rb')

        if not self.silent:

            prg.terminate()
def init_etree(self):
    """
    Creates the ``lxml.etree.iterparse`` object. This method should
    not be called directly, ``BioPaxReader.process()`` calls it.
    """

    try:

        self.bp = etree.iterparse(self._biopax, events = ('start', 'end'))
        # consume the first event to obtain the document root
        _, self.root = next(self.bp)

    except etree.XMLSyntaxError:

        # signals `process()` to skip this file
        self.bp = None

    self.used_elements = []
def set_progress(self):
    """
    Initializes a progress bar. This method should not be called
    directly, ``BioPaxReader.process()`` calls it.
    """

    if not self.silent:

        self.prg = progress.Progress(
            self.bp_filesize,
            'Processing %s from BioPAX XML' % self.source,
            33,
        )
def iterate(self):
    """
    Iterates the BioPax XML and calls the appropriate methods for
    each element. This method should not be called directly,
    ``BioPaxReader.process()`` calls it.
    """

    self.fpos = self._biopax.tell()

    try:

        for ev, elem in self.bp:

            # step the progressbar:
            new_fpos = self._biopax.tell()

            if not self.silent:

                self.prg.step(new_fpos - self.fpos)

            self.fpos = new_fpos
            self.next_elem = elem
            self.next_event = ev
            # elements carry either an `rdf:ID` or an `rdf:about`
            # attribute; prefix it with the parser's unique ID
            if self.rdfid in elem.attrib:

                this_id = self.next_elem.get(self.rdfid)

            else:

                this_id = self.next_elem.get(self.rdfab)

            self.next_id = '%s@%s' % (self.parser_id, this_id)

            if ev == 'end' and self.next_elem.tag in self.methods:

                method = getattr(self, self.methods[self.next_elem.tag])
                method()

            self.root.clear()
            # self.used_elements.append(self.next_elem)
            # self.cleanup_hook()

    except etree.XMLSyntaxError as e:

        if not self.silent:

            self.prg.terminate(status = 'failed')

        sys.stdout.write(
            '\n\t:: Syntax error in BioPAX:\n\t\t%s\n' % str(e)
        )
        sys.stdout.flush()

    if not self.silent:

        self.prg.terminate()
def cleanup_hook(self):
    """
    Removes the used elements to free up memory. This method should
    not be called directly, ``BioPaxReader.iterate()`` calls it.
    """

    if len(self.used_elements) > self.cleanup_period:

        # drop and clear the most recent half of the retention window
        for _ in range(int(self.cleanup_period / 2)):

            elem = self.used_elements.pop()
            elem.clear()
def close_biopax(self):
    """
    Deletes the iterator and closes the file object. This method
    should not be called directly, ``BioPaxReader.process()`` calls
    it.
    """

    # drop the iterparse object first, then release the file handle
    del self.bp
    self._biopax.close()
def protein(self):
    """
    Processes a ``<bp:Protein>`` element: either a concrete protein
    (with an entityReference) stored in ``self.proteins``, or a
    protein family (memberPhysicalEntity list) in ``self.pfamilies``.
    """

    entref = self.next_elem.find(self.bperef)

    if entref is not None:

        protein = self.get_none(entref.get(self.rdfres)).replace('#', '')
        self.proteins[self.next_id] = {
            'protein': '%s@%s' % (self.parser_id, protein),
            # both keys collect `bp:feature` children
            'seqfeatures': self._bp_collect_resources(self.bpfeat),
            'modfeatures': self._bp_collect_resources(self.bpfeat),
        }

    else:

        self.pfamilies[self.next_id] = \
            self._bp_collect_resources(self.bpmphe)


def pref(self):
    """
    Processes a ``<bp:ProteinReference>``: collects its unification
    xrefs and member entity references.
    """

    self.prefs[self.next_id] = {
        'uxrefs': self._bp_collect_resources(self.bpxref),
        'prefs': self._bp_collect_resources(self.bpmerf),
    }


def uxref(self):
    """
    Processes a ``<bp:UnificationXref>``: stores the
    ``(id_type, id)`` pair in ``self.ids``.
    """

    db = self.next_elem.find(self.bpdb)

    # fix: guard against a missing `bp:db` child, exactly as the
    # sibling `rxref()` does; previously `db.text` raised
    # AttributeError on malformed xrefs
    if db is not None:

        id_type = db.text.lower()
        i = self.next_elem.find(self.bpid)

        if i is not None:

            self.ids[self.next_id] = (id_type, i.text)


def rxref(self):
    """
    Processes a ``<bp:RelationshipXref>``: stores the
    ``(id_type, id)`` pair in ``self.ids``.
    """

    db = self.next_elem.find(self.bpdb)

    if db is not None:

        id_type = db.text.lower()
        i = self.next_elem.find(self.bpid)

        if i is not None:

            self.ids[self.next_id] = (id_type, i.text)


def cplex(self):
    """
    Processes a ``<bp:Complex>``: with stoichiometries into
    ``self.complexes``, with member physical entities into
    ``self.cvariations``, with plain components into
    ``self.complexes``.
    """

    if self.next_elem.find(self.bpcsto) is not None:

        self.complexes[self.next_id] = \
            self._bp_collect_resources(self.bpcsto)

    elif self.next_elem.find(self.bpmphe) is not None:

        self.cvariations[self.next_id] = \
            self._bp_collect_resources(self.bpmphe)

    elif self.next_elem.find(self.bpcomp) is not None:

        self.complexes[self.next_id] = \
            self._bp_collect_resources(self.bpcomp)


def stoichiometry(self):
    """
    Processes a ``<bp:Stoichiometry>``: stores the physical entity
    and its integer coefficient.
    """

    snum = self.next_elem.find(self.bpstoc).text
    self.stoichiometries[self.next_id] = (
        '%s@%s' % (
            self.parser_id,
            self.next_elem.find(self.bpphye).get(
                self.rdfres
            ).replace('#', ''),
        ),
        int(float(snum)),
    )


def interaction(self):
    """
    Processes a ``<bp:MolecularInteraction>``. Only binary
    interactions are stored; others are counted in
    ``self.interactions_not2``.
    """

    # fix: MolecularInteraction has a `participant` property in
    # BioPAX Level 3, not `left`/`right` (those belong to
    # Conversion); with `bpleft` no participant was ever found
    part = self._bp_collect_resources(self.bppart)

    if len(part) == 2:

        self.interactions[self.next_id] = {
            'refs': self._bp_collect_resources(self.bpxref),
            'left': [part[0]],
            'right': [part[1]],
            'type': self.next_elem.tag.split('}')[-1],
        }

    else:

        self.interactions_not2[self.next_id] = len(part)


def reaction(self):
    """
    Processes a ``<bp:BiochemicalReaction>`` (also used for
    Transport and TransportWithBiochemicalReaction).
    """

    self.reactions[self.next_id] = {
        'refs': self._bp_collect_resources(self.bpxref),
        'left': self._bp_collect_resources(self.bpleft),
        'right': self._bp_collect_resources(self.bprgth),
        'type': self.next_elem.tag.split('}')[-1],
    }


def cassembly(self):
    """
    Processes a ``<bp:ComplexAssembly>``.
    """

    self.cassemblies[self.next_id] = {
        'refs': self._bp_collect_resources(self.bpxref),
        'left': self._bp_collect_resources(self.bpleft),
        'right': self._bp_collect_resources(self.bprgth),
        'type': self.next_elem.tag.split('}')[-1],
    }


def catalysis(self):
    """
    Processes a ``<bp:Catalysis>`` (also used for Modulation):
    stores controller, controlled and control type.
    """

    cter = self.next_elem.find(self.bpcter)
    cted = self.next_elem.find(self.bpcted)

    if cter is not None and cted is not None:

        typ = self.next_elem.find(self.bpctyp)

        self.catalyses[self.next_id] = {
            'controller': '%s@%s' % (
                self.parser_id,
                self.get_none(cter.get(self.rdfres)),
            ),
            'controlled': '%s@%s' % (
                self.parser_id,
                self.get_none(cted.get(self.rdfres)),
            ),
            'type': '' if typ is None else typ.text,
        }


def control(self):
    """
    Processes a ``<bp:Control>``: stores references, controller,
    controlled and control type.
    """

    cter = self.next_elem.find(self.bpcter)
    cted = self.next_elem.find(self.bpcted)

    if cter is not None and cted is not None:

        typ = self.next_elem.find(self.bpctyp)

        self.controls[self.next_id] = {
            'refs': self._bp_collect_resources(self.bpxref),
            'controller': '%s@%s' % (
                self.parser_id,
                cter.get(self.rdfres).replace('#', ''),
            ),
            'controlled': '%s@%s' % (
                self.parser_id,
                cted.get(self.rdfres).replace('#', ''),
            ),
            'type': '' if typ is None else typ.text,
        }


def pwstep(self):
    """
    Processes a ``<bp:PathwayStep>``.
    """

    # NOTE(review): collects `PathwayStep` children; `bpsprc`
    # (stepProcess) is defined but unused — confirm upstream intent
    self.pwsteps[self.next_id] = self._bp_collect_resources(self.bppstp)


def pubref(self):
    """
    Processes a ``<bp:PublicationXref>``: stores the PubMed ID.
    """

    pmid = self.next_elem.find(self.bpid)

    if pmid is not None:

        self.pubrefs[self.next_id] = pmid.text


def fragfea(self):
    """
    Processes a ``<bp:FragmentFeature>``: stores the feature
    location reference.
    """

    felo = self.next_elem.find(self.bpfelo).get(self.rdfres)
    self.fragfeas[self.next_id] = '%s@%s' % (
        self.parser_id,
        felo.replace('#', ''),
    )


def seqint(self):
    """
    Processes a ``<bp:SequenceInterval>``: stores the begin and end
    site references (either may be None).
    """

    beg = self.next_elem.find(self.bpibeg)
    end = self.next_elem.find(self.bpiend)

    self.seqints[self.next_id] = (
        '%s@%s' % (self.parser_id, beg.get(self.rdfres).replace('#', ''))
            if beg is not None else None,
        '%s@%s' % (self.parser_id, end.get(self.rdfres).replace('#', ''))
            if end is not None else None,
    )


def seqsite(self):
    """
    Processes a ``<bp:SequenceSite>``: stores the integer sequence
    position.
    """

    seqp = self.next_elem.find(self.bpseqp)

    if seqp is not None and seqp.text is not None:

        self.seqsites[self.next_id] = int(seqp.text)


def modfea(self):
    """
    Processes a ``<bp:ModificationFeature>``: stores the location
    and modification type references.
    """

    felo = self.next_elem.find(self.bpfelo)
    moty = self.next_elem.find(self.bpmoty)

    if felo is not None and moty is not None:

        self.modfeas[self.next_id] = (
            '%s@%s' % (
                self.parser_id,
                felo.get(self.rdfres).replace('#', ''),
            ),
            '%s@%s' % (
                self.parser_id,
                moty.get(self.rdfres).replace('#', ''),
            ),
        )


def seqmodvoc(self):
    """
    Processes a ``<bp:SequenceModificationVocabulary>``: stores the
    modification term.
    """

    term = self.next_elem.find(self.bpterm)

    if term is not None:

        self.seqmodvocs[self.next_id] = term.text


def pathway(self):
    """
    Processes a ``<bp:Pathway>``: stores its components and display
    name.
    """

    name = self.next_elem.find(self.bpdinm)

    if name is not None:

        name = name.text

    try:

        self.pathways[self.next_id] = {
            'components': self._bp_collect_resources(self.bppcom),
            'name': name,
        }

    except TypeError:

        sys.stdout.write('Wrong type at element:\n')
        sys.stdout.write(
            '%s%s' % (str(etree.tostring(self.next_elem))[:76], '...')
        )
        sys.stdout.flush()


def get_none(self, something):
    """
    Returns `something` with all `#` characters removed, or None if
    `something` is None.
    """

    if something is not None:

        return something.replace('#', '')

    return something


def _bp_collect_resources(self, tag, restype = None):
    """
    Collects the `rdf:resource` IDs of all `tag` children of the
    current element, prefixed with the parser ID; optionally filtered
    by resource type prefix.
    """

    return list(
        map(
            lambda e:
                '%s@%s' % (
                    self.parser_id,
                    e.get(self.rdfres).replace('#', ''),
                ),
            filter(
                lambda e:
                    self.rdfres in e.attrib and (
                        restype is None or
                        e.get(self.rdfres).replace(
                            '#', ''
                        ).startswith(restype)
                    ),
                self.next_elem.iterfind(tag),
            )
        )
    )
def add_source(self, source):
    """
    Adds one source name, or each element of an iterable of source
    names, to this entity.
    """

    if type(source) in _const.CHAR_TYPES:

        self._add_source(source)

    else:

        for src in source:

            self._add_source(src)


def _add_source(self, source):
    """
    Registers a single source name and ensures it has an attrs slot.
    """

    self.sources.add(source)

    if source not in self.attrs:

        self.attrs[source] = {}


def merge_attrs(self, attrs):
    """
    Deep-merges `attrs` into this entity's attributes.
    """

    self.attrs = common.merge_dicts(self.attrs, attrs)


def update_attr(self, attr):
    """
    Sets one attribute given as a path (list of keys ending in a
    value).
    """

    self.attrs = common.dict_set_path(self.attrs, attr)


def __iadd__(self, other):
    '''
    Members or ids of entities should never change, as these are
    their unique, hashable and comparable attributes.

    __iadd__ operator is for merging entities with identical members
    or ids.
    '''

    self.sources = self.sources | other.sources
    self.merge_attrs(other.attrs)

    return self
def __str__(self):

    return '%s (%s)' % (self.id, self.id_type)


def __hash__(self):

    # hash on the string form so equal entities hash equal
    return hash(self.__str__())


def __eq__(self, other):

    return self.__str__() == other.__str__()


def __gt__(self, other):

    return self.__str__() > other.__str__()


def __lt__(self, other):

    return self.__str__() < other.__str__()


def __repr__(self):

    return '%s (%s)' % (self.id, self.id_type)


def __iter__(self):
    """
    With this method it is possible to iterate ``Entity`` objects
    just like ``EntitySet`` objects.

    Yields the object.
    """

    for this in [self]:

        yield this
def expand(self):
    """
    With this method it is possible to iterate ``Entity`` objects
    just like ``EntitySet`` objects.

    Yields string.
    """

    for member_id in [self.id]:

        yield member_id
def itermembers(self):
    """
    This is a convenient iterator for the expand methods of higher
    classes like ``ReactionSide`` or ``Control``.

    Yields ``(member, attrs)`` tuples where `attrs` maps each source
    to the union of the member's children sets (or the cid itself
    where no children are recorded).
    """

    for member in self.__iter__():

        attrs = {}

        for src in self.sources:

            # for each cid of this source present on the member:
            # its `children` set, or the cid itself as a singleton
            child_sets = (
                member.attrs[src][cid]['children']
                    if 'children' in member.attrs[src][cid] else
                {cid}
                for cid in self.attrs[src]['cids']
                if cid in member.attrs[src]
            )
            # union of all sets; like the original, raises TypeError
            # if no cid matched
            attrs[src] = reduce(lambda s1, s2: s1 | s2, child_sets)

        yield member, attrs
def reload(self):
    """
    Reloads the class from its module and updates the instance.
    """

    modname = self.__class__.__module__
    mod = __import__(modname, fromlist = [modname.split('.')[0]])
    imp.reload(mod)
    new = getattr(mod, self.__class__.__name__)
    setattr(self, '__class__', new)


def load_reactome(self):
    """
    Downloads and processes the Reactome BioPAX level 3 dataset.
    """

    self._log('Loading Reactome.')

    def reactome_id_proc(_id):
        # Reactome IDs look like `P00533-2`: UniProt + isoform number
        _id = _id.split('-')

        return {
            'id': _id[0],
            'isoform': int(_id[1]) if len(_id) > 1 else 1,
        }

    biopax = curl.Curl(
        urls.urls['reactome']['biopax_l3'],
        large = True,
        silent = self.silent,
    )
    parser = BioPaxReader(
        biopax.outfile,
        'Reactome',
        file_from_archive = 'Homo_sapiens.owl',
    )
    parser.process()
    self.add_dataset(
        parser,
        id_types = {'uniprot isoform': 'uniprot'},
        process_id = reactome_id_proc,
    )


def load_acsn(self):
    """
    Downloads and processes the ACSN BioPAX level 3 dataset.
    """

    self._log('Loading ACSN.')

    biopax = curl.Curl(
        urls.urls['acsn']['biopax_l3'],
        large = True,
        silent = self.silent,
        default_mode = 'rb',
    )
    self.parser = BioPaxReader(biopax.outfile, 'ACSN')
    self.parser.process()
    self.add_dataset(self.parser, id_types = {'hgnc': 'genesymbol'})
    del self.parser


def load_kegg(self):
    """
    Downloads and processes the KEGG BioPAX level 3 dataset.
    """

    self._log('Loading KEGG.')

    biopax = curl.Curl(
        urls.urls['kegg_pws']['biopax_l3'],
        large = True,
        silent = self.silent,
    )
    parser = BioPaxReader(biopax.outfile, 'KEGG')
    parser.process()
    self.add_dataset(parser, {'uniprot knowledgebase': 'uniprot'})


def load_pid(self):
    """
    Downloads and processes the NCI-PID BioPAX level 3 dataset.
    """

    self._log('Loading NCI-PID.')

    biopax = curl.Curl(
        urls.urls['nci-pid']['biopax_l3'],
        large = True,
        silent = self.silent,
    )
    parser = BioPaxReader(biopax.outfile, 'NCI-PID')
    parser.process()
    self.add_dataset(parser)


def load_wikipathways(self):
    """
    Downloads and processes the WikiPathways BioPAX level 3 archive,
    one file per pathway.
    """

    self._log('Loading WikiPathways.')

    biopaxes = curl.Curl(
        urls.urls['wikipw']['biopax_l3'],
        large = True,
        silent = self.silent,
    )

    if not self.silent:

        prg = progress.Progress(
            len(biopaxes.result),
            'Processing multiple BioPAX files',
            1,
            percent = False,
        )

    # silence the per-file parsers; restore afterwards
    silent_default = self.silent
    self.silent = True

    for fname in biopaxes.files_multipart.keys():

        if not silent_default:

            prg.step()

        parser = BioPaxReader(
            biopaxes.outfile,
            'WikiPathways',
            file_from_archive = fname,
        )
        parser.process(silent = True)
        self.add_dataset(
            parser,
            id_types = {
                'ensembl': 'ensg',
                'entrez gene': 'entrez',
                'hgnc': 'genesymbol',
            },
        )

    if not silent_default:

        prg.terminate()

    self.silent = silent_default


def load_panther(self):
    """
    Downloads and processes the PANTHER BioPAX level 3 archive, one
    file per pathway.
    """

    self._log('Loading PANTHER.')

    biopaxes = curl.Curl(
        urls.urls['panther']['biopax_l3'],
        large = True,
        silent = self.silent,
    )

    if not self.silent:

        prg = progress.Progress(
            len(biopaxes.files_multipart),
            'Processing multiple BioPAX files',
            1,
            percent = False,
        )

    silent_default = self.silent
    self.silent = True

    for fname in biopaxes.files_multipart.keys():

        if not silent_default:

            prg.step()

        parser = BioPaxReader(
            biopaxes.outfile,
            'PANTHER',
            file_from_archive = fname,
        )
        parser.process(silent = True)
        self.add_dataset(
            parser,
            id_types = {
                'ensembl': 'ensg',
                'entrez gene': 'entrez',
                'hgnc': 'genesymbol',
            },
        )

    if not silent_default:

        prg.terminate()

    self.silent = silent_default


def load_netpath(self):
    """
    Downloads and processes the NetPath BioPAX level 3 files, one
    per pathway.
    """

    self._log('Loading NetPath.')

    names = self.netpath_names()

    if not self.silent:

        prg = progress.Progress(
            len(names),
            'Processing multiple BioPAX files',
            1,
            percent = False,
        )

    silent_default = self.silent
    self.silent = True

    for pwnum in names.keys():

        if not silent_default:

            prg.step()

        biopax = curl.Curl(
            urls.urls['netpath_bp']['biopax_l3'] % int(pwnum),
            silent = True,
        )
        parser = BioPaxReader(biopax.outfile, 'NetPath')
        parser.process(silent = True)
        self.add_dataset(
            parser,
            id_types = {
                'ensembl': 'ensg',
                'entrez gene': 'entrez',
                'hgnc': 'genesymbol',
            },
        )

    if not silent_default:

        prg.terminate()

    self.silent = silent_default


def netpath_names(self):
    """
    Scrapes the NetPath website for pathway numbers and names.

    :return dict: pathway number (str) -> pathway name
    """

    repwnum = re.compile(r'_([0-9]+)$')
    result = {}
    url = urls.urls['netpath_names']['url']
    c = curl.Curl(url)
    html = c.result
    soup = bs4.BeautifulSoup(html, 'html.parser')

    for a in soup.find_all('a'):

        if a.attrs['href'].startswith('pathways'):

            num = repwnum.findall(a.attrs['href'])[0]
            result[num] = a.text

    return result


def load_all(self):
    """
    Loads all supported BioPAX databases.
    """

    self._log('Loading all databases.')

    self.load_wikipathways()
    self.load_netpath()
    self.load_panther()
    self.load_acsn()
    self.load_pid()
    self.load_reactome()


def add_dataset(self, parser, id_types = {}, process_id = lambda x: {'id': x}):
    """
    Adds a processed ``BioPaxReader`` to this object and merges its
    content.

    :param BioPaxReader parser: a processed reader
    :param dict id_types: mapping of resource ID type names to
        standard ID type names (read-only, the mutable default is
        never modified)
    :param callable process_id: preprocessor turning a raw ID into a
        dict with at least an `id` key
    """

    self.id_types.update(id_types)
    self.source = parser.source
    self.parser = parser
    self.id_processor = process_id
    self.merge()


def merge(self):
    """
    Merges the content of the current parser into this object,
    stepping through references, proteins, families, modifications,
    complexes, reactions and controls.
    """

    if self.source not in self.huge_complexes:

        self.huge_complexes[self.source] = {}

    if self.source not in self.slow_complexes:

        self.slow_complexes[self.source] = {}

    if self.source not in self.huge_reactions:

        self.huge_reactions[self.source] = {}

    if not self.silent:

        self.prg = progress.Progress(
            12,
            'Processing %s' % self.source,
            1,
            percent = False,
        )

    self.sources.add(self.source)

    if self.source not in self.parsers:

        self.parsers[self.source] = {}

    self.parsers[self.source][self.parser.parser_id] = self.parser
    self.set_corrections()

    if not self.silent:

        self.prg.step(status = 'processing references')

    self.merge_refs()

    if not self.silent:

        self.prg.step(status = 'processing proteins')

    self.merge_proteins()

    if not self.silent:

        self.prg.step(status = 'processing protein families')

    self.merge_pfamilies()

    if not self.silent:

        self.prg.step(status = 'processing protein modifications')

    if self.modifications:

        self.merge_modifications()

    if not self.silent:

        self.prg.step(status = 'processing complexes')

    # complexes may depend on each other; a second pass handles
    # those left over from the first
    remaining = self.merge_complexes()

    if not self.silent:

        self.prg.step(status = 'processing complex variations')

    self.merge_cvariations()

    if not self.silent:

        self.prg.step(status = 'processing complexes 2')

    remaining = self.merge_complexes(this_round = remaining)

    if not self.silent:

        self.prg.step(status = 'generating complex variations')

    self.gen_cvariations()

    if not self.silent:

        self.prg.step(status = 'processing reactions')

    self.merge_reactions()

    if not self.silent:

        # 'asseblies' [sic] -- original status label kept verbatim
        self.prg.step(status = 'processing complex asseblies')

    self.merge_cassemblies()

    if not self.silent:

        self.prg.step(status = 'processing controls')

    self.merge_controls()

    if not self.silent:

        self.prg.step(status = 'processing catalyses')

    self.merge_catalyses()

    if not self.silent:

        self.prg.terminate()

    sys.stdout.write(
        '\t:: %u proteins, %u complexes, %u reactions and %u'
        ' controls have been added.\n' % (
            self.proteins_added,
            self.complexes_added,
            self.reactions_added + self.cassemblies_added,
            self.controls_added + self.catalyses_added,
        )
    )
    sys.stdout.write(
        '\t:: %u complexes have not been expanded '
        'because number of combinations larger than %u.\n' % (
            len(self.huge_complexes[self.source]),
            self.max_complex_combinations,
        )
    )
    sys.stdout.write(
        '\t:: %u complexes took longer '
        'to process than 5 seconds.\n' % (
            len(self.slow_complexes[self.source])
        )
    )
    sys.stdout.write(
        '\t:: %u reactions have not been expanded '
        'because number of combinations larger than %u.\n' % (
            len(self.huge_reactions[self.source]),
            self.max_reaction_combinations,
        )
    )
    sys.stdout.write(
        '\t:: Access them in `huge_complexes`, '
        '`slow_complexes` and `huge_reactions`.\n'
    )
    sys.stdout.flush()
    self.remove_defaults()


def remove_defaults(self):
    """
    Resets the per-dataset state after merging.
    """

    self.parser = None
    self.source = None


def set_corrections(self):
    """
    Sets source specific workarounds for ID extraction.
    """

    # because not all can follow the standards...
    if self.source == 'ACSN':

        self.pref_correction = (
            lambda l: filter(lambda e: e[6:10] == 'HUGO', l)
        )

    else:

        self.pref_correction = lambda l: l

    if self.source == 'ACSN':

        self.pref_refs = (
            lambda l: filter(lambda e: e[6:12] == 'PubMed', l)
        )

    else:

        self.pref_refs = lambda l: []

    self.ambiguous_ids_permitted = self.source in [
        'ACSN',
        'WikiPathways',
        'NetPath',
        'PANTHER',
        'NCI-PID',
        'KEGG',
    ]


def merge_refs(self):
    """
    Merges the literature references from the current parser.
    """

    self.refs_added = 0

    if self.source not in self.rrefs:

        self.rrefs[self.source] = {}

    for refid, pubmed in iteritems(self.parser.pubrefs):

        ref = Reference(pubmed, sources = self.source)
        ref.attrs[self.source]['refid'] = set([])
        ref.attrs[self.source]['refid'].add(refid)

        self.refs_added += 1

        if pubmed in self.refs:

            self.refs[pubmed] += ref

        else:

            self.refs[pubmed] = ref

        self.rrefs[self.source][refid] = pubmed


def merge_proteins(self):
    """
    Merges the proteins from the current parser, translating their
    IDs to the default protein ID type; ambiguous translations
    become protein families.
    """

    def get_protein_ids(pref):
        # collect (id_type, id) pairs and reference ids of a
        # ProteinReference, recursing into member references
        # (in PANTHER protein references are children of other
        # protein references)
        pids = []
        refs = []

        if pref in self.parser.prefs:

            uxrefs = self.pref_correction(
                self.parser.prefs[pref]['uxrefs']
            )
            pids = common.unique_list(
                map(
                    lambda uxref: self.parser.ids[uxref],
                    filter(
                        lambda uxref: uxref in self.parser.ids,
                        uxrefs,
                    ),
                )
            )
            refids = self.pref_refs(self.parser.prefs[pref]['uxrefs'])
            refs = common.unique_list(
                map(
                    lambda refid: self.parser.ids[refid],
                    filter(
                        lambda refid: refid in self.parser.ids,
                        refids,
                    ),
                )
            )

            for subpref in self.parser.prefs[pref]['prefs']:

                subpids, subrefs = get_protein_ids(subpref)
                pids.extend(subpids)
                refs.extend(subrefs)

        return pids, refs

    def map_protein_ids(ids):
        # translate all collected IDs to the default protein ID
        # type; returns one ID, a list (ambiguous), or None
        target_ids = []
        id_attrs = {}

        for id_type, _id in ids:

            std_id_type = self.id_types.get(id_type, id_type)

            id_a = self.id_processor(_id)
            id_attrs[id_a['id']] = id_a

            if id_a['id'] is not None:

                target_ids.extend(
                    self.mapper.map_name(
                        id_a['id'],
                        std_id_type,
                        self.default_id_types['protein'],
                    )
                )

        target_ids = filter(
            lambda p:
                reflists.check(
                    p,
                    self.default_id_types['protein'],
                    self.ncbi_tax_id,
                ),
            target_ids,
        )
        target_ids = common.unique_list(target_ids)

        if len(target_ids) > 1:

            if not self.ambiguous_ids_permitted:

                sys.stdout.write(
                    '\t:: Ambiguous ID '
                    'translation: from %s to %s\n' % (ids, target_ids)
                )

        elif len(target_ids) == 0:

            target_ids = None

        else:

            target_ids = list(target_ids)[0]

        return target_ids, id_attrs

    self.rproteins[self.source] = {}
    self.proteins_added = 0
    self.pfamilies_added = 0

    for pid, p in iteritems(self.parser.proteins):

        ids, pubmeds = get_protein_ids(p['protein'])
        target_id, id_attrs = map_protein_ids(ids)

        if target_id is None:

            continue

        if type(target_id) is list:

            # go for a protein family:
            self.add_pfamily(
                list(map(lambda t: (t, pid), target_id)),
                pid,
            )

            continue

        attrs = {
            self.source: {
                'prefs': set([p['protein']]),
                'pids': {pid: {}},
                'refs': set(pubmeds),
                'originals': set([]),
            }
        }

        for original_id, id_a in iteritems(id_attrs):

            attrs[self.source]['originals'].add(original_id)

            for k, v in iteritems(id_a):

                if k != 'id':

                    attrs[self.source]['pids'][pid][k] = set([])
                    attrs[self.source]['pids'][pid][k].add(v)

        protein = Protein(
            target_id,
            sources = set([self.source]),
            attrs = attrs,
        )

        self.proteins_added += 1

        if target_id in self.proteins:

            self.proteins[target_id] += protein

        else:

            self.proteins[target_id] = protein

        self.rproteins[self.source][pid] = target_id


def preprocess_seqmodvoc(self):
    """
    Translates the modification vocabulary of the current source to
    standard modification names and residues.
    """

    self.seqmod_dict = {}

    if self.source in common.mod_keywords:

        kws = common.mod_keywords[self.source]
        aas = common.aanames

        for modkey, modname in iteritems(self.parser.seqmodvocs):

            for mod_std_name, kwlist in kws:

                if all(map(lambda kw: kw in modname, kwlist)):

                    this_aa = None

                    for aan, aa in iteritems(aas):

                        if aan in modname:

                            this_aa = aa

                            break

                    self.seqmod_dict[modkey] = (mod_std_name, this_aa)


def merge_modifications(self):
    """
    Merges the protein modification features (fragments and residue
    modifications) from the current parser.
    """

    self.load_sequences()
    self.preprocess_seqmodvoc()

    def get_protein(pid):
        # resolve a parser protein ID to (target_id, isoform) pairs
        proteins = []

        if pid in self.rproteins[self.source]:

            _id = self.rproteins[self.source][pid]

            if (
                'isoform' in
                self.proteins[_id].attrs[self.source]['pids'][pid]
            ):

                for isof in (
                    self.proteins[_id].attrs[self.source]
                    ['pids'][pid]['isoform']
                ):

                    proteins.append((_id, isof))

            else:

                proteins.append((_id, None))

        return proteins

    def get_seqsite(seqsite):

        if seqsite in self.parser.seqsites:

            return int(float(self.parser.seqsites[seqsite]))

    def get_residue(protein, isof, resnum, resname):
        # look up the residue name in the known sequence, trying the
        # given isoform first, then all isoforms
        if protein in self.seq:

            if isof is not None and isof in self.seq[protein].isof:

                sresname = self.seq[protein].get(resnum, isoform = isof)

                if sresname == resname or resname is None:

                    return sresname, isof

            for isof in sorted(self.seq[protein].isoforms()):

                sresname = self.seq[protein].get(resnum, isoform = isof)

                if sresname == resname or resname is None:

                    return sresname, isof

        return resname, isof

    self.fragfeatures_added = 0
    self.modfeatures_added = 0

    for pid, p in iteritems(self.parser.proteins):

        proteins = get_protein(pid)

        for modfea in p['modfeatures']:

            if modfea in self.parser.fragfeas:

                seqint = self.parser.fragfeas[modfea]

                if seqint in self.parser.seqints:

                    start = get_seqsite(self.parser.seqints[seqint][0])
                    end = get_seqsite(self.parser.seqints[seqint][1])

                    if start is not None and end is not None:

                        for protein, isof in proteins:

                            if protein not in self.seq:

                                continue

                            if not self.seq[protein].has_isoform(isof):

                                continue

                            frag = (protein, isof, start, end)

                            if frag in self.frags:

                                self.frags[frag].add_evidences(
                                    self.source
                                )

                            else:

                                instance = self.seq[protein].get(
                                    start, end, isof
                                )
                                mot = intera.Motif(
                                    protein,
                                    start,
                                    end,
                                    isoform = isof,
                                    instance = instance,
                                    evidences = self.source,
                                )
                                self.frags[frag] = mot

                            self.proteins[protein].update_attr([
                                self.source,
                                'pids',
                                pid,
                                'frags',
                                {frag},
                            ])
                            self.fragfeatures_added += 1

            if modfea in self.parser.modfeas:

                resnum = get_seqsite(self.parser.modfeas[modfea][0])
                seqmodvoc = self.parser.modfeas[modfea][1]

                if seqmodvoc in self.seqmod_dict:

                    typ, resname = self.seqmod_dict[seqmodvoc]

                    for protein, isof in proteins:

                        if protein in self.seq and resnum is not None:

                            resname, isof = get_residue(
                                protein, isof, resnum, resname
                            )
                            mod = (protein, isof, resnum, resname, typ)

                            if mod in self.mods:

                                self.mods[mod].add_evidences(self.source)

                            else:

                                res = intera.Residue(
                                    resnum,
                                    resname,
                                    protein,
                                    isoform = isof,
                                )
                                start, end, instance = (
                                    self.seq[protein].get_region(
                                        resnum,
                                        isoform = isof,
                                    )
                                )
                                mot = intera.Motif(
                                    protein,
                                    start,
                                    end,
                                    isoform = isof,
                                    instance = instance,
                                )
                                ptm = intera.Ptm(
                                    protein,
                                    motif = mot,
                                    residue = res,
                                    evidences = self.source,
                                    isoform = isof,
                                    typ = typ,
                                )
                                self.mods[mod] = ptm

                            try:

                                self.proteins[protein].update_attr([
                                    self.source,
                                    'pids',
                                    pid,
                                    'mods',
                                    set([mod]),
                                ])

                            except:

                                # NOTE(review): bare except kept from
                                # the original; best-effort reporting
                                print(protein, pid)

                            self.modfeatures_added += 1


def merge_pfamilies(self):
    """
    Merges the protein families from the current parser, iterating
    until nested families are resolved.
    """

    if self.source not in self.rpfamilies:

        self.rpfamilies[self.source] = {}

    this_round = set(list(self.parser.pfamilies.keys()))
    next_round = []
    prev_round = -1

    # repeat while progress is made; families referring to not yet
    # processed subfamilies are postponed to the next round
    while len(this_round) - prev_round != 0:

        prev_round = len(this_round)

        for pfid in this_round:

            pids = self.parser.pfamilies[pfid]
            subpf_unproc = any(
                map(lambda pid: pid in self.parser.pfamilies, pids)
            )

            if subpf_unproc:

                next_round.append(pfid)

                continue

            proteins = list(
                map(
                    lambda pid: (self.rproteins[self.source][pid], pid),
                    filter(
                        lambda pid: pid in self.rproteins[self.source],
                        pids,
                    ),
                )
            )
            subpfs = list(
                map(
                    lambda pid: (self.rpfamilies[self.source][pid], pid),
                    filter(
                        lambda pid: pid in self.rpfamilies[self.source],
                        pids,
                    ),
                )
            )

            for spf, spfid in subpfs:

                spfmembs = list(
                    map(
                        lambda p: (p[0], p[1]['pid']),
                        iteritems(
                            self.pfamilies[spf].attrs[self.source][spfid]
                        ),
                    )
                )
                proteins.extend(spfmembs)

            self.add_pfamily(proteins, pfid)

        this_round = next_round
        next_round = []


def add_pfamily(self, proteins, pfid):
    """
    Registers one protein family given its ``(target_id, pid)``
    member pairs and its parser family ID.
    """

    if self.source not in self.rpfamilies:

        self.rpfamilies[self.source] = {}

    members = sorted(common.unique_list(map(lambda p: p[0], proteins)))

    # this necessary if we add protein family because of
    # ambiguous id mapping; we want to make sure protein
    # exists for each member of the family.
    for m in members:

        if m not in self.proteins:

            p = Protein(m, sources = self.source)
            p.attrs[self.source]['pids'] = {}
            p.attrs[self.source]['pids'][pfid] = {}
            self.proteins[m] = p

    if len(members):

        pf = ProteinFamily(members, source = self.source, parent = self)
        members = tuple(members)
        pf.attrs[self.source][pfid] = {}

        for protein, pid in proteins:

            pf.attrs[self.source][pfid][protein] = {}
            pf.attrs[self.source][pfid][protein]['pid'] = pid

        if members not in self.pfamilies:

            self.pfamilies[members] = pf

        else:

            self.pfamilies[members] += pf

        self.rpfamilies[self.source][pfid] = members
        self.pfamilies_added += 1
    def merge_complexes(self, this_round=None):
        """
        Merges complexes from the active ``BioPaxReader`` object.
        Protein families and subcomplexes are expanded, and all
        combinations are created as separate complexes. The complexes
        from the same ID are added to sets in the ``rcomplexes`` dict.

        :param this_round:
            Set of complex IDs still to process; ``None`` (default)
            means all complexes of the active parser.
        :returns:
            The set/list of complex IDs that could not be processed
            (still waiting for unresolved members) when iteration
            reached a fixed point.
        """
        self.complexes_added = 0

        if self.source not in self.rcomplexes:
            self.rcomplexes[self.source] = {}

        # complex IDs which turned out to have no protein members
        no_protein = set([])
        this_round = set(list(self.parser.complexes.keys())) \
            if this_round is None else this_round
        next_round = []
        prev_round = -1

        # iterate passes until one pass makes no progress, i.e. every
        # remaining complex waits for a member that never resolves
        while len(this_round) - prev_round != 0:

            prev_round = len(this_round)

            for cid in this_round:

                start_time = time.time()

                stois = self.parser.complexes[cid]

                # members as (entity id, stoichiometric coefficient) pairs;
                # coefficient defaults to 1 when no stoichiometry data
                if len(self.parser.stoichiometries):
                    pids = list(map(lambda stoi:
                                    self.parser.stoichiometries[stoi],
                                    stois))
                else:
                    pids = list(map(lambda comp: (comp, 1), stois))

                # postpone this complex if any member is itself a complex
                # or complex variation that has not been processed yet
                subc_unproc = \
                    any(map(lambda pid:
                            (pid[0] in self.parser.complexes or
                             pid[0] in self.parser.cvariations) and
                            pid[0] not in self.rcomplexes[self.source] and
                            pid[0] not in no_protein,
                            pids))

                if subc_unproc:
                    next_round.append(cid)
                    continue

                # plain protein members:
                # (protein key, stoichiometry, original id)
                proteins = \
                    list(map(lambda pid:
                             (self.rproteins[self.source][pid[0]],
                              pid[1], pid[0]),
                             filter(lambda pid:
                                    pid[0] in self.rproteins[self.source],
                                    pids)))

                # protein family members: one list of
                # (protein key, stoichiometry, original id) per family
                pfamilies = \
                    list(map(lambda pfid:
                             list(map(lambda memb:
                                      (memb[0], pfid[1], memb[1]['pid']),
                                      iteritems(self.pfamilies[
                                          self.rpfamilies[self.source][pfid[0]]
                                      ].attrs[self.source][pfid[0]]))),
                             filter(lambda pfid:
                                    pfid[0] in self.rpfamilies[self.source],
                                    pids)))

                # number of combinations arising from the families
                pfnum = 0

                if len(pfamilies):
                    pfnum = reduce(lambda pf1l, pf2l: pf1l * pf2l,
                                   map(lambda pf: len(pf), pfamilies))

                # give up on combinatorial explosions, record them instead
                if pfnum > self.max_complex_combinations:
                    self.huge_complexes[self.source][cid] = pfnum
                    continue

                # subcomplex members: each element iterates
                # (complex key, stoichiometry, original id)
                subcplexs = \
                    list(map(lambda scid:
                             map(lambda memb:
                                 (memb, scid[1], scid[0]),
                                 self.rcomplexes[self.source][scid[0]]),
                             filter(lambda scid:
                                    scid[0] in self.rcomplexes[self.source],
                                    pids)))

                if len(subcplexs):
                    # expand all combinations of subcomplex variations,
                    # multiplying their stoichiometries into the members'
                    subcplexs = itertools.product(*subcplexs)
                    subcmembs = []

                    for this_subcplex in subcplexs:
                        for sckey, scstoi, scid in this_subcplex:
                            if scid not in no_protein:
                                sc = self.complexes[sckey]
                                scmembs = sc.get_stoichiometries(
                                    self.source, scid, with_pids=True)
                                scmembs = list(map(lambda p:
                                                   (p[0], p[1] * scstoi, p[2]),
                                                   scmembs))
                                subcmembs.append(scmembs)

                else:
                    subcmembs = [[]]

                # second explosion guard, now including subcomplexes
                if len(subcmembs) * pfnum > self.max_complex_combinations:
                    self.huge_complexes[self.source][cid] = \
                        len(subcmembs) * pfnum
                    continue

                # `subcplexs` is not a list exactly when the complex had
                # subcomplex members (it became an itertools.product above)
                if len(proteins) or len(pfamilies) or \
                        type(subcplexs) is not list:

                    if not len(pfamilies):
                        pfamilies = [[]]
                    else:
                        pfamilies = itertools.product(*pfamilies)

                    # one new Complex per family/subcomplex combination
                    for pfamily in pfamilies:
                        for subc in subcmembs:
                            this_proteins = \
                                proteins + list(pfamily) + list(subc)
                            members = sorted(common.unique_list(
                                map(lambda p: p[0], this_proteins)))
                            if not len(members):
                                continue
                            cplex = Complex(members,
                                            source=self.source,
                                            parent=self)
                            members = tuple(members)
                            cplex.attrs[self.source][cid] = {}
                            for protein, stoi, pid in this_proteins:
                                cplex.attrs[self.source][cid][protein] = {}
                                cplex.attrs[self.source][cid][protein]['pid'] = pid
                                cplex.attrs[self.source][cid][protein]['stoi'] = stoi
                            # merge into existing complex of identical
                            # membership, or register a new one
                            if members not in self.complexes:
                                self.complexes[members] = cplex
                            else:
                                self.complexes[members] += cplex
                            self.complexes_added += 1
                            if cid not in self.rcomplexes[self.source]:
                                self.rcomplexes[self.source][cid] = set([])
                            self.rcomplexes[self.source][cid].add(members)
                else:
                    no_protein.add(cid)

                elapsed = time.time() - start_time

                # bookkeeping of complexes taking suspiciously long
                if elapsed > 5:
                    self.slow_complexes[self.source][cid] = elapsed

            this_round = next_round
            next_round = []

        return this_round
[docs]defmerge_cvariations(self):""" This processes those complexes which are in fact a set of complex variations. As simple complexes also are always extended to complex variations because they might have not only simple proteins but protein families as members, here we only add new records to the attributes of already existing complexes. After ``merge_complexes`` will be called again, to process those simple complexes which have any of the complex variations processed here among their subcomplexes. """self.cvariations_added=0forcvid,cviniteritems(self.parser.cvariations):cplexes= \
dict(map(lambdacid:(cid,self.rcomplexes[self.source][cid]),filter(lambdacid:cidinself.rcomplexes[self.source],cv)))forcid,ckeysiniteritems(cplexes):forckeyinckeys:c=self.complexes[ckey]ifcvidnotinc.attrs[self.source]andcidinc.attrs[self.source]:c.attrs[self.source][cvid]={'children':set([])}ifcidinc.attrs[self.source]:c.attrs[self.source][cvid]['children'].add(cid)c.attrs[self.source][cvid].update(c.attrs[self.source][cid])iflen(cplexes):self.rcomplexes[self.source][cvid]= \
reduce(lambdac1,c2:c1|c2,cplexes.values())self.cvariations_added+=1
[docs]defgen_cvariations(self):""" Because one key from the BioPax file might represent more complexes, *complexvariations* are created to give a way to represent sets of combinations. These are created for all complexes, even with only one unambiguous constitution. The keys are the constitutions of all the combinations listed in alphabetic order, separated by ``|``. For example, ``A,B,C|A,B,D|A,B,E``. """self.rcvariations[self.source]={}forcid,keysiniteritems(self.rcomplexes[self.source]):membs=map(lambdakey:self.complexes[key],keys)cvar=ComplexVariations(membs,source=self.source,parent=self)cvar.attrs[self.source]['cids']=set([cid])key=cvar.__str__()ifkeyinself.cvariations:self.cvariations[key]+=cvarelse:self.cvariations[key]=cvarself.rcvariations[self.source][cid]=key
    def merge_reactions(self):
        # Merge plain biochemical reactions from the active parser.
        self.reactions_added = 0
        self._merge_reactions(('reactions', 'reaction'))

    def merge_cassemblies(self):
        # Merge complex assembly reactions from the active parser.
        self.cassemblies_added = 0
        self._merge_reactions(('cassemblies', 'cassembly'))

    def _merge_reactions(self, rclass):
        """
        Merges reaction type entities from the active parser.
        Here protein families and complex variations are not expanded.

        :param tuple rclass:
            Pair of (parser attribute name, reaction type label),
            e.g. ``('reactions', 'reaction')``.
        """
        if self.source not in self.rreactions:
            self.rreactions[self.source] = {}

        def get_side(ids):
            # Resolves one side of a reaction: looks up each original ID
            # among proteins, protein families and complex variations,
            # returning the entities and a mapping of their keys to the
            # original ID and entity class.
            members = []
            memb_ids = {}
            for _id in ids:
                for cls in ('proteins', 'pfamilies', 'cvariations'):
                    r = getattr(self, 'r%s' % cls)[self.source]
                    if _id in r:
                        e = getattr(self, cls)[r[_id]]
                        members.append(e)
                        memb_ids[e.key()] = {'id': _id, 'type': cls}
            return members, memb_ids

        for rid, reac in iteritems(getattr(self.parser, rclass[0])):
            left, l_ids = get_side(reac['left'])
            right, r_ids = get_side(reac['right'])
            left_attrs = {self.source: {rid: l_ids}}
            right_attrs = {self.source: {rid: r_ids}}
            # number of combinations each side would expand to
            # (product of member set sizes; plain entities count as 1)
            nleft = \
                reduce(lambda m1, m2: m1 * m2,
                       map(lambda m:
                           len(m.members) if hasattr(m, 'members') else 1,
                           left),
                       1)
            nright = \
                reduce(lambda m1, m2: m1 * m2,
                       map(lambda m:
                           len(m.members) if hasattr(m, 'members') else 1,
                           right),
                       1)
            if len(left) or len(right):
                if nleft <= self.max_reaction_combinations and \
                        nright <= self.max_reaction_combinations:
                    reaction = Reaction(left, right,
                                        left_attrs, right_attrs,
                                        source=self.source,
                                        parent=self)
                    reaction.attrs[self.source][rid] = {}
                    # literature references known to the parser
                    this_refs = \
                        set(list(map(lambda r:
                                     self.rrefs[self.source][r],
                                     filter(lambda r:
                                            r in self.parser.pubrefs,
                                            reac['refs']))))
                    reaction.attrs[self.source][rid]['refs'] = this_refs
                    reaction.attrs[self.source][rid]['type'] = rclass[1]
                    key = reaction.__str__()
                    if key in self.reactions:
                        # print(key, type(self.reactions[key]), self.reactions[key].__str__(), type(reaction), reaction.__str__())
                        self.reactions[key] += reaction
                    else:
                        self.reactions[key] = reaction
                    # increment `reactions_added` or `cassemblies_added`
                    setattr(self, '%s_added' % rclass[0],
                            getattr(self, '%s_added' % rclass[0]) + 1)
                    self.rreactions[self.source][rid] = key
                else:
                    # too combinatorial to expand; only record the size
                    self.huge_reactions[self.source][rid] = max(nleft, nright)

    def merge_controls(self):
        # Merge generic control relations from the active parser.
        self.controls_added = 0
        self._merge_controls(('controls', 'control'))

    def merge_catalyses(self):
        # Merge catalysis relations from the active parser.
        self.catalyses_added = 0
        self._merge_controls(('catalyses', 'catalysis'))

    def _merge_controls(self, cclass):
        """
        Merges control type entities (controls/catalyses) from the
        active parser.

        :param tuple cclass:
            Pair of (parser attribute name, control class label),
            e.g. ``('controls', 'control')``.
        """
        if self.source not in self.rcontrols:
            self.rcontrols[self.source] = {}

        def get_party(_id):
            # Resolves a controller or controlled entity by its original
            # ID among proteins, families, complex variations and
            # reactions; returns (class, key, entity) or three Nones.
            for cls in ['proteins', 'pfamilies', 'cvariations', 'reactions']:
                if _id in getattr(self, 'r%s' % cls)[self.source]:
                    key = getattr(self, 'r%s' % cls)[self.source][_id]
                    entity = getattr(self, cls)[key]
                    return (cls, key, entity)
            return None, None, None

        for cid, ctrl in iteritems(getattr(self.parser, cclass[0])):
            erclass, erkey, erent = get_party(ctrl['controller'])
            edclass, edkey, edent = get_party(ctrl['controlled'])
            # print('n = %u, erclass: %s, edclass: %s, er: %s, ed: %s' % (n, erclass, edclass, ctrl['controller'], ctrl['controlled']))
            # only add the control if both parties could be resolved
            if erent is not None and edent is not None:
                this_refs = \
                    set(list(map(lambda r:
                                 self.rrefs[self.source][r],
                                 filter(lambda r:
                                        r in self.parser.pubrefs,
                                        ctrl['refs'])))) \
                    if 'refs' in ctrl else set([])
                control = Control(erent, edent,
                                  source=self.source,
                                  parent=self)
                control.attrs[self.source][cid] = {}
                control.attrs[self.source][cid]['refs'] = this_refs
                control.attrs[self.source][cid]['class'] = cclass[1]
                control.attrs[self.source][cid]['type'] = ctrl['type']
                key = control.__str__()
                if key in self.controls:
                    self.controls[key] += control
                else:
                    self.controls[key] = control
                # increment `controls_added` or `catalyses_added`
                setattr(self, '%s_added' % cclass[0],
                        getattr(self, '%s_added' % cclass[0]) + 1)
                self.rcontrols[self.source][cid] = key

    def basic_stats(self, exclude_empty=False):
        """
        Counts entities of each type per source combination, filling
        ``self.stats`` with {entity type: {sorted source tuple: count}}.

        :param bool exclude_empty:
            If True, complexes with less than 2 members, and reactions
            or controls with an empty side/party are not counted.
        """
        self.stats = {'proteins': {},
                      'complexes': {},
                      'mods': {},
                      'reactions': {},
                      'controls': {},
                      'refs': {}}
        # all non-empty combinations of the loaded sources
        comb = []
        for n in xrange(1, len(self.sources) + 1):
            comb.extend(list(itertools.combinations(self.sources, n)))
        # pair each combination with its set form for subset tests
        comb = \
            list(map(lambda s: (tuple(sorted(s)), set(s)), comb))
        for etyp in self.stats.keys():
            self.stats[etyp] = \
                dict(map(lambda s: (s[0], 0), comb))
            for e in getattr(self, etyp).values():
                for c in comb:
                    # sources of this entity, via evidences if present
                    _sources = (
                        e.evidences.get_resource_names()
                        if hasattr(e, 'evidences') else
                        common.to_set(e.sources)
                    )
                    if c[1] <= _sources:
                        # count unless `exclude_empty` rules it out:
                        # complexes need >1 member, reactions both sides
                        # non-empty, controls a non-empty controller and
                        # a reaction with both sides non-empty
                        if \
                            not exclude_empty \
                            or (etyp not in ['complexes', 'reactions', 'controls']) \
                            or (etyp == 'complexes' and len(e.members) > 1) \
                            or (etyp == 'reactions' and
                                len(e.left.members) and
                                len(e.right.members)) \
                            or (etyp == 'controls' and
                                (
                                    (
                                        (e.controller.__class__.__name__ == 'Complex' or
                                         e.controller.__class__.__name__ == 'ProteinFamily') and
                                        len(e.controller.members)
                                    ) or (
                                        e.controller.__class__.__name__ == 'ComplexVariations' and
                                        any(map(lambda m:
                                                bool(len(m.members)),
                                                e.controller.members))
                                    )
                                ) and (
                                    len(e.controlled.left.members) and
                                    len(e.controlled.right.members)
                                )):
                            self.stats[etyp][c[0]] += 1

    def simpson_stats(self):
        """
        Computes pairwise Simpson similarity between sources for each
        entity type, based on the counts from ``basic_stats``.
        Results go to ``self.simpson_sim``.
        """
        if not hasattr(self, 'stats'):
            self.basic_stats()
        self.simpson_sim = {'proteins': {},
                            'complexes': {},
                            'mods': {},
                            'reactions': {},
                            'controls': {},
                            'refs': {}}
        for etyp in self.simpson_sim.keys():
            for s1 in self.sources:
                for s2 in self.sources:
                    if s1 != s2:
                        self.simpson_sim[etyp][(s1, s2)] = \
                            common.simpson_index_counts(
                                self.stats[etyp][tuple([s1])],
                                self.stats[etyp][tuple([s2])],
                                self.stats[etyp][tuple(sorted([s1, s2]))])

    def resource_graph_edges(self, etyp):
        """
        Builds an edge list and node weights for a resource similarity
        graph of one entity type.

        :param str etyp:
            Entity type key, e.g. ``'proteins'`` or ``'complexes'``.
        :returns:
            Tuple of (edges, nodes): edges are [source1, source2,
            similarity] lists with similarity > 0; nodes map each
            source to its own entity count.
        """
        if not hasattr(self, 'simpson_sim'):
            self.simpson_stats()
        stats = self.stats[etyp]
        sim = self.simpson_sim[etyp]
        edges = []
        nodes = {}
        for s1 in self.sources:
            nodes[s1] = stats[(s1,)]
            for s2 in self.sources:
                if s1 != s2 and sim[(s1, s2)] > 0.0:
                    edges.append([s1, s2, sim[(s1, s2)]])
        return edges, nodes

    def iterate_reactions(self):
        # placeholder, not implemented
        pass

    def load_sequences(self):
        # Loads SwissProt sequences (with isoforms) lazily, only once.
        if self.seq is None:
            self.seq = seq.swissprot_seq(self.ncbi_tax_id, isoforms=True)

    # interaction iterators from here

    def expand(self):
        """
        Aggregates all interaction records from the four generator
        methods into ``self.interactions``, merging records of the same
        (partner1, partner2) pair: interaction types are collected into
        a set, sources and references are united, directedness flags
        are OR-ed.
        """
        def add_interactions(gen):
            # Folds one generator's records into the `aggregate` dict;
            # record layout: [p1, p2, type, directed, sources, refs].
            for i in gen:
                key = (i[0], i[1])
                if key not in aggregate:
                    aggregate[key] = i
                    aggregate[key][2] = set([i[2]])
                else:
                    aggregate[key][4].update(i[4])
                    aggregate[key][5].update(i[5])
                    aggregate[key][2].add(i[2])
                    aggregate[key][3] = aggregate[key][3] or i[3]
        aggregate = {}
        add_interactions(self.in_same_component())
        add_interactions(self.co_control())
        add_interactions(self.interacts_with())
        add_interactions(self.state_change())
        self.interactions = list(aggregate.values())

    def expand_by_source(self):
        """
        Same as ``expand``, but the aggregation key also includes the
        source (record element 4), so interactions are kept separately
        per source; results go to ``self.interactions_by_source`` and
        are pickled into the cache directory.
        """
        def add_interactions(gen):
            # Folds one generator's records into `aggregate`, keyed by
            # (p1, p2, source); only types, refs and directedness merge.
            for i in gen:
                key = (i[0], i[1], i[4])
                if key not in aggregate:
                    aggregate[key] = i
                    aggregate[key][2] = set([i[2]])
                else:
                    aggregate[key][5].update(i[5])
                    aggregate[key][2].add(i[2])
                    aggregate[key][3] = aggregate[key][3] or i[3]
        aggregate = {}
        add_interactions(self.in_same_component(by_source=True))
        add_interactions(self.co_control(by_source=True))
        add_interactions(self.interacts_with(by_source=True))
        add_interactions(self.state_change(by_source=True))
        self.interactions_by_source = list(aggregate.values())
        # NOTE(review): the file object passed to pickle.dump is never
        # closed explicitly; relies on garbage collection
        pickle.dump(self.interactions_by_source,
                    open(os.path.join(
                        self.cachedir,
                        'reaction_interactions_by_source.pickle',
                    ), 'wb'))
[docs]defin_same_component(self,by_source=False):""" For all complexes connects all members of the complex with each other. """self.prg=progress.Progress(len(self.complexes),'Expanding `in same component` interactions',1)aggregate_src={}forcinself.complexes.values():self.prg.step()fori,p1inenumerate(c):forp2inlist(c)[i+1:]:key=(p1,p2)ifkeynotinaggregate_src:aggregate_src[key]=set([])aggregate_src[key].update(c.sources)self.prg.terminate()for(p1,p2),siniteritems(aggregate_src):ifby_source:forssins:yield[p1,p2,'IN_SAME_COMPONENT',False,ss,set([])]else:yield[p1,p2,'IN_SAME_COMPONENT',False,s,set([])]
    def expand(self):
        """
        Expands the ``ReactionSide`` by iterating over all combinations
        of all ``ComplexVariation`` and ``ProteinFamily`` members, so
        yields ``ReactionSide`` objects with only ``Protein`` and
        ``Complex`` members. Yields tuple, because ``ReactionSide`` is
        initialized in ``Reaction``, the tuple is suitable to serve as
        ``members`` and ``attrs``.
        """
        # collecting protein attributes
        if self.is_expanded:
            # already expanded: emit the current members/attrs once
            for i in [1]:
                yield self.members, self.attrs
        else:
            try:
                # per-protein attributes, restructured as
                # {protein id: {source: {reaction id: attrs}}}
                pattrs = \
                    dict(map(lambda m:
                             (m.id,
                              dict(map(lambda d1:
                                       (d1[0],
                                        dict(map(lambda d2:
                                                 (d2[0], d2[1][m.id]),
                                                 iteritems(d1[1])))),
                                       iteritems(self.attrs)))),
                             filter(lambda m:
                                    m.type == 'protein',
                                    self.members)))
            # NOTE(review): bare except silently swallows any error here
            # and only prints the attrs; `pattrs` may then be undefined
            except:
                print(self.attrs)
            # every combination of every member's alternatives, each
            # alternative paired with the key of its parent entity
            for c in \
                    itertools.product(
                        *list(map(lambda m:
                                  list(zip(m.itermembers(),
                                           [m.key()] *
                                           (len(m.members)
                                            if hasattr(m, 'members')
                                            else 1))),
                                  self.members))):
                # fresh empty attrs: {source: {reaction id: {}}}
                attrs = dict(map(lambda s:
                                 (s,
                                  dict(map(lambda rid:
                                           (rid, {}),
                                           self.attrs[s].keys()))),
                                 self.sources))
                members = []
                for ((m, a), k) in c:
                    members.append(m)
                    if m.type == 'protein':
                        # if it was a protein, we just copy
                        if m.id in pattrs:
                            for s, d1 in iteritems(pattrs[m.id]):
                                for rid, d2 in iteritems(d1):
                                    attrs[s][rid][m.id] = d2
                        # if it is from a protein family
                        else:
                            # for each resource
                            for s, r in iteritems(attrs):
                                # for each original reaction id
                                for rid, d in iteritems(self.attrs[s]):
                                    # the key of the new entity (here: str,
                                    # uniprot id)
                                    if k in self.attrs[s][rid]:
                                        attrs[s][rid][m.key()] = (
                                            # the type is obvious, the id is from the `a` dict supplied
                                            # by the ProteinFamily object, and we look up the id belonging
                                            # to the key of the original entity
                                            {'type': 'proteins',
                                             'id': a[s][self.attrs[s][rid][k]['id']]})
                    # if it is a complex from a complex variations
                    elif m.type == 'complex':
                        for s, r in iteritems(attrs):
                            for rid, d in iteritems(r):
                                if k in self.attrs[s][rid]:
                                    cid = self.attrs[s][rid][k]['id']
                                    attrs[s][rid][m.key()] = \
                                        {'type': 'complexes', 'id': cid}
                yield members, attrs