Source code for pypath.utils.pyreact

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#  This file is part of the `pypath` python module
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#  Website:

from __future__ import print_function
from future.utils import iteritems
from past.builtins import xrange, range, reduce

import os
import sys
import importlib as imp
import re
import gzip
import tarfile
import zipfile
import struct
import itertools
import bs4
import time
from lxml import etree
    import cPickle as pickle
    import pickle

import pypath.utils.mapping as mapping
import pypath.utils.reflists as reflists
import pypath.share.common as common
import pypath_common._constants as _const
import pypath.share.progress as progress
import pypath.share.curl as curl
import pypath.internals.intera as intera
import pypath.internals.refs as refs
import pypath.inputs.uniprot as uniprot_input
import pypath.resources.urls as urls
import pypath.utils.seq as seq
import pypath.share.session as session_mod
import pypath.share.cache as cache

[docs] class BioPaxReader(session_mod.Logger): """ This class parses a BioPAX file and exposes its content easily accessible for further processing. First it opens the file, if necessary it extracts from the archive. Then an `lxml.etree.iterparse` object is created, so the iteration is efficient and memory requirements are minimal. The iterparse object is iterated then, and for each tag included in the `BioPaxReader.methods` dict, the appropriate method is called. These me- thods extract information from the BioPAX entity, and store it in arbit- rary data structures: strings, lists or dicts. These are stored in dicts where keys are the original IDs of the tags, prefixed with the unique ID of the parser object. This is necessary to give a way to merge later the result of parsing more BioPAX files. For example, `id42` may identify EGFR in one file, but AKT1 in the other. Then, the parser of the first file has a unique ID of a 5 letter random string, the second parser a different one, and the molecules with the same ID can be distinguished at merging, e.g. EGFR will be `ffjh2@id42` and AKT1 will be `tr9gy@id42`. The methods and the resulted dicts are named after the BioPAX elements, sometimes abbreviated. For example, `BioPaxReader.protein()` processes the `<bp:Protein>` elements, and stores the results in `BioPaxReader.proteins`. In its current state, this class does not parse every information and all BioPax entities. For example, nucleic acid related entities and interactions are omitted. But these easily can be added with minor mo- difications. """
[docs] def __init__( self, biopax, source, cleanup_period=800, file_from_archive=None, silent=False, ): """ :param str,FileOpener biopax: either a filename, or a FileOpener object; if string is supplied, the FileOpener will be created in- ternally :param str source: the name of the data source, e.g. *Reactome* :param int cleanup_period: the number of last elements stored during the iteration of lxml.etree.iterparse; lower number results lower memory usage, but might risk that an element is deleted before it has been processed. Default is 800, which is a safe option. :param str file_from_archive: in case of processing an archive which may contain multiple files (tar.gz or zip), the path of the file to be processed needs to be supplied. E.g. *BioPax/Homo_sapiens.owl*. :param bool silent: whether print status messages and progress bars during processing. If you process large number of small files, better to set False, in case of one large file, True. The default is *False*. """ session_mod.Logger.__init__(self, name = 'biopax') self.biopax = biopax self.source = source self.file_from_archive = file_from_archive self.cleanup_period = cleanup_period self.biopax_tmp_file = None self.cachedir = cache.get_cachedir() self.parser_id = common.random_string() self.silent = silent # string constants self.bppref = '{}' self.rdfpref = '{}' self.rdfid = '%sID' % self.rdfpref self.rdfab = '%sabout' % self.rdfpref self.rdfres = '%sresource' % self.rdfpref self.bpprot = '%sProtein' % self.bppref self.bpcplx = '%sComplex' % self.bppref self.bpprre = '%sProteinReference' % self.bppref self.bpreac = '%sBiochemicalReaction' % self.bppref self.bpcata = '%sCatalysis' % self.bppref self.bpctrl = '%sControl' % self.bppref self.bpmodu = '%sModulation' % self.bppref self.bpcoma = '%sComplexAssembly' % self.bppref self.bptran = '%sTransport' % self.bppref self.bptrbr = '%sTransportWithBiochemicalReaction' % self.bppref self.bpmoli = '%sMolecularInteraction' % self.bppref self.bppstp = '%sPathwayStep' % self.bppref self.bpuxrf = '%sUnificationXref' % self.bppref self.bpstoi = '%sStoichiometry' % self.bppref self.bppubr = '%sPublicationXref' % self.bppref self.bppath = '%sPathway' % self.bppref self.bpfrfe = '%sFragmentFeature' % self.bppref self.bpseqi = '%sSequenceInterval' % self.bppref self.bpseqs = '%sSequenceSite' % self.bppref self.bpmodf = '%sModificationFeature' % self.bppref self.bpmodv = '%sSequenceModificationVocabulary' % self.bppref self.bpmphe = '%smemberPhysicalEntity' % self.bppref self.bpmerf = '%smemberEntityReference' % self.bppref self.bperef = '%sentityReference' % self.bppref self.bpxref = '%sxref' % self.bppref self.bpdinm = '%sdisplayName' % self.bppref self.bprelr = '%sRelationshipXref' % self.bppref self.bpcsto = '%scomponentStoichiometry' % self.bppref self.bpcomp = '%scomponent' % self.bppref self.bpstoc = '%sstoichiometricCoefficient' % self.bppref self.bpphye = '%sphysicalEntity' % self.bppref self.bpcted = '%scontrolled' % self.bppref self.bpcter = '%scontroller' % self.bppref self.bpctyp = '%scontrolType' % self.bppref self.bppart = '%sparticipant' % self.bppref self.bpleft = '%sleft' % self.bppref self.bprgth = '%sright' % self.bppref self.bpsprc = '%sstepProcess' % self.bppref self.bpfeat = '%sfeature' % self.bppref self.bpfelo = '%sfeatureLocation' % self.bppref self.bpibeg = '%ssequenceIntervalBegin' % self.bppref self.bpiend = '%ssequenceIntervalEnd' % self.bppref self.bpseqp = '%ssequencePosition' % self.bppref self.bpmoty = '%smodificationType' % self.bppref self.bppcom = '%spathwayComponent' % self.bppref self.bpterm = '%sterm' % self.bppref self.bpdb = '%sdb' % self.bppref self.bpid = '%sid' % self.bppref self.upStr = 'UniProt' self.proteins = {} self.pfamilies = {} self.complexes = {} self.cvariations = {} self.prefs = {} self.ids = {} self.reactions = {} self.cassemblies = {} self.interactions = {} self.transports = {} self.transwreas = {} self.stoichiometries = {} self.catalyses = {} self.controls = {} self.pwsteps = {} self.pubrefs = {} self.fragfeas = {} self.seqints = {} self.seqsites = {} self.modfeas = {} self.seqmodvocs = {} self.pathways = {} self.interactions_not2 = {} self.methods = { self.bpprot: 'protein', self.bpcplx: 'cplex', self.bpprre: 'pref', self.bpuxrf: 'uxref', self.bprelr: 'rxref', self.bpstoi: 'stoichiometry', self.bpreac: 'reaction', self.bpcoma: 'cassembly', self.bptrbr: 'reaction', self.bptran: 'reaction', self.bpmoli: 'interaction', self.bpcata: 'catalysis', self.bpmodu: 'catalysis', self.bpctrl: 'control', self.bppstp: 'pwstep', self.bppubr: 'pubref', self.bpfrfe: 'fragfea', self.bpseqi: 'seqint', self.bpseqs: 'seqsite', self.bpmodf: 'modfea', self.bpmodv: 'seqmodvoc', self.bppath: 'pathway' }
def reload(self): modname = self.__class__.__module__ mod = __import__(modname, fromlist=[modname.split('.')[0]]) imp.reload(mod) new = getattr(mod, self.__class__.__name__) setattr(self, '__class__', new)
[docs] def process(self, silent=False): """ This method executes the total workflow of BioPax processing. :param bool silent: whether to print status messages and progress bars. """ self.silent = silent self.open_biopax() self.biopax_size() self.extract() self.set_progress() self.init_etree() if self.bp is not None: self.iterate() self.close_biopax() if len(self.interactions_not2): self._log( '%u interactions have not exactly 2 ' 'participants (%s).' % ( len(self.interactions_not2), self.source, ) ) else: self._log( 'XML syntax error or empty file encountered. ' 'Skipping to next file or resource.' )
[docs] def open_biopax(self): """ Opens the BioPax file. This method should not be called directly, ``BioPaxReader.process()`` calls it. """ opener_args = ( { 'default_mode': 'rb', } if self.file_from_archive is None else { 'files_needed': [self.file_from_archive], } ) if type(self.biopax) is curl.FileOpener: self.opener = self.biopax else: self.opener = curl.FileOpener(self.biopax, **opener_args) if type(self.opener.result) is dict: if self.file_from_archive is None or \ self.file_from_archive not in self.opener.result: self.file_from_archive = \ sorted(list(self.opener.result.keys()))[0] self._biopax = self.opener.result[self.file_from_archive] elif self.opener.type == 'gz': self._biopax = self.opener.gzfile else: self._biopax = self.opener.fileobj
[docs] def biopax_size(self): """ Gets the uncompressed size of the BioPax XML. This is needed in order to have a progress bar. This method should not be called directly, ``BioPaxReader.process()`` calls it. """ self.bp_filesize = self.opener.sizes[self.file_from_archive] \ if hasattr(self.opener, 'sizes') else self.opener.size
[docs] def extract(self): """ Extracts the BioPax file from compressed archive. Creates a temporary file. This is needed to trace the progress of processing, which is useful in case of large files. This method should not be called directly, ``BioPaxReader.process()`` calls it. """ if self.opener.type != 'plain': self.biopax_tmp_file = os.path.join( self.cachedir, 'biopax.processing.tmp.owl', ) self._log( 'Extracting %s from %s compressed file.' % ( self.file_from_archive, self.opener.type ) ) if not self.silent: prg = progress.Progress( self.bp_filesize, 'Extracting %s from %s compressed file' % (self.file_from_archive, self.opener.type), 1000000) with open(self.biopax_tmp_file, 'wb') as tmpf: while True: chunk = if not self.silent: prg.step(len(chunk)) if not len(chunk): break if hasattr(chunk, 'encode'): chunk = chunk.encode('utf8') tmpf.write(chunk) self._biopax = open(self.biopax_tmp_file, 'rb') if not self.silent: prg.terminate()
[docs] def init_etree(self): """ Creates the ``lxml.etree.iterparse`` object. This method should not be called directly, ``BioPaxReader.process()`` calls it. """ try: self.bp = etree.iterparse(self._biopax, events=('start', 'end')) _, self.root = next(self.bp) except etree.XMLSyntaxError: self.bp = None self.used_elements = []
[docs] def set_progress(self): """ Initializes a progress bar. This method should not be called directly, ``BioPaxReader.process()`` calls it. """ if not self.silent: self.prg = progress.Progress(self.bp_filesize, 'Processing %s from BioPAX XML' % self.source, 33)
[docs] def iterate(self): """ Iterates the BioPax XML and calls the appropriate methods for each element. This method should not be called directly, ``BioPaxReader.process()`` calls it. """ self.fpos = self._biopax.tell() try: for ev, elem in self.bp: # step the progressbar: new_fpos = self._biopax.tell() if not self.silent: self.prg.step(new_fpos - self.fpos) self.fpos = new_fpos self.next_elem = elem self.next_event = ev self.next_id = '%s@%s' % \ (self.parser_id, self.next_elem.get(self.rdfid)) \ if self.rdfid in elem.attrib \ else '%s@%s' % \ (self.parser_id, self.next_elem.get(self.rdfab)) if ev == 'end' and self.next_elem.tag in self.methods: method = getattr(self, self.methods[self.next_elem.tag]) method() self.root.clear() # self.used_elements.append(self.next_elem) # self.cleanup_hook() except etree.XMLSyntaxError as e: if not self.silent: self.prg.terminate(status='failed') sys.stdout.write('\n\t:: Syntax error in BioPAX:\n\t\t%s\n' % str(e)) sys.stdout.flush() if not self.silent: self.prg.terminate()
[docs] def cleanup_hook(self): """ Removes the used elements to free up memory. This method should not be called directly, ``BioPaxReader.iterate()`` calls it. """ if len(self.used_elements) > self.cleanup_period: for _ in xrange(int(self.cleanup_period / 2)): e = self.used_elements.pop() e.clear()
[docs] def close_biopax(self): """ Deletes the iterator and closes the file object. This method should not be called directly, ``BioPaxReader.process()`` calls it. """ del self.bp self._biopax.close()
def protein(self): entref = self.next_elem.find(self.bperef) if entref is not None: protein = self.get_none(entref.get(self.rdfres)).replace('#', '') self.proteins[self.next_id] = { 'protein': '%s@%s' % (self.parser_id, protein), 'seqfeatures': self._bp_collect_resources(self.bpfeat), 'modfeatures': self._bp_collect_resources(self.bpfeat) } else: self.pfamilies[self.next_id] = \ self._bp_collect_resources(self.bpmphe) def pref(self): self.prefs[self.next_id] = {} self.prefs[self.next_id]['uxrefs'] = \ self._bp_collect_resources(self.bpxref) self.prefs[self.next_id]['prefs'] = \ self._bp_collect_resources(self.bpmerf) def uxref(self): db = self.next_elem.find(self.bpdb) id_type = db.text.lower() i = self.next_elem.find(self.bpid) if i is not None: self.ids[self.next_id] = (id_type, i.text) def rxref(self): db = self.next_elem.find(self.bpdb) if db is not None: id_type = db.text.lower() i = self.next_elem.find(self.bpid) if i is not None: self.ids[self.next_id] = (id_type, i.text) def cplex(self): if self.next_elem.find(self.bpcsto) is not None: self.complexes[self.next_id] = \ self._bp_collect_resources(self.bpcsto) elif self.next_elem.find(self.bpmphe) is not None: self.cvariations[self.next_id] = \ self._bp_collect_resources(self.bpmphe) elif self.next_elem.find(self.bpcomp) is not None: self.complexes[self.next_id] = \ self._bp_collect_resources(self.bpcomp) def stoichiometry(self): snum = self.next_elem.find(self.bpstoc).text self.stoichiometries[self.next_id] = ('%s@%s' % ( self.parser_id, self.next_elem.find(self.bpphye).get(self.rdfres).replace( '#', '')), int(float(snum))) def interaction(self): part = self._bp_collect_resources(self.bpleft) if len(part) == 2: self.interactions[self.next_id] = { 'refs': self._bp_collect_resources(self.bpxref), 'left': [part[0]], 'right': [part[1]], 'type': self.next_elem.tag.split('}')[-1] } else: self.interactions_not2[self.next_id] = len(part) def reaction(self): self.reactions[self.next_id] = { 'refs': self._bp_collect_resources(self.bpxref), 'left': self._bp_collect_resources(self.bpleft), 'right': self._bp_collect_resources(self.bprgth), 'type': self.next_elem.tag.split('}')[-1] } def cassembly(self): self.cassemblies[self.next_id] = { 'refs': self._bp_collect_resources(self.bpxref), 'left': self._bp_collect_resources(self.bpleft), 'right': self._bp_collect_resources(self.bprgth), 'type': self.next_elem.tag.split('}')[-1] } def catalysis(self): cter = self.next_elem.find(self.bpcter) cted = self.next_elem.find(self.bpcted) if cter is not None and cted is not None: typ = self.next_elem.find(self.bpctyp) self.catalyses[self.next_id] = { 'controller': '%s@%s' % (self.parser_id, self.get_none(cter.get(self.rdfres))), 'controlled': '%s@%s' % (self.parser_id, self.get_none(cted.get(self.rdfres))), 'type': '' if typ is None else typ.text } def control(self): cter = self.next_elem.find(self.bpcter) cted = self.next_elem.find(self.bpcted) if cter is not None and cted is not None: typ = self.next_elem.find(self.bpctyp) self.controls[self.next_id] = { 'refs': self._bp_collect_resources(self.bpxref), 'controller': '%s@%s' % (self.parser_id, cter.get(self.rdfres).replace('#', '')), 'controlled': '%s@%s' % (self.parser_id, cted.get(self.rdfres).replace('#', '')), 'type': '' if typ is None else typ.text } def pwstep(self): self.pwsteps[self.next_id] = self._bp_collect_resources(self.bppstp) def pubref(self): pmid = self.next_elem.find(self.bpid) if pmid is not None: self.pubrefs[self.next_id] = pmid.text def fragfea(self): felo = self.next_elem.find(self.bpfelo).get(self.rdfres) self.fragfeas[self.next_id] = '%s@%s' % \ (self.parser_id, felo.replace('#', '')) def seqint(self): beg = self.next_elem.find(self.bpibeg) end = self.next_elem.find(self.bpiend) self.seqints[self.next_id] = ( '%s@%s' % (self.parser_id, beg.get(self.rdfres).replace('#', '')) if beg is not None else None, '%s@%s' % (self.parser_id, end.get(self.rdfres).replace('#', '')) if end is not None else None) def seqsite(self): seqp = self.next_elem.find(self.bpseqp) if seqp is not None and seqp.text is not None: self.seqsites[self.next_id] = int(seqp.text) def modfea(self): felo = self.next_elem.find(self.bpfelo) moty = self.next_elem.find(self.bpmoty) if felo is not None and moty is not None: self.modfeas[self.next_id] = ( '%s@%s' % ( self.parser_id, self.next_elem.find(self.bpfelo).get(self.rdfres).replace( '#', '')), '%s@%s' % (self.parser_id, self.next_elem.find(self.bpmoty).get(self.rdfres).replace( '#', ''))) def seqmodvoc(self): term = self.next_elem.find(self.bpterm) if term is not None: self.seqmodvocs[self.next_id] = term.text def pathway(self): name = self.next_elem.find(self.bpdinm) if name is not None: name = name.text try: self.pathways[self.next_id] = { 'components': self._bp_collect_resources(self.bppcom), 'name': name } except TypeError: sys.stdout.write('Wrong type at element:\n') sys.stdout.write('%s %s' % (str(etree.tostring(self.next_elem))[:76], '...')) sys.stdout.flush() def get_none(self, something): if something is not None: return something.replace('#', '') return something def _bp_collect_resources(self, tag, restype=None): return \ list( map( lambda e: '%s@%s' % (self.parser_id, e.get(self.rdfres).replace('#', '')), filter( lambda e: self.rdfres in e.attrib and ( restype is None or e.get(self.rdfres).replace('#', '') .startswith(restype) ), self.next_elem.iterfind(tag) ) ) )
[docs] class AttributeHandler(object):
[docs] def __init__(self): pass
def add_source(self, source): if type(source) in _const.CHAR_TYPES: self._add_source(source) else: for s in source: self._add_source(s) def _add_source(self, source): self.sources.add(source) if source not in self.attrs: self.attrs[source] = {} def merge_attrs(self, attrs): self.attrs = common.merge_dicts(self.attrs, attrs) def update_attr(self, attr): self.attrs = common.dict_set_path(self.attrs, attr) def __iadd__(self, other): ''' Members or ids of entities should never change, as these are their unique, hashable and comparable attributes. __iadd__ operator is for merging entities with identical members or ids. ''' self.sources = self.sources | other.sources self.merge_attrs(other.attrs) return self
[docs] class Entity(AttributeHandler):
[docs] def __init__(self, identifier, id_type, sources=[], attrs=None): super(Entity, self).__init__() = identifier self.id_type = id_type self.sources = set([]) self.attrs = {} self.add_source(sources) if attrs is not None: self.merge_attrs(attrs)
def __str__(self): return '%s (%s)' % (, self.id_type) def __hash__(self): return hash(self.__str__()) def __eq__(self, other): return self.__str__() == other.__str__() def __gt__(self, other): return self.__str__() > other.__str__() def __lt__(self, other): return self.__str__() < other.__str__() def __repr__(self): return '%s (%s)' % (, self.id_type) def __iter__(self): """ With this method it is possible to iterate ``Entity`` objects just like ``EntitySet`` objects. Yields the object. """ for i in [self]: yield i
[docs] def expand(self): """ With this method it is possible to iterate ``Entity`` objects just like ``EntitySet`` objects. Yields string. """ for i in []: yield i
def itermembers(self): for m in self.__iter__(): yield (m, {})
[docs] class Protein(Entity):
[docs] def __init__(self, protein_id, id_type='uniprot', sources=[], attrs=None): super(Protein, self).__init__( protein_id, id_type, sources=sources, attrs=attrs) self.type = 'protein'
def reload(self): modname = self.__class__.__module__ mod = __import__(modname, fromlist=[modname.split('.')[0]]) imp.reload(mod) new = getattr(mod, self.__class__.__name__) setattr(self, '__class__', new) def key(self): return def proteins(self): return set([]) def __ror__(self, other): return self.proteins() | other def __rand__(self, other): return self.proteins() & other def __rsub__(self, other): return self.proteins() - other
[docs] class Reference(Entity):
[docs] def __init__(self, ref_id, id_type='pubmed', sources=[]): super(Reference, self).__init__(ref_id, id_type, sources=sources)
def get_ref(self): return refs.Reference(self.ref_id)
[docs] class EntitySet(AttributeHandler):
[docs] def __init__(self, members, sources=[], sep=';', parent=None): super(EntitySet, self).__init__() self.parent = parent self.members = sorted(common.unique_list(members)) self.set = set(self.members) self.sources = set([]) self.attrs = {} self.add_source(sources) self.originals = {} self.type = None self.sep = sep
def __str__(self): return self.sep.join(sorted(map(lambda x: str(x), list(self.members)))) def __repr__(self): return '%s: %s' % (self.__class__.__name__, self.__str__()) def __hash__(self): return hash(self.__str__()) def __eq__(self, other): return self.__str__() == other.__str__() def __gt__(self, other): return self.__str__() > other.__str__() def __lt__(self, other): return self.__str__() < other.__str__() def __iter__(self): for m in self.members: yield m def add_source(self, source): self.sources.add(source) if source not in self.attrs: self.attrs[source] = {}
[docs] class Intersecting(object):
[docs] def __init__(self): pass
def __eq__(self, other): return self.type == other.type and \ not self.set.isdisjoint(other.set)
[docs] class Complex(EntitySet):
[docs] def __init__(self, members, source, parent=None): super(Complex, self).__init__(members, source, parent=parent) self.type = 'complex'
def __str__(self): return '<%s>' % EntitySet.__str__(self) def reload(self): modname = self.__class__.__module__ mod = __import__(modname, fromlist=[modname.split('.')[0]]) imp.reload(mod) new = getattr(mod, self.__class__.__name__) setattr(self, '__class__', new) def get_stoichiometries(self, source, cid, with_pids=False): if source in self.sources and cid in self.attrs[source]: return \ list( map( lambda memb: (memb[0], memb[1]['stoi'], memb[1]['pid']) if with_pids else (memb[0], memb[1]['stoi']), filter( lambda memb: type(memb[1]) is dict and 'stoi' in memb[1] and 'pid' in memb[1], iteritems(self.attrs[source][cid]) ) ) ) def expand(self): for i, m1 in enumerate(self.members): for m2 in self.members[i + 1:]: yield m1, m2 def itermembers(self): for m in self.members: return self.parent.proteins[m] def key(self): return tuple(self.members) def proteins(self): return set(self.members) def __ror__(self, other): return self.proteins() | other def __rand__(self, other): return self.proteins() & other def __rsub__(self, other): return self.proteins() - other
[docs] class ProteinFamily(Intersecting, EntitySet):
[docs] def __init__(self, members, source, parent=None): EntitySet.__init__(self, members, source, parent=parent) Intersecting.__init__(self) self.type = 'pfamily'
def __str__(self): return '|%s|' % EntitySet.__str__(self) def reload(self): modname = self.__class__.__module__ mod = __import__(modname, fromlist=[modname.split('.')[0]]) imp.reload(mod) new = getattr(mod, self.__class__.__name__) setattr(self, '__class__', new) def _itermembers1(self): """ Iterates protein family, yields proteins. """ for m in self.members: if m in self.parent.proteins: yield self.parent.proteins[m] else: this_attrs = dict(map(lambda s: (s, {'pids': {}, 'prefs': {}, 'originals': {}}), self.source)) for s, a in iteritems(self.attrs): for pfid, aa in iteritems(a): this_attrs[s]['pids'][pfid] = {} yield Protein(m, sources=self.sources, attrs=this_attrs) def expand(self): for pkey in self.__iter__(): yield self.parent.proteins[pkey] def itermembers(self): for m in self.expand(): attrs = \ dict( map( lambda s: ( s, dict( map( lambda pf: (pf[0], pf[1][]['pid']), iteritems(self.attrs[s]) ) ) ), self.sources ) ) yield (m, attrs) def key(self): return tuple(self.members) def proteins(self): return set(self.members) def __ror__(self, other): return self.proteins() | other def __rand__(self, other): return self.proteins() & other def __rsub__(self, other): return self.proteins() - other
[docs] class ComplexVariations(Intersecting, EntitySet):
[docs] def __init__(self, members, source, parent=None): EntitySet.__init__(self, members, source, sep='|', parent=parent) Intersecting.__init__(self) self.type = 'cvariations'
def __str__(self): return '<%s>' % EntitySet.__str__(self) def reload(self): modname = self.__class__.__module__ mod = __import__(modname, fromlist=[modname.split('.')[0]]) imp.reload(mod) new = getattr(mod, self.__class__.__name__) setattr(self, '__class__', new) def expand(self): for m in self.members: for m1, m2 in m.expand(): yield m1, m2
[docs] def itermembers(self): """ This is a convenient iterator for the expand methods of higher classes like ``ReactionSide`` or ``Control``. """ for m in self.__iter__(): attrs = dict( map( lambda s: ( s, reduce( lambda c1, c2: c1 | c2, map( lambda cid: m.attrs[s][cid]['children'] if 'children' in m.attrs[s][cid] else set([cid]), filter( lambda cid: cid in m.attrs[s], self.attrs[s]['cids'] ) ) ) ), self.sources ) ) yield (m, attrs)
def key(self): return self.__str__() def proteins(self): return \ reduce( lambda m1, m2: m1 | m2, self.members, set([]) ) def __ror__(self, other): return self.proteins() | other def __rand__(self, other): return self.proteins() & other def __rsub__(self, other): return self.proteins() - other
[docs] class PyReact(session_mod.Logger):
[docs] def __init__( self, ncbi_tax_id=9606, default_id_types={}, modifications=True, seq=None, silent=False, max_complex_combinations=100, max_reaction_combinations=100, ): self.cachedir = cache.get_cachedir() self.ncbi_tax_id = ncbi_tax_id self.modifications = modifications self.parsers = {} self.sources = set([]) self.seq = seq self.mapper = mapping self.silent = silent self.max_complex_combinations = max_complex_combinations self.max_reaction_combinations = max_reaction_combinations self.slow_complexes = {} self.huge_complexes = {} self.huge_reactions = {} self.refs = {} self.species = {} self.proteins = {} self.pfamilies = {} self.complexes = {} self.cvariations = {} self.reactions = {} self.controls = {} self.mods = {} self.frags = {} self.rrefs = {} self.rproteins = {} self.rpfamilies = {} self.rcomplexes = {} self.rcvariations = {} self.rreactions = {} self.rcontrols = {} self.references = {} self.id_types = {} self.default_id_types = {'protein': 'uniprot'} self.default_id_types.update(default_id_types) session_mod.Logger.__init__(self, name = 'pyreact')
def reload(self): modname = self.__class__.__module__ mod = __import__(modname, fromlist=[modname.split('.')[0]]) imp.reload(mod) new = getattr(mod, self.__class__.__name__) setattr(self, '__class__', new) def load_reactome(self): self._log('Loading Reactome.') def reactome_id_proc(_id): _id = _id.split('-') return { 'id': _id[0], 'isoform': int(_id[1]) if len(_id) > 1 else 1 } biopax = curl.Curl( urls.urls['reactome']['biopax_l3'], large=True, silent=self.silent, ) parser = BioPaxReader( biopax.outfile, 'Reactome', file_from_archive='Homo_sapiens.owl' ) parser.process() self.add_dataset( parser, id_types={'uniprot isoform': 'uniprot'}, process_id=reactome_id_proc) def load_acsn(self): self._log('Loading ACSN.') biopax = curl.Curl( urls.urls['acsn']['biopax_l3'], large=True, silent=self.silent, default_mode = 'rb', ) self.parser = BioPaxReader(biopax.outfile, 'ACSN') self.parser.process() self.add_dataset(self.parser, id_types={'hgnc': 'genesymbol'}) del self.parser def load_kegg(self): self._log('Loading KEGG.') biopax = curl.Curl( urls.urls['kegg_pws']['biopax_l3'], large=True, silent=self.silent, ) parser = BioPaxReader(biopax.outfile, 'KEGG') parser.process() self.add_dataset(parser, {'uniprot knowledgebase': 'uniprot'}) def load_pid(self): self._log('Loading NCI-PID.') biopax = curl.Curl( urls.urls['nci-pid']['biopax_l3'], large=True, silent=self.silent, ) parser = BioPaxReader(biopax.outfile, 'NCI-PID') parser.process() self.add_dataset(parser) def load_wikipathways(self): self._log('Loading WikiPathways.') biopaxes = curl.Curl( urls.urls['wikipw']['biopax_l3'], large=True, silent=self.silent ) if not self.silent: prg = progress.Progress( len(biopaxes.result), 'Processing multiple BioPAX files', 1, percent=False, ) silent_default = self.silent self.silent = True for fname in biopaxes.files_multipart.keys(): if not silent_default: prg.step() parser = BioPaxReader( biopaxes.outfile, 'WikiPathways', file_from_archive=fname) parser.process(silent=True) self.add_dataset( parser, id_types={ 'ensembl': 'ensg', 'entrez gene': 'entrez', 'hgnc': 'genesymbol' }) if not silent_default: prg.terminate() self.silent = silent_default def load_panther(self): self._log('Loading PANTHER.') biopaxes = curl.Curl( urls.urls['panther']['biopax_l3'], large=True, silent=self.silent) if not self.silent: prg = progress.Progress( len(biopaxes.files_multipart), 'Processing multiple BioPAX files', 1, percent=False) silent_default = self.silent self.silent = True for fname in biopaxes.files_multipart.keys(): if not silent_default: prg.step() parser = BioPaxReader( biopaxes.outfile, 'PANTHER', file_from_archive=fname) parser.process(silent=True) self.add_dataset( parser, id_types={ 'ensembl': 'ensg', 'entrez gene': 'entrez', 'hgnc': 'genesymbol' }) if not silent_default: prg.terminate() self.silent = silent_default def load_netpath(self): self._log('Loading NetPath.') names = self.netpath_names() if not self.silent: prg = progress.Progress( len(names), 'Processing multiple BioPAX files', 1, percent=False) silent_default = self.silent self.silent = True for pwnum in names.keys(): if not silent_default: prg.step() biopax = curl.Curl( urls.urls['netpath_bp']['biopax_l3'] % int(pwnum), silent=True) parser = BioPaxReader(biopax.outfile, 'NetPath') parser.process(silent=True) self.add_dataset( parser, id_types={ 'ensembl': 'ensg', 'entrez gene': 'entrez', 'hgnc': 'genesymbol' }) if not silent_default: prg.terminate() self.silent = silent_default def netpath_names(self): repwnum = re.compile(r'_([0-9]+)$') result = {} url = urls.urls['netpath_names']['url'] c = curl.Curl(url) html = c.result soup = bs4.BeautifulSoup(html, 'html.parser') for a in soup.find_all('a'): if a.attrs['href'].startswith('pathways'): num = repwnum.findall(a.attrs['href'])[0] name = a.text result[num] = name return result def load_all(self): self._log('Loading all databases.') self.load_wikipathways() self.load_netpath() self.load_panther() self.load_acsn() self.load_pid() self.load_reactome() def add_dataset(self, parser, id_types={}, process_id=lambda x: {'id': x}): self.id_types.update(id_types) self.source = parser.source self.parser = parser self.id_processor = process_id self.merge() def merge(self): if self.source not in self.huge_complexes: self.huge_complexes[self.source] = {} if self.source not in self.slow_complexes: self.slow_complexes[self.source] = {} if self.source not in self.huge_reactions: self.huge_reactions[self.source] = {} if not self.silent: self.prg = progress.Progress( 12, 'Processing %s' % self.source, 1, percent=False) self.sources.add(self.source) if self.source not in self.parsers: self.parsers[self.source] = {} self.parsers[self.source][self.parser.parser_id] = self.parser self.set_corrections() if not self.silent: self.prg.step(status='processing references') self.merge_refs() if not self.silent: self.prg.step(status='processing proteins') self.merge_proteins() if not self.silent: self.prg.step(status='processing protein families') self.merge_pfamilies() if not self.silent: self.prg.step(status='processing protein modifications') if self.modifications: self.merge_modifications() if not self.silent: self.prg.step(status='processing complexes') remaining = self.merge_complexes() if not self.silent: self.prg.step(status='processing complex variations') self.merge_cvariations() if not self.silent: self.prg.step(status='processing complexes 2') remaining = self.merge_complexes(this_round=remaining) if not self.silent: self.prg.step(status='generating complex variations') self.gen_cvariations() if not self.silent: self.prg.step(status='processing reactions') self.merge_reactions() if not self.silent: self.prg.step(status='processing complex asseblies') self.merge_cassemblies() if not self.silent: self.prg.step(status='processing controls') self.merge_controls() if not self.silent: self.prg.step(status='processing catalyses') self.merge_catalyses() if not self.silent: self.prg.terminate() sys.stdout.write('\t:: %u proteins, %u complexes' ', %u reactions and %u' ' controls have been added.\n' % (self.proteins_added, self.complexes_added, self.reactions_added + self.cassemblies_added, self.controls_added + self.catalyses_added)) sys.stdout.write('\t:: %u complexes have not been expanded ' 'because number of combinations larger than %u.\n' % (len(self.huge_complexes[self.source]), self.max_complex_combinations)) sys.stdout.write('\t:: %u complexes took longer ' 'to process than 5 seconds.\n' % (len(self.slow_complexes[self.source]))) sys.stdout.write('\t:: %u reactions have not been expanded ' 'because number of combinations larger than %u.\n' % (len(self.huge_reactions[self.source]), self.max_reaction_combinations)) sys.stdout.write('\t:: Access them in `huge_complexes`, ' '`slow_complexes` and `huge_reactions`.\n') sys.stdout.flush() self.remove_defaults() def remove_defaults(self): self.parser = None self.source = None def set_corrections(self): # because not all can follow the standards... if self.source == 'ACSN': self.pref_correction = lambda l: filter(lambda e: e[6:10] == 'HUGO', l) else: self.pref_correction = lambda l: l if self.source == 'ACSN': self.pref_refs = lambda l: filter(lambda e: e[6:12] == 'PubMed', l) else: self.pref_refs = lambda l: [] if self.source in [ 'ACSN', 'WikiPathways', 'NetPath', 'PANTHER', 'NCI-PID', 'KEGG' ]: self.ambiguous_ids_permitted = True else: self.ambiguous_ids_permitted = False def merge_refs(self): self.refs_added = 0 if self.source not in self.rrefs: self.rrefs[self.source] = {} for refid, pubmed in iteritems(self.parser.pubrefs): ref = Reference(pubmed, sources=self.source) ref.attrs[self.source]['refid'] = set([]) ref.attrs[self.source]['refid'].add(refid) self.refs_added += 1 if pubmed in self.refs: self.refs[pubmed] += ref else: self.refs[pubmed] = ref self.rrefs[self.source][refid] = pubmed def merge_proteins(self): def get_protein_ids(pref): pids = [] refs = [] if pref in self.parser.prefs: uxrefs = self.pref_correction(self.parser.prefs[pref][ 'uxrefs']) pids = \ common.unique_list( map( lambda uxref: self.parser.ids[uxref], filter( lambda uxref: uxref in self.parser.ids, uxrefs ) ) ) refids = self.pref_refs(self.parser.prefs[pref]['uxrefs']) refs = \ common.unique_list( map( lambda refid: self.parser.ids[refid], filter( lambda refid: refid in self.parser.ids, refids ) ) ) # in panther proteinreferences are children of # other proteinreferences... for subpref in self.parser.prefs[pref]['prefs']: subpids, subrefs = get_protein_ids(subpref) pids.extend(subpids) refs.extend(subrefs) return pids, refs def map_protein_ids(ids): target_ids = [] id_attrs = {} for id_type, _id in ids: if id_type in self.id_types: std_id_type = self.id_types[id_type] else: std_id_type = id_type id_a = self.id_processor(_id) id_attrs[id_a['id']] = id_a if id_a['id'] is not None: target_ids.extend( self.mapper.map_name(id_a['id'], std_id_type, self.default_id_types['protein'])) target_ids = filter( lambda p: reflists.check( p, self.default_id_types['protein'], self.ncbi_tax_id ), target_ids) target_ids = common.unique_list(target_ids) if len(target_ids) > 1: if not self.ambiguous_ids_permitted: sys.stdout.write('\t:: Ambiguous ID ' 'translation: from %s to %s\n' % (ids, target_ids)) elif len(target_ids) == 0: target_ids = None else: target_ids = list(target_ids)[0] return target_ids, id_attrs self.rproteins[self.source] = {} self.proteins_added = 0 self.pfamilies_added = 0 for pid, p in iteritems(self.parser.proteins): ids, pubmeds = get_protein_ids(p['protein']) target_id, id_attrs = map_protein_ids(ids) if target_id is None: continue if type(target_id) is list: # go for a protein family: self.add_pfamily(list(map(lambda t: (t, pid), target_id)), pid) continue attrs = { self.source: { 'prefs': set([p['protein']]), 'pids': { pid: {} }, 'refs': set(pubmeds), 'originals': set([]) } } for original_id, id_a in iteritems(id_attrs): attrs[self.source]['originals'].add(original_id) for k, v in iteritems(id_a): if k != 'id': attrs[self.source]['pids'][pid][k] = set([]) attrs[self.source]['pids'][pid][k].add(v) protein = Protein( target_id, sources=set([self.source]), attrs=attrs) self.proteins_added += 1 if target_id in self.proteins: self.proteins[target_id] += protein else: self.proteins[target_id] = protein self.rproteins[self.source][pid] = target_id def preprocess_seqmodvoc(self): self.seqmod_dict = {} if self.source in common.mod_keywords: kws = common.mod_keywords[self.source] aas = common.aanames for modkey, modname in iteritems(self.parser.seqmodvocs): for mod_std_name, kwlist in kws: if all(map(lambda kw: kw in modname, kwlist)): this_aa = None for aan, aa in iteritems(aas): if aan in modname: this_aa = aa break self.seqmod_dict[modkey] = (mod_std_name, this_aa) def merge_modifications(self): self.load_sequences() self.preprocess_seqmodvoc() def get_protein(pid): proteins = [] if pid in self.rproteins[self.source]: _id = self.rproteins[self.source][pid] if 'isoform' in \ self.proteins[_id].attrs[self.source]['pids'][pid]: for isof in self.proteins[_id].attrs[self.source]\ ['pids'][pid]['isoform']: proteins.append((_id, isof)) else: proteins.append((_id, None)) return proteins def get_seqsite(seqsite): if seqsite in self.parser.seqsites: return int(float(self.parser.seqsites[seqsite])) def get_residue(protein, isof, resnum, resname): if protein in self.seq: if isof is not None and isof in self.seq[protein].isof: sresname = self.seq[protein].get(resnum, isoform=isof) if sresname == resname or resname is None: return sresname, isof for isof in sorted(self.seq[protein].isoforms()): sresname = self.seq[protein].get(resnum, isoform=isof) if sresname == resname or resname is None: return sresname, isof return resname, isof self.fragfeatures_added = 0 self.modfeatures_added = 0 for pid, p in iteritems(self.parser.proteins): proteins = get_protein(pid) for modfea in p['modfeatures']: if modfea in self.parser.fragfeas: seqint = self.parser.fragfeas[modfea] if seqint in self.parser.seqints: start = get_seqsite(self.parser.seqints[seqint][0]) end = get_seqsite(self.parser.seqints[seqint][1]) if start is not None and end is not None: for protein, isof in proteins: if protein in self.seq: if self.seq[protein].has_isoform(isof): frag = (protein, isof, start, end) if frag in self.frags: self.frags[frag].add_evidences( self.source ) else: instance = \ self.seq[protein].get( start, end, isof) mot = intera.Motif( protein, start, end, isoform=isof, instance=instance, evidences=self.source, ) self.frags[frag] = mot self.proteins[protein].update_attr([ self.source, 'pids', pid, 'frags', {frag} ]) self.fragfeatures_added += 1 if modfea in self.parser.modfeas: resnum = get_seqsite(self.parser.modfeas[modfea][0]) seqmodvoc = self.parser.modfeas[modfea][1] if seqmodvoc in self.seqmod_dict: typ, resname = self.seqmod_dict[seqmodvoc] for protein, isof in proteins: if protein in self.seq and resnum is not None: resname, isof = \ get_residue(protein, isof, resnum, resname) mod = (protein, isof, resnum, resname, typ) if mod in self.mods: self.mods[mod].add_evidences(self.source) else: res = intera.Residue( resnum, resname, protein, isoform=isof) start, end, instance = self.seq[ protein].get_region( resnum, isoform=isof) mot = intera.Motif( protein, start, end, isoform=isof, instance=instance) ptm = intera.Ptm( protein, motif=mot, residue=res, evidences=self.source, isoform=isof, typ=typ, ) self.mods[mod] = ptm try: self.proteins[protein].update_attr([ self.source, 'pids', pid, 'mods', set([mod]) ]) except: print(protein, pid) self.modfeatures_added += 1 def merge_pfamilies(self): if self.source not in self.rpfamilies: self.rpfamilies[self.source] = {} this_round = set(list(self.parser.pfamilies.keys())) next_round = [] prev_round = -1 while len(this_round) - prev_round != 0: prev_round = len(this_round) for pfid in this_round: pids = self.parser.pfamilies[pfid] subpf_unproc = \ any(map(lambda pid: pid in self.parser.pfamilies, pids)) if subpf_unproc: next_round.append(pfid) continue proteins = \ list( map( lambda pid: (self.rproteins[self.source][pid], pid), filter( lambda pid: pid in self.rproteins[self.source], pids ) ) ) subpfs = \ list( map( lambda pid: (self.rpfamilies[self.source][pid], pid), filter( lambda pid: pid in self.rpfamilies[self.source], pids ) ) ) for spf, spfid in subpfs: spfmembs = \ list( map( lambda p: (p[0], p[1]['pid']), iteritems(self.pfamilies[spf].attrs[ self.source][spfid]) ) ) proteins.extend(spfmembs) self.add_pfamily(proteins, pfid) this_round = next_round next_round = [] def add_pfamily(self, proteins, pfid): if self.source not in self.rpfamilies: self.rpfamilies[self.source] = {} members = sorted(common.unique_list(map(lambda p: p[0], proteins))) # this necessary if we add protein family because of # ambiguous id mapping; we want to make sure protein # exists for each member of the family. for m in members: if m not in self.proteins: p = Protein(m, sources=self.source) p.attrs[self.source]['pids'] = {} p.attrs[self.source]['pids'][pfid] = {} self.proteins[m] = p if len(members): pf = ProteinFamily(members, source=self.source, parent=self) members = tuple(members) pf.attrs[self.source][pfid] = {} for protein, pid in proteins: pf.attrs[self.source][pfid][protein] = {} pf.attrs[self.source][pfid][protein]['pid'] = pid if members not in self.pfamilies: self.pfamilies[members] = pf else: self.pfamilies[members] += pf self.rpfamilies[self.source][pfid] = members self.pfamilies_added += 1
[docs] def merge_complexes(self, this_round=None): """ Merges complexes from the active ``BioPaxReader`` object. Protein families and subcomplexes are expanded, and all combinations are created as separate complexes. The complexes from the same ID are added to sets in the ``rcomplexes`` dict. """ self.complexes_added = 0 if self.source not in self.rcomplexes: self.rcomplexes[self.source] = {} no_protein = set([]) this_round = set(list(self.parser.complexes.keys())) \ if this_round is None else this_round next_round = [] prev_round = -1 while len(this_round) - prev_round != 0: prev_round = len(this_round) for cid in this_round: start_time = time.time() stois = self.parser.complexes[cid] if len(self.parser.stoichiometries): pids = list( map(lambda stoi: self.parser.stoichiometries[stoi], stois)) else: pids = list(map(lambda comp: (comp, 1), stois)) subc_unproc = \ any( map( lambda pid: (pid[0] in self.parser.complexes or pid[0] in self.parser.cvariations) and pid[0] not in self.rcomplexes[self.source] and pid[0] not in no_protein, pids ) ) if subc_unproc: next_round.append(cid) continue proteins = \ list( map( lambda pid: (self.rproteins[self.source][pid[0]], pid[1], pid[0]), filter( lambda pid: pid[0] in self.rproteins[self.source], pids ) ) ) pfamilies = \ list( map( lambda pfid: list( map( lambda memb: (memb[0], pfid[1], memb[1]['pid']), iteritems( self.pfamilies[ self.rpfamilies [self.source][pfid[0]]] .attrs[self.source][pfid[0] ] ) ) ), filter( lambda pfid: pfid[0] in self.rpfamilies[self.source], pids ) ) ) pfnum = 0 if len(pfamilies): pfnum = reduce(lambda pf1l, pf2l: pf1l * pf2l, map(lambda pf: len(pf), pfamilies)) if pfnum > self.max_complex_combinations: self.huge_complexes[self.source][cid] = pfnum continue subcplexs = \ list( map( lambda scid: map( lambda memb: (memb, scid[1], scid[0]), self.rcomplexes[self.source][scid[0]] ), filter( lambda scid: scid[0] in self.rcomplexes[self.source], pids ) ) ) if len(subcplexs): subcplexs = itertools.product(*subcplexs) subcmembs = [] for this_subcplex in subcplexs: for sckey, scstoi, scid in this_subcplex: if scid not in no_protein: sc = self.complexes[sckey] scmembs = sc.get_stoichiometries( self.source, scid, with_pids=True) scmembs = list( map(lambda p: (p[0], p[1] * scstoi, p[2]), scmembs)) subcmembs.append(scmembs) else: subcmembs = [[]] if len(subcmembs) * pfnum > self.max_complex_combinations: self.huge_complexes[self.source][cid] = \ len(subcmembs) * pfnum continue if len(proteins) or len(pfamilies) or \ type(subcplexs) is not list: if not len(pfamilies): pfamilies = [[]] else: pfamilies = itertools.product(*pfamilies) for pfamily in pfamilies: for subc in subcmembs: this_proteins = \ proteins + list(pfamily) + list(subc) members = sorted( common.unique_list( map(lambda p: p[0], this_proteins))) if not len(members): continue cplex = Complex( members, source=self.source, parent=self) members = tuple(members) cplex.attrs[self.source][cid] = {} for protein, stoi, pid in this_proteins: cplex.attrs[self.source][cid][protein] = {} cplex.attrs[self.source][cid][protein][ 'pid'] = pid cplex.attrs[self.source][cid][protein][ 'stoi'] = stoi if members not in self.complexes: self.complexes[members] = cplex else: self.complexes[members] += cplex self.complexes_added += 1 if cid not in self.rcomplexes[self.source]: self.rcomplexes[self.source][cid] = set([]) self.rcomplexes[self.source][cid].add(members) else: no_protein.add(cid) elapsed = time.time() - start_time if elapsed > 5: self.slow_complexes[self.source][cid] = elapsed this_round = next_round next_round = [] return this_round
[docs] def merge_cvariations(self): """ This processes those complexes which are in fact a set of complex variations. As simple complexes also are always extended to complex variations because they might have not only simple proteins but protein families as members, here we only add new records to the attributes of already existing complexes. After ``merge_complexes`` will be called again, to process those simple complexes which have any of the complex variations processed here among their subcomplexes. """ self.cvariations_added = 0 for cvid, cv in iteritems(self.parser.cvariations): cplexes = \ dict( map( lambda cid: (cid, self.rcomplexes[self.source][cid]), filter( lambda cid: cid in self.rcomplexes[self.source], cv ) ) ) for cid, ckeys in iteritems(cplexes): for ckey in ckeys: c = self.complexes[ckey] if cvid not in c.attrs[self.source] and cid in c.attrs[ self.source]: c.attrs[self.source][cvid] = {'children': set([])} if cid in c.attrs[self.source]: c.attrs[self.source][cvid]['children'].add(cid) c.attrs[self.source][cvid].update(c.attrs[self.source][ cid]) if len(cplexes): self.rcomplexes[self.source][cvid] = \ reduce( lambda c1, c2: c1 | c2, cplexes.values() ) self.cvariations_added += 1
[docs] def gen_cvariations(self): """ Because one key from the BioPax file might represent more complexes, *complexvariations* are created to give a way to represent sets of combinations. These are created for all complexes, even with only one unambiguous constitution. The keys are the constitutions of all the combinations listed in alphabetic order, separated by ``|``. For example, ``A,B,C|A,B,D|A,B,E``. """ self.rcvariations[self.source] = {} for cid, keys in iteritems(self.rcomplexes[self.source]): membs = map(lambda key: self.complexes[key], keys) cvar = ComplexVariations(membs, source=self.source, parent=self) cvar.attrs[self.source]['cids'] = set([cid]) key = cvar.__str__() if key in self.cvariations: self.cvariations[key] += cvar else: self.cvariations[key] = cvar self.rcvariations[self.source][cid] = key
def merge_reactions(self): self.reactions_added = 0 self._merge_reactions(('reactions', 'reaction')) def merge_cassemblies(self): self.cassemblies_added = 0 self._merge_reactions(('cassemblies', 'cassembly')) def _merge_reactions(self, rclass): """ Merges reaction type entities from the active parser. Here protein families and complex variations are not expanded. """ if self.source not in self.rreactions: self.rreactions[self.source] = {} def get_side(ids): members = [] memb_ids = {} for _id in ids: for cls in ('proteins', 'pfamilies', 'cvariations'): r = getattr(self, 'r%s' % cls)[self.source] if _id in r: e = getattr(self, cls)[r[_id]] members.append(e) memb_ids[e.key()] = {'id': _id, 'type': cls} return members, memb_ids for rid, reac in iteritems(getattr(self.parser, rclass[0])): left, l_ids = get_side(reac['left']) right, r_ids = get_side(reac['right']) left_attrs = {self.source: {rid: l_ids}} right_attrs = {self.source: {rid: r_ids}} nleft = \ reduce( lambda m1, m2: m1 * m2, map( lambda m: len(m.members) if hasattr(m, 'members') else 1, left ), 1 ) nright = \ reduce( lambda m1, m2: m1 * m2, map( lambda m: len(m.members) if hasattr(m, 'members') else 1, right ), 1 ) if len(left) or len(right): if nleft <= self.max_reaction_combinations and \ nright <= self.max_reaction_combinations: reaction = Reaction( left, right, left_attrs, right_attrs, source=self.source, parent=self) reaction.attrs[self.source][rid] = {} this_refs = \ set( list( map( lambda r: self.rrefs[self.source][r], filter( lambda r: r in self.parser.pubrefs, reac['refs'] ) ) ) ) reaction.attrs[self.source][rid]['refs'] = this_refs reaction.attrs[self.source][rid]['type'] = rclass[1] key = reaction.__str__() if key in self.reactions: # print(key, type(self.reactions[key]), self.reactions[key].__str__(), type(reaction), reaction.__str__()) self.reactions[key] += reaction else: self.reactions[key] = reaction setattr(self, '%s_added' % rclass[0], getattr(self, '%s_added' % rclass[0]) + 1) self.rreactions[self.source][rid] = key else: self.huge_reactions[self.source][rid] = max(nleft, nright) def merge_controls(self): self.controls_added = 0 self._merge_controls(('controls', 'control')) def merge_catalyses(self): self.catalyses_added = 0 self._merge_controls(('catalyses', 'catalysis')) def _merge_controls(self, cclass): if self.source not in self.rcontrols: self.rcontrols[self.source] = {} def get_party(_id): for cls in ['proteins', 'pfamilies', 'cvariations', 'reactions']: if _id in getattr(self, 'r%s' % cls)[self.source]: key = getattr(self, 'r%s' % cls)[self.source][_id] entity = getattr(self, cls)[key] return (cls, key, entity) return None, None, None for cid, ctrl in iteritems(getattr(self.parser, cclass[0])): erclass, erkey, erent = get_party(ctrl['controller']) edclass, edkey, edent = get_party(ctrl['controlled']) # print('n = %u, erclass: %s, edclass: %s, er: %s, ed: %s' % (n, erclass, edclass, ctrl['controller'], ctrl['controlled'])) if erent is not None and edent is not None: this_refs = \ set( list( map( lambda r: self.rrefs[self.source][r], filter( lambda r: r in self.parser.pubrefs, ctrl['refs'] ) ) ) ) \ if 'refs' in ctrl else set([]) control = Control( erent, edent, source=self.source, parent=self) control.attrs[self.source][cid] = {} control.attrs[self.source][cid]['refs'] = this_refs control.attrs[self.source][cid]['class'] = cclass[1] control.attrs[self.source][cid]['type'] = ctrl['type'] key = control.__str__() if key in self.controls: self.controls[key] += control else: self.controls[key] = control setattr(self, '%s_added' % cclass[0], getattr(self, '%s_added' % cclass[0]) + 1) self.rcontrols[self.source][cid] = key def basic_stats(self, exclude_empty=False): self.stats = { 'proteins': {}, 'complexes': {}, 'mods': {}, 'reactions': {}, 'controls': {}, 'refs': {} } comb = [] for n in xrange(1, len(self.sources) + 1): comb.extend(list(itertools.combinations(self.sources, n))) comb = \ list( map( lambda s: (tuple(sorted(s)), set(s)), comb ) ) for etyp in self.stats.keys(): self.stats[etyp] = \ dict( map( lambda s: (s[0], 0), comb ) ) for e in getattr(self, etyp).values(): for c in comb: _sources = ( e.evidences.get_resource_names() if hasattr(e, 'evidences') else common.to_set(e.sources) ) if c[1] <= _sources: if \ not exclude_empty \ or ( etyp not in ['complexes', 'reactions', 'controls'] ) or ( etyp == 'complexes' and len(e.members) > 1 ) or ( etyp == 'reactions' and len(e.left.members) and len(e.right.members) ) or ( etyp == 'controls' and ( ( ( e.controller.__class__.__name__ == 'Complex' or e.controller.__class__.__name__ == 'ProteinFamily' ) and len(e.controller.members) ) or ( e.controller.__class__.__name__ == 'ComplexVariations' and any(map(lambda m: bool(len(m.members)), e.controller.members)) ) ) and ( len(e.controlled.left.members) and len(e.controlled.right.members) ) ): self.stats[etyp][c[0]] += 1 def simpson_stats(self): if not hasattr(self, 'stats'): self.basic_stats() self.simpson_sim = { 'proteins': {}, 'complexes': {}, 'mods': {}, 'reactions': {}, 'controls': {}, 'refs': {} } for etyp in self.simpson_sim.keys(): for s1 in self.sources: for s2 in self.sources: if s1 != s2: self.simpson_sim[etyp][(s1, s2)] = \ common.simpson_index_counts( self.stats[etyp][tuple([s1])], self.stats[etyp][tuple([s2])], self.stats[etyp][tuple(sorted([s1, s2]))] ) def resource_graph_edges(self, etyp): if not hasattr(self, 'simpson_sim'): self.simpson_stats() stats = self.stats[etyp] sim = self.simpson_sim[etyp] edges = [] nodes = {} for s1 in self.sources: nodes[s1] = stats[(s1, )] for s2 in self.sources: if s1 != s2 and sim[(s1, s2)] > 0.0: edges.append([s1, s2, sim[(s1, s2)]]) return edges, nodes def iterate_reactions(self): pass def load_sequences(self): if self.seq is None: self.seq = seq.swissprot_seq( self.ncbi_tax_id, isoforms=True) # interaction iterators from here def expand(self): def add_interactions(gen): for i in gen: key = (i[0], i[1]) if key not in aggregate: aggregate[key] = i aggregate[key][2] = set([i[2]]) else: aggregate[key][4].update(i[4]) aggregate[key][5].update(i[5]) aggregate[key][2].add(i[2]) aggregate[key][3] = aggregate[key][3] or i[3] aggregate = {} add_interactions(self.in_same_component()) add_interactions(self.co_control()) add_interactions(self.interacts_with()) add_interactions(self.state_change()) self.interactions = list(aggregate.values()) def expand_by_source(self): def add_interactions(gen): for i in gen: key = (i[0], i[1], i[4]) if key not in aggregate: aggregate[key] = i aggregate[key][2] = set([i[2]]) else: aggregate[key][5].update(i[5]) aggregate[key][2].add(i[2]) aggregate[key][3] = aggregate[key][3] or i[3] aggregate = {} add_interactions(self.in_same_component(by_source=True)) add_interactions(self.co_control(by_source=True)) add_interactions(self.interacts_with(by_source=True)) add_interactions(self.state_change(by_source=True)) self.interactions_by_source = list(aggregate.values()) pickle.dump( self.interactions_by_source, open( os.path.join( self.cachedir, 'reaction_interactions_by_source.pickle', ), 'wb' ) )
[docs] def in_same_component(self, by_source=False): """ For all complexes connects all members of the complex with each other. """ self.prg = progress.Progress( len(self.complexes), 'Expanding `in same component` interactions', 1) aggregate_src = {} for c in self.complexes.values(): self.prg.step() for i, p1 in enumerate(c): for p2 in list(c)[i + 1:]: key = (p1, p2) if key not in aggregate_src: aggregate_src[key] = set([]) aggregate_src[key].update(c.sources) self.prg.terminate() for (p1, p2), s in iteritems(aggregate_src): if by_source: for ss in s: yield [p1, p2, 'IN_SAME_COMPONENT', False, ss, set([])] else: yield [p1, p2, 'IN_SAME_COMPONENT', False, s, set([])]
def protein_get_refs(self, source, protein_elem): if protein_elem in self.rproteins[source]: protein = self.proteins[self.rproteins[source][protein_elem]] elem = protein.attrs[source]['pids'][protein_elem] if 'refs' in elem: return elem['refs'] return set([]) def complex_get_refs(self, source, cplex_elem): refs = set([]) for cplex_key in self.rcomplexes[source][cplex_elem]: cplex = self.complexes[cplex_key] if cplex_elem in cplex.attrs[source]: elem = cplex.attrs[source][cplex_elem] for protein, pdata in iteritems(elem): if protein != 'children': refs.update( self.protein_get_refs(source, elem[protein][ 'pid'])) return refs def co_control(self, by_source=False): self.prg = progress.Progress( len(self.controls), 'Expanding `co-control` interactions', 1) aggregate_src = {} aggregate_ref = {} for co in self.controls.values(): self.prg.step() if co.controller.type != 'pfamily': proteins = sorted(list(co.controller.proteins())) for i, p1 in enumerate(proteins): for p2 in proteins[i + 1:]: key = (p1, p2) if key not in aggregate_src: aggregate_src[key] = set([]) aggregate_src[key].update(co.sources) for s, codata in iteritems(co.attrs): for coid, d in iteritems(codata): if by_source: if s not in aggregate_ref: aggregate_ref[s] = {} if key not in aggregate_ref[s]: aggregate_ref[s][key] = set([]) aggregate_ref[s][key].update(d['refs']) else: if key not in aggregate_ref: aggregate_ref[key] = set([]) aggregate_ref[key].update(d['refs']) self.prg.terminate() for (p1, p2), s in iteritems(aggregate_src): if by_source: for ss in s: yield [ p1, p2, 'CO_CONTROL', False, ss, aggregate_ref[ss][( p1, p2)] ] else: yield [p1, p2, 'CO_CONTROL', False, s, aggregate_ref[(p1, p2)]] def interacts_with(self, by_source=False): aggregate_src = {} aggregate_ref = {} self.prg = progress.Progress( len(self.reactions), 'Expanding `interacts with` interactions', 1) for rs in self.reactions.values(): self.prg.step() isrc = \ set(list( map( lambda s: s[0], filter( lambda s: len( list( filter( lambda rr: rr['type'] == 'interaction', s[1].values() ) ) ), iteritems(rs.attrs) ) ) )) if len(isrc): for r in rs.expand(): for i, p1 in enumerate(r.left.proteins()): for p2 in list(r.right.proteins())[i + 1:]: key = tuple(sorted([p1, p2])) if key not in aggregate_src: aggregate_src[key] = set([]) aggregate_src[key].update(isrc) for s in isrc: if by_source: if s not in aggregate_ref: aggregate_ref[s] = {} if key not in aggregate_ref[s]: aggregate_ref[s][key] = set([]) for rid, rdata in iteritems(r.attrs[s]): if rdata['type'] == 'interaction': aggregate_ref[s][key].update(rdata[ 'refs']) else: if key not in aggregate_ref: aggregate_ref[key] = set([]) for rid, rdata in iteritems(r.attrs[s]): if rdata['type'] == 'interaction': aggregate_ref[key].update(rdata[ 'refs']) self.prg.terminate() for (p1, p2), s in iteritems(aggregate_src): if by_source: for ss in s: yield [ p1, p2, 'INTERACTS_WITH', False, ss, aggregate_ref[ss][ (p1, p2)] ] else: yield [ p1, p2, 'INTERACTS_WITH', False, s, aggregate_ref[(p1, p2)] ] def state_change(self, by_source=False): self.prg = progress.Progress( len(self.controls), 'Expanding `state change` interactions', 1) aggregate_src = {} aggregate_ref = {} for cos in self.controls.values(): self.prg.step() for co in cos.expand(): er_proteins = co.controller.proteins() for s in co.sources: ldiff, rdiff = self.reaction_mod_diff(s, co.controlled) for p2 in ldiff.keys(): if ldiff[p2] != rdiff[p2]: for p1 in er_proteins: key = (p1, p2) if key not in aggregate_src: aggregate_src[key] = set([]) aggregate_src[key].add(s) refs = set([]) for rid, rdata in iteritems( co.controlled.attrs[s]): refs.update(rdata['refs']) if by_source: if s not in aggregate_ref: aggregate_ref[s] = {} if key not in aggregate_ref[s]: aggregate_ref[s][key] = set([]) aggregate_ref[s][key].update(refs) else: if key not in aggregate_ref: aggregate_ref[key] = set([]) aggregate_ref[key].update(refs) self.prg.terminate() for (p1, p2), s in iteritems(aggregate_src): if by_source: for ss in s: yield [ p1, p2, 'STATE_CHANGE', True, ss, aggregate_ref[ss][( p1, p2)] ] else: yield [ p1, p2, 'STATE_CHANGE', True, s, aggregate_ref[(p1, p2)] ] def reaction_mod_diff(self, source, reaction, by_rid=False): left = self.reaction_side_get_mods(source, reaction.left, by_rid) right = self.reaction_side_get_mods(source, reaction.right, by_rid) return common.dict_diff(left, right) def reaction_side_get_mods(self, source, rside, by_rid=False): mods = {} for rid, rd in iteritems(rside.attrs[source]): for ent_key, data in iteritems(rd): if data['type'] == 'proteins': next_mods = { ent_key: self.protein_get_mods(source, data['id']) } elif data['type'] == 'complexes': next_mods = self.complex_get_mods(source, data['id']) if by_rid: if rid not in mods: mods[rid] = {} common.merge_dicts(mods[rid], {rid: next_mods}) else: common.merge_dicts(mods, next_mods) return mods def protein_get_mods(self, source, protein_elem): if protein_elem in self.rproteins[source]: protein = self.proteins[self.rproteins[source][protein_elem]] elem = protein.attrs[source]['pids'][protein_elem] if 'mods' in elem: return elem['mods'] return set([]) def complex_get_mods(self, source, complex_elem): mods = {} for cplex_key in self.rcomplexes[source][complex_elem]: cplex = self.complexes[cplex_key] if complex_elem in cplex.attrs[source]: elem = cplex.attrs[source][complex_elem] for protein, pdata in iteritems(elem): if protein != 'children': mods[protein] = self.protein_get_mods( source, elem[protein]['pid']) return mods
# ## ## ## ## ## ##
[docs] class ReactionSide(AttributeHandler):
[docs] def __init__(self, members, source=[], parent=None): super(ReactionSide, self).__init__() self.members = sorted(members) self.sources = set([]) self.attrs = {} self.add_source(source) self.parent = parent self.is_expanded = False
def reload(self): modname = self.__class__.__module__ mod = __import__(modname, fromlist=[modname.split('.')[0]]) imp.reload(mod) new = getattr(mod, self.__class__.__name__) setattr(self, '__class__', new) map(lambda m: m.reload(), self.members) def __hash__(self): return hash(self.__str__()) def __repr__(self): return self.__str__() def __str__(self): return 'ReactionSide: (%s)' % \ ('+'.join(map(lambda m: m.__str__(), self.members))) def equality(self, one, two): return \ all( map( lambda m1: any( map( lambda m2: not m1.set.isdisjoint(m2.set), filter( lambda m2: m2.type == m1.type, two.members ) ) ), filter( lambda m1: m1.type == 'cvariations' or m1.type == 'pfamily', one.members ) ) ) and \ all( map( lambda m1: m1 in two.set, filter( lambda m1: m1.type == 'protein', one.members ) ) ) def __eq__(self, other): return self.equality(self, other) and self.equality(other, self) def __iter__(self): for m in self.members: yield m
[docs] def expand(self): """ Expands the ``ReactionSide`` by iterating over all combinations of all ``ComplexVariation`` and ``ProteinFamily`` members, so yields ``ReactionSide`` objects with only ``Protein`` and ``Complex`` members. Yields tuple, because ``ReactionSide`` is initialized in ``Reaction``, the tuple is suitable to serve as ``members`` and ``attrs``. """ # collecting protein attributes if self.is_expanded: for i in [1]: yield self.members, self.attrs else: try: pattrs = \ dict( map( lambda m: (, dict( map( lambda d1: ( d1[0], dict( map( lambda d2: (d2[0], d2[ 1][]), iteritems(d1[1]) ) ) ), iteritems(self.attrs) ) ) ), filter( lambda m: m.type == 'protein', self.members ) ) ) except: print(self.attrs) for c in \ itertools.product( *list( map( lambda m: list( zip( m.itermembers(), [m.key()] * ( len(m.members) if hasattr(m, 'members') else 1) ) ), self.members ) ) ): attrs = dict(map(lambda s: (s, dict(map(lambda rid: (rid, {}), self.attrs[s].keys()))), self.sources)) members = [] for ((m, a), k) in c: members.append(m) if m.type == 'protein': # if it was a protein, we just copy if in pattrs: for s, d1 in iteritems(pattrs[]): for rid, d2 in iteritems(d1): attrs[s][rid][] = d2 # if it is from a protein family else: # for each resource for s, r in iteritems(attrs): # for each original reaction id for rid, d in iteritems(self.attrs[s]): # the key of the new entity (here: str, # uniprot id) if k in self.attrs[s][rid]: attrs[s][rid][m.key()] = ( # the type is obvious, the id is from the `a` dict supplied # by the ProteinFamily object, and we look up the id belonging # to the key of the original entity { 'type': 'proteins', 'id': a[s][self.attrs[s][rid][ k]['id']] }) # if it is a complex from a complex variations elif m.type == 'complex': for s, r in iteritems(attrs): for rid, d in iteritems(r): if k in self.attrs[s][rid]: cid = self.attrs[s][rid][k]['id'] attrs[s][rid][m.key()] = \ {'type': 'complexes', 'id': cid} yield members, attrs
def proteins(self): return \ reduce( lambda m1, m2: m1 | m2.proteins(), self.members, set([]) ) def __ror__(self, other): return self.proteins() | other def __rand__(self, other): return self.proteins() & other def __rsub__(self, other): return self.proteins() - other
[docs] class Reaction(AttributeHandler):
[docs] def __init__(self, left, right, left_attrs, right_attrs, source=[], parent=None): super(Reaction, self).__init__() self.parent = parent self.left = ReactionSide(left, source, parent=self.parent) self.right = ReactionSide(right, source, parent=self.parent) self.left.merge_attrs(left_attrs) self.right.merge_attrs(right_attrs) self.attrs = {} self.sources = set([]) self.add_source(source) self.is_expanded = False
def reload(self): modname = self.__class__.__module__ mod = __import__(modname, fromlist=[modname.split('.')[0]]) imp.reload(mod) new = getattr(mod, self.__class__.__name__) setattr(self, '__class__', new) self.left.reload() self.right.reload() def __repr__(self): return self.__str__() def __str__(self): return 'Reaction: LEFT(%s) --> RIGHT(%s)' % \ (self.left.__str__(), self.right.__str__()) def __hash__(self): return hash(self.__str__()) def __eq__(self, other): return self.left == other.left and self.right == other.right def __iadd__(self, other): self = AttributeHandler.__iadd__(self, other) self.left += other.left self.right += other.right return self def expand(self): if self.is_expanded: for i in [1]: yield self else: expanded = [] lAllProteins = self.left.proteins() rAllProteins = self.right.proteins() diffAllProteins = rAllProteins ^ lAllProteins lefts = list(self.left.expand()) rights = list(self.right.expand()) diffs = set([]) for t in xrange(2): if t == 1: try: minDiff = min(diffs) except ValueError: print('Empty sequence error: %s' % self.__str__()) for left in lefts: for right in rights: r = Reaction( left[0], right[0], left[1], right[1], source=self.sources, parent=self.parent) lProteins = r.left.proteins() rProteins = r.right.proteins() diffProteins = lProteins ^ rProteins diff = len(diffProteins) diffs.add(diff) if t == 1 and diff == minDiff: r.left.is_expanded = True r.right.is_expanded = True r.is_expanded = True r.attrs = self.attrs yield r else: del r
# #minDiff = min(map(lambda e: e[1], expanded)) # for r, d in expanded: # if d == minDiff: # yield r
[docs] class Control(AttributeHandler):
[docs] def __init__(self, er, ed, source=[], parent=None): super(Control, self).__init__() self.controller = er self.controlled = ed self.attrs = {} self.sources = set([]) self.add_source(source) self.parent = parent self.is_expanded = False
def reload(self): modname = self.__class__.__module__ mod = __import__(modname, fromlist=[modname.split('.')[0]]) imp.reload(mod) new = getattr(mod, self.__class__.__name__) setattr(self, '__class__', new) def __str__(self): return 'Control: C.ER(%s) --> C.ED(%s)' % \ ( self.controller.__str__(), self.controlled.__str__() ) def __repr__(self): return self.__str__() def __hash__(self): return hash(self.__str__()) def __eq__(self, other): return self.controller == other.controller \ and self.controlled == other.controlled def expand(self): if self.is_expanded: for i in [self]: yield self else: for ed in self.controlled.expand(): for er, erattrs in self.controller.itermembers(): c = Control( er, ed, source=self.sources, parent=self.parent) c.attrs = self.attrs yield c