Source code for pypath.inputs.topdb

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from past.builtins import xrange, range

import collections
import itertools

from lxml import etree

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.utils.mapping as mapping
import pypath.utils.taxonomy as taxonomy



[docs]
def topdb_annotations(ncbi_tax_id = 9606):

    TopdbAnnotation = collections.namedtuple(
        'TopdbAnnotation',
        ['membrane', 'topology', 'score', 'tmregions'],
    )

    result = collections.defaultdict(set)

    url = urls.urls['topdb']['url']
    c = curl.Curl(
        url,
        large = True,
        default_mode = 'rb',
        silent = False,
    )

    parser = etree.iterparse(c.fileobj, events = ('start', 'end'))

    result = collections.defaultdict(set)
    root = next(parser)
    used_elements = []

    for ev, elem in parser:

        if ev == 'end' and elem.tag == 'TOPDB':

            used_elements.append(elem)

            organism = elem.find('Organism').text
            organism = taxonomy.ensure_ncbi_tax_id(organism)

            if not organism:

                continue

            tag_uniprots = elem.find('./CrossRef/UniProt')

            if tag_uniprots is None:
                continue

            uniprots = [u.text for u in tag_uniprots.findall('AC')]
            uniprots = set(
                mapping.map_name0(
                    u,
                    'uniprot',
                    'uniprot',
                    ncbi_tax_id = ncbi_tax_id,
                )
                for u in uniprots
            )

            if not uniprots:
                continue

            membranes = set(
                mem
                for tag_mem in elem.findall('Membrane')
                for mem in tag_mem.text.split(';')
            )

            ntm = 0
            score = 0
            topologies = ()
            tag_topo = elem.find('Topology')

            if tag_topo is not None:
                ntm = int(tag_topo.find('Numtm').attrib['Count'])
                score = int(tag_topo.find('Reliability').text)

                topologies = set(
                    tag_reg.attrib['Loc']
                    for tag_reg in tag_topo.findall('./Regions/Region')
                )

            if not membranes:
                membranes = (None,)

            if not topologies:
                topologies = (None,)

            for topology, membrane, uniprot in itertools.product(
                topologies,
                membranes,
                uniprots,
            ):

                if uniprot is None:

                    continue

                result[uniprot].add(
                    TopdbAnnotation(
                        membrane = membrane,
                        topology = topology,
                        tmregions = ntm,
                        score = score,
                    )
                )

        # removing used elements to keep memory low
        if len(used_elements) > 2000:
            for _ in xrange(1000):
                e = used_elements.pop(0)
                e.clear()

    # closing the XML
    c.fileobj.close()
    del c

    return dict(result)