Source code for pypath.inputs.topdb
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of the `pypath` python module
#
# Copyright 2014-2023
# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
# Authors: see the file `README.rst`
# Contact: Dénes Türei (turei.denes@gmail.com)
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# https://www.gnu.org/licenses/gpl-3.0.html
#
# Website: https://pypath.omnipathdb.org/
#
from past.builtins import xrange, range
import collections
import itertools
from lxml import etree
import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.utils.mapping as mapping
import pypath.utils.taxonomy as taxonomy
[docs]
def topdb_annotations(ncbi_tax_id = 9606):
    """
    Membrane topology annotations from the TOPDB database.

    Opens the resource at ``urls.urls['topdb']['url']`` by ``curl.Curl``
    and parses the XML incrementally, processing each ``TOPDB`` element
    once it has been read completely.

    Args:
        ncbi_tax_id (int): NCBI Taxonomy ID, passed to the ID mapping of
            the UniProt accessions found in each record.

    Returns:
        dict: Mapped UniProt IDs as keys and sets of ``TopdbAnnotation``
            named tuples as values. The fields of the tuples are
            ``membrane``, ``topology``, ``score`` and ``tmregions``;
            ``membrane`` and ``topology`` are ``None`` if the record
            provides no such information, while ``score`` and
            ``tmregions`` are ``0`` if the record has no ``Topology``
            element.
    """

    TopdbAnnotation = collections.namedtuple(
        'TopdbAnnotation',
        ['membrane', 'topology', 'score', 'tmregions'],
    )

    result = collections.defaultdict(set)
    url = urls.urls['topdb']['url']

    c = curl.Curl(
        url,
        large = True,
        default_mode = 'rb',
        silent = False,
    )

    # already processed elements waiting to be cleared; with a deque
    # removing the oldest ones is O(1), unlike `list.pop(0)`
    used_elements = collections.deque()

    try:

        parser = etree.iterparse(c.fileobj, events = ('start', 'end'))
        # consuming the very first event (the opening of the root element)
        next(parser)

        for ev, elem in parser:

            # a record is complete only when its closing tag has been read
            if ev != 'end' or elem.tag != 'TOPDB':
                continue

            used_elements.append(elem)

            # removing used elements to keep memory low;
            # the current element is the newest in the buffer, hence
            # it is never among the 1000 oldest ones cleared here
            if len(used_elements) > 2000:
                for _ in range(1000):
                    used_elements.popleft().clear()

            tag_organism = elem.find('Organism')

            if tag_organism is None:
                continue

            # NOTE(review): the organism of the record is only checked for
            # being recognised, it is never compared to `ncbi_tax_id` --
            # confirm whether this is intended
            organism = taxonomy.ensure_ncbi_tax_id(tag_organism.text)

            if not organism:
                continue

            tag_uniprots = elem.find('./CrossRef/UniProt')

            if tag_uniprots is None:
                continue

            mapped = (
                mapping.map_name0(
                    tag_ac.text,
                    'uniprot',
                    'uniprot',
                    ncbi_tax_id = ncbi_tax_id,
                )
                for tag_ac in tag_uniprots.findall('AC')
            )
            # `None` results of the mapping are dropped
            uniprots = {u for u in mapped if u is not None}

            if not uniprots:
                continue

            membranes = {
                mem
                for tag_mem in elem.findall('Membrane')
                # an empty `Membrane` tag has `None` as its text
                if tag_mem.text
                for mem in tag_mem.text.split(';')
            }

            ntm = 0
            score = 0
            topologies = set()
            tag_topo = elem.find('Topology')

            if tag_topo is not None:

                ntm = int(tag_topo.find('Numtm').attrib['Count'])
                score = int(tag_topo.find('Reliability').text)
                topologies = {
                    tag_reg.attrib['Loc']
                    for tag_reg in tag_topo.findall('./Regions/Region')
                }

            # a single `None` placeholder ensures that the product below
            # yields at least one annotation for each UniProt ID
            membranes = membranes or (None,)
            topologies = topologies or (None,)

            for topology, membrane, uniprot in itertools.product(
                topologies,
                membranes,
                uniprots,
            ):

                result[uniprot].add(
                    TopdbAnnotation(
                        membrane = membrane,
                        topology = topology,
                        tmregions = ntm,
                        score = score,
                    )
                )

    finally:

        # closing the XML, also if the parsing failed
        if c.fileobj is not None:
            c.fileobj.close()

    return dict(result)