Source code for pypath.inputs.locate

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from past.builtins import xrange, range

import collections

from lxml import etree

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.inputs.uniprot_db as uniprot_db
import pypath.utils.taxonomy as taxonomy
import pypath.utils.mapping as mapping



[docs]
def locate_localizations(
        organism = 9606,
        literature = True,
        external = True,
        predictions = False,
    ):

    record = collections.namedtuple(
        'LocateAnnotation',
        ('source', 'location', 'cls', 'pmid', 'score'),
    )
    record.__new__.__defaults__ = (None, None, None)

    organism_uniprots = set(
        uniprot_db.all_uniprots(organism = organism, swissprot = True)
    )

    organism_str = taxonomy.taxids[organism]
    url = urls.urls['locate']['url_rescued'] % organism_str
    fname = url.split('/')[-1][:-4]

    c = curl.Curl(
        url,
        large = True,
        default_mode = 'rb',
        silent = False,
        files_needed = [fname],
    )
    c.result[fname]

    parser = etree.iterparse(c.result[fname], events = ('start', 'end'))

    result = collections.defaultdict(set)
    root = next(parser)
    used_elements = []

    for ev, elem in parser:

        if ev == 'end' and elem.tag == 'LOCATE_protein':

            tag_protein = elem.find('protein')
            this_uniprot = None
            this_uniprots = None
            this_entrez  = None
            this_organism = (
                tag_protein.find('organism').text
                    if tag_protein is not None else
                None
            )
            this_class = (
                tag_protein.find('class').text
                    if tag_protein is not None else
                None
            )

            xrefs = elem.find('xrefs')

            if xrefs is None:
                continue

            for xref in xrefs.findall('xref'):
                src = xref.find('source')
                src_name = src.find('source_name').text

                if src_name == 'UniProtKB-SwissProt':
                    this_uniprot = src.find('accn').text

                if src_name == 'Entrez Gene':
                    this_entrez = src.find('accn').text

                if src_name == 'UniProt/SPTrEMBL' and this_uniprot is None:
                    this_uniprot = src.find('accn').text

            # if we don't know what it is, does not make sense to proceed
            if this_uniprot is None and this_entrez is None:
                continue

            if this_uniprot:
                this_uniprots = mapping.map_name(
                    this_uniprot,
                    'uniprot',
                    'uniprot',
                    ncbi_tax_id = organism,
                )

            if not this_uniprots and this_entrez:
                this_uniprots = mapping.map_name(
                    this_entrez,
                    'entrez',
                    'uniprot',
                    ncbi_tax_id = organism,
                )

            this_uniprots = set(this_uniprots) & organism_uniprots

            # if we don't know what it is, does not make sense to proceed
            if not this_uniprots:
                continue

            if external:
                # External database annotations
                extannot = elem.find('externalannot')

                if extannot is not None:
                    for extannotref in extannot.findall('reference'):
                        sources = []

                        for src in extannotref.findall('source'):
                            src_name = src.find('source_name')

                            if src_name is not None:
                                sources.append(src_name.text)

                        sources = ';'.join(sources) if sources else None
                        locations =  extannotref.find('locations')

                        if locations is not None:
                            for location in locations.findall('location'):
                                for loc in location.iterchildren():
                                    if loc.tag[:4] == 'tier':
                                        this_loc = loc.text.lower().split(',')

                                        for uniprot in this_uniprots:
                                            for _loc in this_loc:
                                                result[uniprot].add(record(
                                                    source = sources,
                                                    location = _loc.strip(),
                                                    cls = this_class,
                                                    score = None,
                                                ))

            if predictions:
                # Predictions
                sclpred = elem.find('scl_prediction')

                if sclpred is not None:
                    for sclpred_src in sclpred.findall('source'):
                        score = float(sclpred_src.find('evaluation').text)

                        if score == 0.0:
                            continue

                        this_src = sclpred_src.find('method').text
                        this_loc = sclpred_src.find('location').text.lower()

                        if this_loc == 'no prediction':
                            continue

                        for uniprot in this_uniprots:
                            result[uniprot].add(record(
                                source = this_src,
                                location = this_loc,
                                cls = this_class,
                                score = score,
                            ))

            if literature:
                # Literature curation
                lit = elem.find('literature')

                if lit is not None:

                    for litref in lit.findall('reference'):

                        locs = set()

                        for lloc in (
                            litref.find('locations').findall('location')
                        ):

                            for loc in lloc.iterchildren():
                                if loc.tag[:4] == 'tier':
                                    locs.add(loc.text.lower())

                        pmid = litref.find('source')
                        pmid = (
                            None if pmid is None else pmid.find('accn').text
                        )

                        for loc in locs:

                            for uniprot in this_uniprots:

                                result[uniprot].add(
                                    record(
                                        source = 'literature',
                                        location = loc,
                                        pmid = pmid,
                                        cls = this_class,
                                        score = None,
                                    )
                                )

        used_elements.append(elem)

        # removing used elements to keep memory low
        if len(used_elements) > 1000:
            for _ in xrange(500):
                e = used_elements.pop(0)
                e.clear()

    # closing the XML
    c.fileobj.close()
    del c

    return dict(result)