Source code for pypath.inputs.opm

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import re
import csv
import collections

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.utils.mapping as mapping
import pypath.utils.taxonomy as taxonomy



[docs]
def opm_annotations(organism = 9606):
    """
    Membrane annotations of proteins from the OPM database.

    Downloads the OPM `types` and `proteins` tables (URLs taken from
    `urls.urls['opm']`), translates the protein names to UniProt IDs
    by `mapping.map_name0`, and collects the membrane, family and
    transmembrane information for each protein that could be translated.

    Args:
        organism: Key looked up in `taxonomy.phosphoelm_taxids`
            (presumably an NCBI Taxonomy ID; the default is 9606). If the
            key is found, only records whose `species_name_cache` field
            equals the corresponding name are processed. If it is not
            found, records of all organisms are processed.

    Returns:
        A dict with the identifiers returned by `mapping.map_name0` as
        keys and sets of `OpmAnnotation` named tuples as values. The
        tuples have the fields `membrane`, `family` and `transmembrane`,
        the last one is a bool.

    Raises:
        KeyError: If the `type_id` of a record is not present in the
            `types` table.
    """

    reparentheses = re.compile(r'\((.*)\)')
    regenesymbol  = re.compile(r' ([A-Z0-9]{3,}) ')


    def get_dict(name):
        """
        Downloads one OPM table and returns it as an `id` -> `name` dict.
        """

        url = urls.urls['opm'][name]
        c = curl.Curl(url, large = True, silent = False)

        return {
            rec['id']: rec['name']
            for rec in csv.DictReader(c.result, delimiter = ',')
        }


    def name_candidates(name):
        """
        Variants of a protein name to try, in order of preference:
        the full name, the part before the first parenthesis, the part
        before the first comma and the content of the parentheses.
        """

        candidates = [
            name,
            name.split('(')[0],
            name.split(',')[0],
        ]

        m = reparentheses.search(name)

        if m:
            candidates.append(m.group(1))

        # the variants are often identical (e.g. a name without any
        # parenthesis or comma); looking up each only once gives the same
        # result with fewer mapping calls
        return list(dict.fromkeys(candidates))


    def to_uniprot(name):
        """
        Translates an OPM protein name to a UniProt ID: first by the
        name variants as protein names, then by the gene symbol like
        tokens found in the name. Returns the first truthy result of
        `mapping.map_name0`, or the last falsy one if all lookups failed.
        """

        uniprot = None

        for this_name in name_candidates(name):
            uniprot = mapping.map_name0(this_name, 'protein-name', 'uniprot')

            if uniprot:
                return uniprot

        for gs in regenesymbol.findall(name):
            # the gene symbol itself must be translated here, not the
            # protein name variant left over from the loop above
            uniprot = mapping.map_name0(gs, 'genesymbol', 'uniprot')

            if uniprot:
                return uniprot

        return uniprot


    OpmAnnotation = collections.namedtuple(
        'OpmAnnotation',
        ['membrane', 'family', 'transmembrane'],
    )


    result = collections.defaultdict(set)

    # `None` means no filtering by organism
    organism_name = (
        taxonomy.phosphoelm_taxids[organism]
            if organism in taxonomy.phosphoelm_taxids else
        None
    )

    # only the `types` table is needed for lookups: the family and the
    # membrane names are available in each record of the `proteins` table
    types = get_dict('types')

    url = urls.urls['opm']['proteins']
    c = curl.Curl(url, silent = False, large = True)

    data = csv.DictReader(c.result, delimiter = ',')

    for rec in data:

        if organism_name and rec['species_name_cache'] != organism_name:

            continue

        uniprot = to_uniprot(rec['name'])

        if not uniprot:

            continue

        result[uniprot].add(
            OpmAnnotation(
                membrane = rec['membrane_name_cache'],
                family = rec['family_name_cache'],
                transmembrane = types[rec['type_id']] == 'Transmembrane',
            )
        )

    return dict(result)