# Source code for pypath.inputs.baccin2019

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import re
import collections
import itertools

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.utils.mapping as mapping
import pypath.utils.orthology as orthology_mod
import pypath.internals.intera as intera
import pypath.inputs.common as inputs_common



# [docs]
def baccin2019_interactions(ncbi_tax_id = 9606):
    """
    Ligand-receptor interactions from the ``SuppTable3`` sheet of the
    Baccin 2019 spreadsheet.

    Ligand and receptor cells are treated as mouse gene symbols (optionally
    several joined by ``&``); each symbol is mapped to mouse UniProt IDs and,
    unless ``ncbi_tax_id`` is 10090, translated by orthology to the
    requested organism. Rows flagged as ``incorrect`` and rows where either
    partner can not be translated are left out.

    Args:
        ncbi_tax_id (int): NCBI Taxonomy ID of the organism the identifiers
            should be translated to. With 10090 (mouse) no orthology
            translation is done.

    Returns:
        list: ``Baccin2019Interaction`` named tuples with the fields
            ``ligand``, ``receptor``, ``correct``, ``ligand_location``,
            ``ligand_category``, ``resources`` and ``references``. A partner
            is a single UniProt ID if it has one component, otherwise an
            ``intera.Complex`` instance.
    """

    recamel = re.compile(r'(.+?)([A-Z][a-z])')


    def camel_to_snake(value):
        # e.g. `CellAdhesion` -> `cell_adhesion`

        return (
            recamel.sub(
                lambda m: m.group(1).lower() + '_' + m.group(2),
                value.strip()
            ).lower()
        )


    def id_translate(mouse_gs):
        # one mouse gene symbol -> UniProt IDs in the target organism

        uniprots = mapping.map_name(
            mouse_gs,
            'genesymbol',
            'uniprot',
            10090,
        )

        if ncbi_tax_id != 10090:

            uniprots = set(
                itertools.chain.from_iterable(
                    orthology_mod.translate(
                        uniprot,
                        target = ncbi_tax_id,
                        source = 10090,
                    )
                    for uniprot in uniprots
                )
            )

        return uniprots


    def raw_to_uniprots(raw):
        # `&` separates the components within one cell; every combination
        # of the alternative IDs of the components yields one tuple, and
        # a single untranslatable component makes the whole result empty

        components = raw.split('&')

        return set(
            itertools.product(
                *(id_translate(comp) for comp in components)
            )
        )


    def get_partners(components, sources, references):
        # one-element tuples stand for single proteins,
        # longer ones are wrapped into complexes

        return {
            (
                comp[0]
                    if len(comp) == 1 else
                intera.Complex(
                    components = comp,
                    sources = sources,
                    references = references,
                )
            )
            for comp in components
        }


    def clean_reference(ref):
        # presumably numeric cells arrive as text like `12345.0`; only a
        # trailing `.0` is removed, so that values such as `123.05` are not
        # turned into a bogus all-digit ID

        ref = ref.strip()

        return ref[:-2] if ref.endswith('.0') else ref


    Baccin2019Interaction = collections.namedtuple(
        'Baccin2019Interaction',
        [
            'ligand',
            'receptor',
            'correct',
            'ligand_location',
            'ligand_category',
            'resources',
            'references',
        ]
    )


    source_names = {
        'Baccin': 'Baccin2019',
        'Ramilowski': 'Ramilowski2015',
    }

    url = urls.urls['baccin2019']['url']
    c = curl.Curl(url, silent = False, large = True)
    data = inputs_common.read_xls(c.fileobj.name, sheet = 'SuppTable3')

    result = []

    # the first three rows are skipped (presumably headers)
    for rec in data[3:]:

        correct = rec[4].strip()

        if correct.lower() == 'incorrect':

            continue

        ligand_components = raw_to_uniprots(rec[1])

        if not ligand_components:

            continue

        receptor_components = raw_to_uniprots(rec[2])

        if not receptor_components:

            continue

        # an empty source cell must not become a resource name
        sources = {
            source_names.get(label, label)
            for label in ('Baccin2019', rec[3].strip())
            if label
        }

        # only all-digit items of the comma separated cell are kept
        references = {
            _ref for _ref in
            (clean_reference(ref) for ref in rec[7].split(','))
            if _ref.isdigit()
        }

        ligands = get_partners(ligand_components, sources, references)
        receptors = get_partners(receptor_components, sources, references)

        for ligand, receptor in itertools.product(ligands, receptors):
            result.append(
                Baccin2019Interaction(
                    ligand = ligand,
                    receptor = receptor,
                    correct = correct,
                    ligand_location = camel_to_snake(rec[5]),
                    ligand_category = camel_to_snake(rec[6]),
                    resources = sources,
                    references = references,
                )
            )

    return result




# [docs]
def baccin2019_annotations(ncbi_tax_id = 9606):
    """
    Ligand and receptor annotations derived from the records of
    ``baccin2019_interactions``.

    Args:
        ncbi_tax_id (int): NCBI Taxonomy ID, passed on unchanged to
            ``baccin2019_interactions``.

    Returns:
        dict: Keys are the values of the ``ligand`` and ``receptor`` fields
            of the interaction records; values are sets of
            ``Baccin2019Annotation`` named tuples with the fields
            ``mainclass``, ``subclass`` and ``location``.
    """

    Baccin2019Annotation = collections.namedtuple(
        'Baccin2019Annotation',
        ['mainclass', 'subclass', 'location'],
    )

    annotations = collections.defaultdict(set)

    for interaction in baccin2019_interactions(ncbi_tax_id = ncbi_tax_id):

        category = interaction.ligand_category

        # the ligand carries its own category and location
        annotations[interaction.ligand].add(
            Baccin2019Annotation(
                mainclass = 'ligand',
                subclass = category,
                location = interaction.ligand_location,
            )
        )

        # the receptor subclass is named after the ligand category,
        # except for the category `other`, which gives no subclass
        if category == 'other':

            receptor_subclass = None

        else:

            receptor_subclass = '%s_receptor' % category

        annotations[interaction.receptor].add(
            Baccin2019Annotation(
                mainclass = 'receptor',
                subclass = receptor_subclass,
                location = None,
            )
        )

    return dict(annotations)