# Source code for pypath.inputs.spike

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import itertools
import collections

from typing import Dict, List

import xml.etree.cElementTree as ET

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.utils.mapping as mapping
import pypath.internals.intera as intera



# [docs]
def spike_interactions(min_confidence: int = 2) -> List[tuple]:
    """
    Retrieves interactions and regulations from the SPIKE database.

    Downloads the SPIKE archive from the URL registered under the `spike`
    key in `pypath.resources.urls`, parses `LatestSpikeDB.xml` from it and
    collects the records of its `RegulationBlock` and `InteractionBlock`.
    Partners are resolved against the `Gene` and `Group` (of type
    `Complex`) elements of the `BuildingBlock`; records referring to
    any other kind of partner are skipped.

    Args
        min_confidence:
            Confidence (integrity) levels in SPIKE span from 1 to 4, 1
            being the highest confidence. Only records with an integrity
            value lower than or equal to this number are returned.

    Returns
        A list of `SpikeInteraction` named tuples. For a gene the
        `entrez_*` and `genesymbol_*` fields hold the ID of its `XRef`
        element and its name; for a complex both fields hold an
        `intera.Complex` object. If a complex has more than one variant
        (due to ambiguous ID translation of its members), one record is
        created for each combination of variants. The `directed` field is
        the string `'1'` for regulations and `'0'` for interactions.
    """

    url = urls.urls['spike']['url']
    c = curl.Curl(
        url,
        silent = False,
        large = True,
        files_needed = ['LatestSpikeDB.xml'],
    )
    spikexml = c.result

    xml = ET.parse(spikexml['LatestSpikeDB.xml'])

    xmlroot = xml.getroot()

    bblock = xmlroot.find('BuildingBlock')
    rblock = xmlroot.find('RegulationBlock')
    iblock = xmlroot.find('InteractionBlock')

    # building block ID -> list of `Gene` tuples; one element for a gene,
    # one element for each variant of a complex
    genes = {}
    result = []

    SpikeInteraction = collections.namedtuple(
        'SpikeInteraction',
        (
            'entrez_a',
            'genesymbol_a',
            'entrez_b',
            'genesymbol_b',
            'directed',
            'pmids',
            'integrity',
            'effect',
            'assay',
            'data_source',
            'description',
            'mechanism',
            'regulation',
        )
    )

    Gene = collections.namedtuple('Gene', ('entrez', 'genesymbol', 'type'))

    # iterating genes

    for gene in bblock.findall('Gene'):

        genes[gene.attrib['id']] = [
            Gene(
                entrez = gene.find('XRef').attrib['id'],
                genesymbol = gene.attrib.get('name', ''),
                type = 'gene',
            )
        ]

    # iterating complexes

    for grp in bblock.findall('Group'):

        if grp.attrib['type'] != 'Complex':

            continue

        members = [m.attrib['ref'] for m in grp.findall('Member')]

        # only complexes built exclusively of known, plain genes are
        # processed; `genes.get(m)` is falsy both for unknown members
        # and for complexes registered without any variant (empty list),
        # the latter would raise IndexError if indexed by `[0]`
        if not all(
            genes.get(m) and genes[m][0].type != 'complex'
            for m in members
        ):

            continue

        uniprots = [
            mapping.map_name(genes[m][0].entrez, 'entrez', 'uniprot')
            for m in members
        ]

        # one complex for each combination of the members' UniProt IDs;
        # the list is empty if any of the members has no UniProt ID
        genes[grp.attrib['id']] = [
            Gene(
                entrez = cplex,
                genesymbol = cplex,
                type = 'complex',
            )
            for cplex in
            (
                intera.Complex(
                    name = grp.attrib['name'],
                    components = ups,
                    sources = 'SPIKE',
                )
                for ups in itertools.product(*uniprots)
            )
        ]

    # iterating regulations and interactions

    for i in itertools.chain(
            rblock.findall('Regulation'),
            iblock.findall('Interaction'),
        ):

        regulation = i.tag == 'Regulation'
        src_tag = 'Source' if regulation else 'ProteinA'
        tgt_tag = 'PhysicalTarget' if regulation else 'ProteinB'

        src = i.find(src_tag).attrib['ref']
        tgt = i.find(tgt_tag).attrib['ref']
        itg = i.attrib['integrity']

        # filtering first, to avoid extracting the details of records
        # which are discarded anyways
        if (
            src not in genes or
            tgt not in genes or
            int(itg) > min_confidence
        ):

            continue

        ds = i.attrib['dataSource']
        eff = i.attrib.get('effect', '')
        mec = i.attrib.get('mechanism', '')
        asy = i.attrib.get('biologicalAssay', '')
        dcd = str(int(regulation))
        # `findtext` returns the default if the element is missing and
        # an empty string if the element is present but has no text
        dsc = i.findtext('Description', default = '').replace('\n', ' ')
        pmids = ';'.join(r.attrib['pmid'] for r in i.findall('Reference'))

        result.extend(
            SpikeInteraction(
                entrez_a = _src.entrez,
                genesymbol_a = _src.genesymbol,
                entrez_b = _tgt.entrez,
                genesymbol_b = _tgt.genesymbol,
                directed = dcd,
                pmids = pmids,
                integrity = itg,
                effect = eff,
                assay = asy,
                data_source = ds,
                description = dsc,
                mechanism = mec,
                regulation = regulation,
            )
            for _src, _tgt in itertools.product(genes[src], genes[tgt])
        )

    return result




# [docs]
def spike_complexes(min_confidence: int = 2) -> Dict[str, intera.Complex]:
    """
    Collects the complexes participating in SPIKE interactions.

    Args
        min_confidence:
            Passed to `spike_interactions`: only the complexes from
            the interactions retrieved at this confidence threshold are
            included.

    Returns
        A dict with the string representations of the complexes as keys
        and the `intera.Complex` objects as values.
    """

    by_label = {}

    for ia in spike_interactions(min_confidence = min_confidence):

        # either partner of an interaction might be a complex
        for partner in (ia.entrez_a, ia.entrez_b):

            if isinstance(partner, intera.Complex):

                by_label[partner.__str__()] = partner

    return by_label