Source code for pypath.inputs.intact

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from numbers import Number
from typing import List

import collections

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.progress as progress



[docs]
def intact_interactions(
        miscore: Number = .6,
        organism: int = 9606,
        complex_expansion: bool = False,
        only_proteins: bool = False,
        only_ids: bool = False,
    ) -> List[tuple]:
    """
    Interactions from the IntAct MI-tab file.

    Retrieves the file `intact.txt` from the URL registered under
    ``urls.urls['intact']['mitab']``, skips its first (header) line and
    converts each remaining tab separated record that passes the filters
    below into an ``IntactInteraction`` named tuple.

    miscore : Number
        Lowest accepted value of the `intact-miscore` confidence score.
        Records without such a score are treated as having a score of 0.
    organism : int
        Taxon identifier. Both interacting partners must belong to this
        taxon. An integer is converted to string before comparing to the
        values in the file; `None` disables the organism filter.
    complex_expansion : bool
        Keep also the records with `expansion` in their 16th column.
    only_proteins : bool
        Keep only records of protein-protein interactions.
    only_ids : bool
        Load only the identifiers of interacting pairs
        (smaller memory footprint).
        NOTE: currently this argument has no effect, the function body
        does not use it.

    Returns a list of ``IntactInteraction`` named tuples with the fields
    `id_a`, `id_b`, `id_type_a`, `id_type_b`, `pubmeds`, `methods`,
    `interaction_types`, `mi_score`, `isoform_a` and `isoform_b`;
    `pubmeds`, `methods` and `interaction_types` are sets of strings.
    """

    # the highest column index accessed below is 15
    MIN_FIELDS = 16

    id_types = {
        'uniprotkb': 'uniprot',
    }

    IntactInteraction = collections.namedtuple(
        'IntactInteraction',
        (
            'id_a',
            'id_b',
            'id_type_a',
            'id_type_b',
            'pubmeds',
            'methods',
            'interaction_types',
            'mi_score',
            'isoform_a',
            'isoform_b',
        ),
    )
    IntactInteraction.__new__.__defaults__ = (None,) * 7


    def get_id_type(field):
        # the database prefix of an identifier field, translated
        # by `id_types` where a translation is available

        id_type = None if field == '-' else field.split(':')[0]

        return id_types.get(id_type, id_type)


    def get_uniprot_id(field):
        # the UniProt accession without any suffix after a dash,
        # and the isoform number if the suffix is numeric

        uniprot, isoform = _try_isoform(
            field.split(':')[1].replace('"', '')
        )

        uniprot = uniprot.split('-')[0]

        return uniprot, isoform


    def get_ebi_id(field):
        # identifier of a partner that is not from UniProt;
        # the second element is the isoform, always None here

        if field == '-':

            return None, None

        # splitting only at the first colon because the identifier
        # itself may contain colons, e.g. `chebi:"CHEBI:15422"`;
        # a field without any colon is used as it is
        partner_id = field.split(':', 1)[-1].strip('"')

        return partner_id, None


    def get_taxon(field):
        # the taxon identifier (as string) from the first entry
        # of the field; 0 if the field is empty

        return (
            0
                if field == '-' else
            field.split('|')[0].split(':')[1].split('(')[0]
        )


    def get_names(field):
        # the names in parentheses from a pipe separated field;
        # entries without parentheses (e.g. the `-` placeholder)
        # provide no name and are ignored

        return {
            item.split('(')[1].strip(')"')
            for item in field.split('|')
            if '(' in item
        }


    results = []
    url = urls.urls['intact']['mitab']

    if isinstance(organism, int):

        # taxon identifiers are compared as strings
        organism = '%u' % organism

    c = curl.Curl(
        url,
        silent = False,
        large = True,
        files_needed = ['intact.txt'],
        slow = True,
    )

    data = c.result['intact.txt']
    size = c.sizes['intact.txt']
    prg = progress.Progress(size, 'Reading IntAct MI-tab file', 99)

    for lnum, line in enumerate(data):

        prg.step(len(line))

        if lnum == 0:

            # the first line is the header
            continue

        fields = line.strip('\n\r ').split('\t')

        if len(fields) < MIN_FIELDS:

            # blank or truncated line
            continue

        if organism is not None and not (
            get_taxon(fields[9]) == organism and
            get_taxon(fields[10]) == organism
        ):

            continue

        if not complex_expansion and 'expansion' in fields[15]:

            continue

        # finding mi-score
        sc = 0

        for s in fields[14].split('|'):

            if s.startswith('intact-miscore'):

                sc = float(s.split(':')[1])

        # filtering for mi-score
        if sc < miscore:

            continue

        id_type_a = get_id_type(fields[0])
        id_type_b = get_id_type(fields[1])

        if (
            only_proteins and not (
                id_type_a == 'uniprot' and
                id_type_b == 'uniprot'
            )
        ):

            continue

        id_a, isoform_a = (
            get_uniprot_id(fields[0])
                if id_type_a == 'uniprot' else
            get_ebi_id(fields[0])
        )

        id_b, isoform_b = (
            get_uniprot_id(fields[1])
                if id_type_b == 'uniprot' else
            get_ebi_id(fields[1])
        )

        # only the references with `pubmed` prefix are kept
        pubmeds = {
            ref[1]
            for ref in (
                ref.split(':')
                for ref in fields[8].split('|')
            )
            if ref[0] == 'pubmed'
        }

        results.append(
            IntactInteraction(
                id_a = id_a,
                id_b = id_b,
                id_type_a = id_type_a,
                id_type_b = id_type_b,
                pubmeds = pubmeds,
                methods = get_names(fields[6]),
                interaction_types = get_names(fields[11]),
                mi_score = sc,
                isoform_a = isoform_a,
                isoform_b = isoform_b,
            )
        )

    prg.terminate()

    return results



def _try_isoform(name):

    name = name.split('-')

    if len(name) > 1 and name[1].isdigit():

        isoform = int(name[1])
        main = name[0]

    else:

        main = '-'.join(name)
        isoform = None

    return main, isoform