Source code for pypath.inputs.string

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

from typing import Iterable, Literal

import collections

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.common as common


# Named cut-offs for the combined score. The scores parsed in this module
# are integers (they are read with ``int()``), so every threshold is given
# on the same integer scale: 0.150 on the 0-1 scale corresponds to 150 here.
CONFIDENCE_THRESHOLDS = {
    'highest_confidence': 900,
    'high_confidence': 700,
    'medium_confidence': 400,
    'low_confidence': 150,
}



[docs]
def string_effects(
        ncbi_tax_id: int = 9606,
        stimulation: str | Iterable[str] = 'activation',
        inhibition: str | Iterable[str] = 'inhibition',
        exclude: str | Iterable[str] = 'expression',
        score_threshold: int = 0,
    ) -> list[tuple]:
    """
    Downloads the STRING "actions" table and extracts signed, directed
    interactions from it.

    Only rows whose fifth column equals ``t`` and whose seventh column
    (read as an integer score) is at least ``score_threshold`` are used.
    The sixth column decides the direction: if it is ``t`` the first
    identifier is the source, otherwise the second one is.

    Args
        ncbi_tax_id: NCBI Taxonomy ID, interpolated into the download URL.
        stimulation: Label(s) in the third column translated to ``+``.
        inhibition: Label(s) in the third column translated to ``-``.
        exclude: Label(s) in the third column to discard, unless they are
            also listed in ``stimulation`` or ``inhibition``. Any other
            label is reported with the effect ``*``.
        score_threshold: Minimum score required to keep a row.

    Returns
        List of named tuples with the fields ``source``, ``target`` and
        ``effect``. Identifiers are reduced to their second dot-separated
        component (presumably dropping a taxon prefix -- TODO confirm).
    """

    StringEffectsInteraction = collections.namedtuple(
        'StringEffectsInteraction',
        (
            'source',
            'target',
            'effect',
        ),
    )

    effects = []

    stimulation = common.to_set(stimulation)
    inhibition = common.to_set(inhibition)
    exclude = common.to_set(exclude)

    url = urls.urls['string']['actions'] % ncbi_tax_id
    c = curl.Curl(url, silent = False, large = True)
    # the first line is discarded (presumably the column header)
    _ = next(c.result)

    for line in c.result:

        if hasattr(line, 'decode'):

            line = line.decode('ascii')

        line = line.strip()

        # ``''.split('\t')`` is ``['']``, which is truthy, so blank lines
        # have to be recognised before splitting
        if not line:

            continue

        fields = line.split('\t')

        if fields[4] != 't' or int(fields[6]) < score_threshold:

            continue

        mode = fields[2]

        if mode in stimulation:

            effect = '+'

        elif mode in inhibition:

            effect = '-'

        elif mode not in exclude:

            effect = '*'

        else:

            continue

        id_a = fields[0].split('.')[1]
        id_b = fields[1].split('.')[1]
        source, target = (id_a, id_b) if fields[5] == 't' else (id_b, id_a)

        effects.append(
            StringEffectsInteraction(
                source = source,
                target = target,
                effect = effect,
            )
        )

    return effects




[docs]
def string_links_interactions(
        ncbi_tax_id: int = 9606,
        score_threshold: int | Literal[
            'highest_confidence',
            'high_confidence',
            'medium_confidence',
            'low_confidence',
            ] = 'highest_confidence',
        physical_interaction_score: bool = True,
    ) -> Iterable[tuple]:
    """
    Downloads protein network data, including subscores per channel.
    The output contains both functional and physical protein associations.
    The combined physical interaction score is defined between the proteins
    for which we have evidence of their binding or forming a physical complex.

    This is a generator function: nothing is downloaded until the first
    record is requested, and the result can be iterated only once.

    Args
        ncbi_tax_id: NCBI Taxonomy ID, interpolated into the download URL.
        score_threshold: Minimum required interaction score. user can use
            pre-defined confidence limits or can define a custom value.
        physical_interaction_score: If True, the physical subnetwork is
            retrieved too (with no score filtering) and its combined score
            is attached to each record as ``physical_combined_score``;
            pairs absent from the physical subnetwork get ``None``. If
            False, the field is always ``None``.

    Yields
        Named tuples with the two protein identifiers, the seven channel
        subscores, the combined score and the physical combined score.
    """

    StringLinksInteraction = collections.namedtuple(
        'StringLinksInteraction',
        (
            'protein_a',
            'protein_b',
            'neighborhood_score',
            'fusion',
            'cooccurence',
            'coexpression',
            'experimental',
            'database',
            'textmining',
            'combined_score',
            'physical_combined_score',
        ),
    )

    # an empty lookup yields None for every pair, which is exactly the
    # value reported when physical scores are not requested
    phy_links = {}

    if physical_interaction_score:

        phy_links = {
            (i.protein_a, i.protein_b): i.combined_score
            for i in
            string_physical_interactions(
                ncbi_tax_id = ncbi_tax_id,
                score_threshold = 0,
            )
        }

    url = urls.urls['string']['links'] % ncbi_tax_id
    c = curl.Curl(url, silent = False, large = True)
    # the first line is discarded (presumably the column header)
    _ = next(c.result)

    # named levels are translated to numbers, anything else is used as is
    min_score = CONFIDENCE_THRESHOLDS.get(score_threshold, score_threshold)

    for line in c.result:

        fields = line.strip().split(' ')
        prot_a_id = fields[0].split('.')[1]
        prot_b_id = fields[1].split('.')[1]
        combined_score = int(fields[9])

        if combined_score < min_score:

            continue

        yield StringLinksInteraction(
            protein_a = prot_a_id,
            protein_b = prot_b_id,
            neighborhood_score = int(fields[2]),
            fusion = int(fields[3]),
            cooccurence = int(fields[4]),
            coexpression = int(fields[5]),
            experimental = int(fields[6]),
            database = int(fields[7]),
            textmining = int(fields[8]),
            combined_score = combined_score,
            physical_combined_score = phy_links.get((prot_a_id, prot_b_id)),
        )




[docs]
def string_physical_interactions(
        ncbi_tax_id: int = 9606,
        score_threshold: int | Literal[
            'highest_confidence',
            'high_confidence',
            'medium_confidence',
            'low_confidence',
        ] = 'highest_confidence',
    ) -> list[tuple]:
    """
    Retrieves the physical subnetwork of STRING with its per channel
    subscores. An interaction here means that the two proteins are part
    of a physical complex.

    Args
        ncbi_tax_id: NCBI Taxonomy ID, interpolated into the download URL.
        score_threshold: Lowest combined score to keep. Either one of the
            pre-defined confidence level names or a custom number.

    Returns
        List of named tuples with the two protein identifiers, the
        experimental, database and textmining subscores and the combined
        score.
    """

    StringPhysicalInteraction = collections.namedtuple(
        'StringPhysicalInteraction',
        (
            'protein_a',
            'protein_b',
            'experimental',
            'database',
            'textmining',
            'combined_score',
        ),
    )

    download = curl.Curl(
        urls.urls['string']['physical_links'] % ncbi_tax_id,
        silent = False,
        large = True,
    )
    # drop the first line before reading the records
    next(download.result)

    # named levels are translated to numbers, anything else is used as is
    cutoff = CONFIDENCE_THRESHOLDS.get(score_threshold, score_threshold)

    records = (row.strip().split(' ') for row in download.result)

    return [
        StringPhysicalInteraction(
            protein_a = rec[0].split('.')[1],
            protein_b = rec[1].split('.')[1],
            experimental = int(rec[2]),
            database = int(rec[3]),
            textmining = int(rec[4]),
            combined_score = int(rec[5]),
        )
        for rec in records
        if int(rec[5]) >= cutoff
    ]




[docs]
def string_species() -> dict[str, str]:
    """
    Downloads list of organisms in STRING.

    Returns
        Dict with the first column of the species table (tax ids) as keys
        and the fourth column (scientific names of organisms) as values.
        The keys are kept as strings, exactly as read from the file;
        convert an integer taxon ID with ``str()`` before looking it up.
    """

    species = {}

    url = urls.urls['string']['species']
    c = curl.Curl(url, silent = False, large = True)
    # the first line is discarded (presumably the column header)
    _ = next(c.result)

    for line in c.result:

        fields = line.strip().split('\t')
        tax_id = fields[0]
        official_name = fields[3]

        species[tax_id] = official_name

    return species