Source code for pypath.inputs.trip

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import os
import bs4
import pickle

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.share.cache as cache
import pypath.share.session as session
import pypath.share.common as common

_logger = session.Logger(name = 'trip_input')
_log = _logger._log



[docs]
def take_a_trip(cachefile = None):
    """
    Downloads TRIP data from webpage and preprocesses it.
    Saves preprocessed data into `cachefile` and next
    time loads from this file.

    :arg cachefile str:
        Path to pickle dump of preprocessed TRIP database. If does not exist
        the database will be downloaded and saved to this file. By default
        the path queried from the ``settings`` module.
    """

    cachefile = cachefile or cache.cache_item('trip_preprocessed')

    if os.path.exists(cachefile):

        _log(
            'Loading preprocessed TRIP database '
            'content from `%s`' % cachefile
        )
        result = pickle.load(open(cachefile, 'rb'))

        return result

    _log('No cache found, downloading and preprocessing TRIP database.')

    result = {'sc': {}, 'cc': {}, 'vvc': {}, 'vtc': {}, 'fc': {}}
    intrs = {}
    titles = {
        'Characterization': 'cc',
        'Screening': 'sc',
        'Validation: In vitro validation': 'vtc',
        'Validation: In vivo validation': 'vvc',
        'Functional consequence': 'fc',
    }

    interactors = {}
    base_url = urls.urls['trip']['base_rescued']
    show_url = urls.urls['trip']['show_rescued']
    c = curl.Curl(base_url)
    mainhtml = c.result
    mainsoup = bs4.BeautifulSoup(mainhtml, 'html.parser')
    trppages = common.flat_list(
        [
            [a.attrs['href'] for a in ul.find_all('a')]
            for ul in mainsoup.
                find('div', id = 'trp_selector').
                find('ul').
                find_all('ul')
        ]
    )

    for trpp in trppages:

        trp = trpp.split('/')[-1]
        trpurl = show_url % trp
        c = curl.Curl(trpurl, silent = False)
        trphtml = c.result
        trpsoup = bs4.BeautifulSoup(trphtml, 'html.parser')
        trp_uniprot = trip_find_uniprot(trpsoup)

        if trp_uniprot is None or len(trp_uniprot) < 6:

            _log('Could not find UniProt for %s' % trp)

        for tab in trpsoup.find_all('th', colspan = ['11', '13']):

            ttl = titles[tab.text.strip()]
            tab = tab.find_parent('table')
            trip_process_table(tab, result[ttl], intrs, trp_uniprot)

    _log('Saving processed TRIP database content to `%s`' % cachefile)
    pickle.dump(result, open(cachefile, 'wb'))

    return result




[docs]
def trip_process_table(tab, result, intrs, trp_uniprot):
    """
    Processes one HTML table downloaded from TRIP webpage.

    @tab : bs4.element.Tag()
        One table of interactions from TRIP webpage.
    @result : dict
        Dictionary the data should be filled in.
    @intrs : dict
        Dictionary of already converted interactor IDs.
        This serves as a cache so do not need to look up
        the same ID twice.
    @trp_uniprot : str
        UniProt ID of TRP domain containing protein.
    """

    for row in tab.find_all('tr'):

        cells = row.find_all(['td', 'th'])

        if 'th' not in [c.name for c in cells]:

            intr = cells[2].text.strip()

            if intr not in intrs:

                intr_uniprot = trip_get_uniprot(intr)
                intrs[intr] = intr_uniprot

                if intr_uniprot is None or len(intr_uniprot) < 6:
                    _log('Could not find UniProt for %s' % intr)

            else:

                intr_uniprot = intrs[intr]

            if (trp_uniprot, intr_uniprot) not in result:

                result[(trp_uniprot, intr_uniprot)] = []

            result[(trp_uniprot, intr_uniprot)].append(
                [c.text.strip() for c in cells]
            )




[docs]
def trip_get_uniprot(syn):
    """
    Downloads table from TRIP webpage and UniProt attempts to
    look up the UniProt ID for one synonym.

    @syn : str
        The synonym as shown on TRIP webpage.
    """

    url = urls.urls['trip']['show_rescued'] % syn
    c = curl.Curl(url)

    if c.result:

        soup = bs4.BeautifulSoup(c.result, 'html.parser')

        return trip_find_uniprot(soup)




[docs]
def trip_find_uniprot(soup):
    """
    Looks up a UniProt name in table downloaded from TRIP
    webpage.

    @soup : bs4.BeautifulSoup
        The `BeautifulSoup` instance returned by
        ``pypath.inputs.trip.trip_get_uniprot``.
    """

    for tr in soup.find_all('div', id = 'tab2')[0].find_all('tr'):

        if (
            tr.find('td') is not None and
            tr.find('td').text.strip() == 'Human'
        ):

            uniprot = tr.find_all('td')[2].text.strip()

            return uniprot

    return None




[docs]
def trip_process(
        exclude_methods = ['Inference', 'Speculation'],
        predictions = False,
        species = 'Human',
        strict = False,
    ):
    """
    Downloads TRIP data by calling `pypath.dadio.take_a_trip()` and
    further provcesses it.
    Returns dict of dict with TRIP data.

    @exclude_methods : list
        Interaction detection methods to be discarded.
    @predictions : bool
        Whether to include predicted interactions.
    @species : str
        Organism name, e.g. `Human`.
    @strict : bool
        Whether include interactions with species not
        used as a bait or not specified.
    """

    nd = 'Not determined'
    spec = set([]) if strict \
        else set(['Not specified', 'Not used as a bait', ''])
    spec.add(species)
    result = {}
    data = take_a_trip()

    for uniprots in common.unique_list(
            common.flat_list([v.keys() for v in data.values()])):
        to_process = False
        refs = set([])
        mets = set([])
        tiss = set([])
        reg = set([])
        eff = set([])

        if uniprots in data['sc']:
            for sc in data['sc'][uniprots]:
                if sc[4] in spec and sc[6] in spec and \
                    (predictions or sc[9] != 'Prediction') and \
                        sc[3] not in exclude_methods:
                    refs.add(sc[10])
                    mets.add(sc[3])
                    tiss.add(sc[7])

        if uniprots in data['vtc']:
            for vtc in data['vtc'][uniprots]:
                if vtc[4] in spec and vtc[7] in spec and \
                        vtc[3] not in exclude_methods:
                    refs.add(vtc[10])
                    mets.add(vtc[3])

        if uniprots in data['vvc']:
            for vvc in data['vvc'][uniprots]:
                if vvc[6] in spec and vvc[8] in spec and \
                        vvc[3] not in exclude_methods:
                    refs.add(vvc[10])
                    mets.add(vvc[3])

                    if len(vvc[4]) > 0:
                        tiss.add(vvc[4])

                    if len(vvc[5]) > 0:
                        tiss.add(vvc[5])

        if uniprots in data['cc']:
            for cc in data['cc'][uniprots]:
                if cc[4] in spec and cc[6] in spec and \
                        cc[3] not in exclude_methods:
                    refs.add(cc[10])
                    mets.add(cc[3])

                    if (cc[5] != nd and len(cc[5]) > 0) or \
                            (cc[7] != nd and len(cc[7]) > 0):
                        reg.add((cc[5], cc[7]))

        if uniprots in data['fc']:
            for fc in data['fc'][uniprots]:
                mets.add(fc[3])
                refs.add(fc[7])

                if len(fc[5]) > 0:
                    eff.add(fc[5])

                if len(fc[6]) > 0:
                    eff.add(fc[6])

        if len(refs) > 0:
            result[uniprots] = {
                'refs': refs,
                'methods': mets,
                'tissues': tiss,
                'effect': eff,
                'regions': reg
            }

    return result




[docs]
def trip_interactions(
        exclude_methods = ['Inference', 'Speculation'],
        predictions = False,
        species = 'Human',
        strict = False,
    ):
    """
    Obtains processed TRIP interactions by
    calling ``pypath.inputs.trip.trip_process``
    and returns list of interactions. All arguments are passed to
    ``trip_process``, see their definition there.
    """

    data = trip_process(exclude_methods, predictions, species, strict)

    def trip_effect(eff):
        pos = {
            'Sensitization',
            'Activation',
            'Increase in plasma membrane level',
            'Increase in lysosomal membrane level',
            'New channel creation',
        }
        neg = {
            'Desensitization',
            'Decrease in plasma membrane level',
            'Inhibition',
            'Internalization from membrane by ligand',
            'Retain in the endoplasmic reticulum',
        }

        return (
            'stimulation'
                if len(eff & pos) > 0 else
            'inhibition'
                if len(eff & neg) > 0 else
            'unknown'
        )

    return [
        [
            unipr[0],
            unipr[1],
            ';'.join(d['refs']),
            ';'.join(d['methods']),
            trip_effect(d['effect'])
        ]
        for unipr, d in iteritems(data)
    ]