Source code for pypath.inputs.pepcyber

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import os
import collections

import bs4
import pandas as pd

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.progress as progress
import pypath.share.cache as cache_mod
import pypath.share.session as session

# Logger dedicated to this input module; `_log` is a module-level shortcut
# to its `_log` method, used by the functions below to emit messages.
_logger = session.Logger(name = 'pepcyber_input')
_log = _logger._log



[docs]
def pepcyber_interactions(dataframe = False, cache = True):
    """
    Downloads phosphoprotein binding protein interactions
    from the PepCyber database (http://www.pepcyber.org/).

    The overview table is downloaded first, then the details page of each
    interaction is retrieved by `pepcyber_details`. The processed records
    are saved into a TSV file in the cache directory, and read from there
    at subsequent calls if `cache` is True.

    Args
        dataframe (bool): Return a pandas data frame instead of list of
            tuples.
        cache (bool): Read the data from an intermediate cache file,
            if available.

    Returns
        List of `PepcyberInteraction` named tuples, one for each pair of
        interaction and reference; or a `pandas.DataFrame` with the same
        fields as columns if `dataframe` is True. If the download fails,
        the result is empty and no cache file is written.
    """

    PepcyberInteraction = collections.namedtuple(
        'PepcyberInteraction',
        (
            'ppdb_class',
            'ppdb_genesymbol',
            'substrate_genesymbol',
            'binding_seq',
            'binding_pos',
            'all_evidences',
            'n_records',
            'category',
            'substrate_residue',
            'ppdb_uniprot',
            'ppdb_refseq',
            'substrate_uniprot',
            'substrate_refseq',
            'evidence',
            'pmid',
        ),
    )

    def get_cells(row):
        """
        Returns the cells of a table row if it looks like an interaction
        record: it has 10 cells and the 5th one contains a `span` of
        class `sequence`. Returns None for any other row.
        """

        cells = row.find_all('td')

        if len(cells) != 10:

            return None

        sp = cells[4].find('span')

        if (
            sp is not None and
            'class' in sp.attrs and
            'sequence' in sp.attrs['class']
        ):

            return cells


    cachefile = os.path.join(
        cache_mod.get_cachedir(),
        'pepcyber_details.tsv',
    )

    if cache and os.path.exists(cachefile):

        _log('Reading data from cache file `%s`.' % cachefile)
        tbl = pd.read_csv(cachefile, sep = '\t', dtype = {'pmid': 'string'})
        # missing values come back from the TSV as NaN/NA,
        # here we restore them to None
        result = [
            PepcyberInteraction(
                *(f if pd.notna(f) else None for f in row)
            )
            for row in tbl.itertuples(index = False)
        ]

    else:

        _log('Downloading PepCyber data.')

        url = urls.urls['pepcyber']['rescued']
        # this is huge, takes a few minutes!
        c = curl.Curl(
            url,
            silent = False,
            timeout = 600,
            encoding = 'iso-8859-1',
        )
        data = c.result
        result = []

        if data:

            soup = bs4.BeautifulSoup(data, 'html.parser')
            rows = soup.find_all('tr')

            prg = progress.Progress(
                len(rows),
                'Retrieving and processing PepCyber data',
                7,
            )

            for row in rows:

                prg.step()
                cells = get_cells(row)

                if cells is None:

                    continue

                row_txt = [cell.text.strip() for cell in cells]

                if len(row_txt) <= 9 or not row_txt[5].isdigit():

                    continue

                inum = int(row.find('a')['name'])

                # the substrate residue is the character following the
                # `p` mark in the binding sequence; None if there is no
                # `p` or nothing follows it
                seq = row_txt[4]
                ppos = seq.find('p')
                row_txt[9] = (
                    seq[ppos + 1]
                        if 0 <= ppos < len(seq) - 1 else
                    None
                )

                details = pepcyber_details(inum)

                # UniProt and RefSeq IDs of the binding protein
                # and the substrate, looked up by gene symbol
                row_txt.extend(details.get(row_txt[2], [None, None]))
                row_txt.extend(details.get(row_txt[3], [None, None]))

                # these are the same for every reference, hence we
                # convert them only once (`binding_pos` and `n_records`)
                row_txt[5] = int(row_txt[5])
                row_txt[7] = int(row_txt[7])

                # one record with empty evidence and reference
                # if no references available
                refs = details['_refs'] or [(None,) * 3]

                for ref in refs:

                    # the first cell of the row is dropped, just like
                    # the category from the reference: the latter is
                    # already among the cells of the row
                    result.append(
                        PepcyberInteraction(
                            *(row_txt[1:] + list(ref[1:]))
                        )
                    )

        else:

            _log('Failed to download PepCyber data from `%s`.' % url)

        tbl = pd.DataFrame.from_records(
            result,
            columns = PepcyberInteraction._fields,
        )

        if result:

            _log('Saving data to `%s`.' % cachefile)
            tbl.to_csv(cachefile, sep = '\t', index = False)

        else:

            # an empty cache file would hide the failure at all
            # subsequent calls, hence we rather don't create it
            _log(
                'No PepCyber interactions retrieved, '
                'not writing cache file `%s`.' % cachefile
            )

    return tbl if dataframe else result




[docs]
def pepcyber_details(num):
    """
    Retrieves detailed information about an interaction from the PepCyber
    database.

    Args
        num: Identifier of the interaction, it is substituted into
            the URL of the details page.

    Returns
        Dict with gene symbols as keys and lists of length 2 as values,
        with UniProt ID and RefSeq protein ID. A special key `_refs`
        holds a list of `PepcyberReference` named tuples, each with
        category, evidence type and PubMed reference information.
        If nothing could be downloaded, the dict contains only the
        `_refs` key with an empty list.
    """

    PepcyberReference = collections.namedtuple(
        'PepcyberReference',
        ('category', 'evidence', 'reference')
    )

    def cell_text(td):
        """
        Text of a cell in the references table: if the cell contains
        a link, the text of the link, preferring an anchor nested in the
        first one if there is any; otherwise the text of the cell itself.
        """

        link = td.find('a')

        if link is None:

            return td.text

        nested = link.find('a')

        return (link if nested is None else nested).text


    result = {'_refs': []}
    url = urls.urls['pepcyber']['details_rescued'] % num
    c = curl.Curl(url, encoding = 'iso-8859-1')
    data = c.result

    if not data:

        return result

    soup = bs4.BeautifulSoup(data, 'html.parser')
    gname = None
    # must be defined even if a SwissProt cell comes before any RefSeq
    refseq = None
    prev = ''

    # each value is in the cell following the cell with its label,
    # hence we always keep the text of the previous cell
    for td in soup.find_all('td'):

        text = td.text.strip()

        if prev.startswith('Gene name'):

            gname = text.split('(')[0]

        if prev.startswith('RefSeq'):

            refseq = text

        if prev.startswith('SwissProt') and gname is not None:

            # gene names starting with `\xce` are skipped
            if gname and gname[0] != u'\xce':

                result[gname] = [text, refseq]

            # reset both, so nothing is carried over to the next protein
            gname = None
            refseq = None

        prev = text

    records = soup.find(text = 'Records:')

    if records is not None:

        # the table of references is in the element following the
        # great-grandparent of the label; its first row is skipped
        refs = (
            records.
            parent.
            parent.
            parent.
            next_sibling.
            find('table').
            find_all('tr')
        )[1:]

        result['_refs'] = [
            PepcyberReference(
                *(cell_text(td) for td in tr.find_all('td'))
            )
            for tr in refs
        ]

    return result