Source code for pypath.inputs.pdzbase

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import re
import collections

import bs4

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.utils.taxonomy as taxonomy



[docs]
def pdzbase_interactions():
    """
    Retrieves the PDZbase interaction table and converts it to records.

    The page is fetched from the ``url_rescued`` address of the
    ``pdzbase`` entry in ``urls.urls`` and parsed as HTML. Each table row
    after the first one is turned into one record.

    In each record the UniProt IDs are separated from their isoform
    numbers; if an ID carries no isoform suffix, the isoform is set to 1.
    The PDZ domain number and the PubMed ID are converted to integers,
    and the organism cell is passed through
    ``taxonomy.ensure_ncbi_tax_id``.

    Returns
        List of ``PDZbaseInteraction`` named tuples, one for each
        processed row of the table.
    """

    record = collections.namedtuple(
        'PDZbaseInteraction',
        [
            'uniprot_pdz',
            'isoform_pdz',
            'uniprot_ligand',
            'isoform_ligand',
            'genesymbol_pdz',
            'genesymbol_ligand',
            'pdz_domain',
            'organism',
            'pubmed',
        ],
    )

    # Accession optionally followed by a dash and an isoform number,
    # e.g. O14754-1; the two groups are the accession and the isoform
    reupi = re.compile(r'([\w]{6,10})(?:-([0-9]{1,2}))?')

    def isoform_number(suffix):
        # no isoform suffix captured: use isoform 1
        return int(suffix) if suffix else 1

    page = curl.Curl(
        urls.urls['pdzbase']['url_rescued'],
        silent = False,
    ).result
    soup = bs4.BeautifulSoup(page, 'html.parser')

    # the rows are taken from a table nested two levels deep
    # within the fourth table of the page
    outer_table = soup.find_all('table')[3]
    data_table = outer_table.find('table').find('table')
    rows = data_table.find_all('tr')

    interactions = []

    # the first row is dropped, it is not processed as a record
    rows.pop(0)

    for tr in rows:

        cells = [td.text.strip() for td in tr.find_all('td')]

        pdz_ac, pdz_iso = reupi.match(cells[1]).groups()
        ligand_ac, ligand_iso = reupi.match(cells[4]).groups()

        interactions.append(
            record(
                uniprot_pdz = pdz_ac,
                isoform_pdz = isoform_number(pdz_iso),
                uniprot_ligand = ligand_ac,
                isoform_ligand = isoform_number(ligand_iso),
                genesymbol_pdz = cells[0],
                genesymbol_ligand = cells[3],
                pdz_domain = int(cells[2]),
                organism = taxonomy.ensure_ncbi_tax_id(cells[5]),
                pubmed = int(cells[6]),
            )
        )

    return interactions