Source code for pypath.inputs.csa

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

try:
    from cStringIO import StringIO
except ModuleNotFoundError:
    from io import StringIO

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.progress as progress
import pypath.inputs.pdb as pdb_input
import pypath.inputs.common as inputs_common
import pypath.utils.pdb as pdb_utils
import pypath.internals.intera as intera



[docs]
def get_csa(uniprots = None):
    """
    Retrieves the catalytic sites table and maps its residues to UniProt.

    Each record of the table names a residue in a chain of a PDB
    structure. A record is kept only if its PDB ID and chain are present
    in the PDB-to-UniProt mapping returned by ``pdb_input.pdb_chains``
    and, when ``uniprots`` is given, the mapped UniProt ID is among them.
    The residue number is shifted by the offset of the chain if the
    mapping provides one; otherwise it is taken from the ``UPCOUNT``
    field of the result of ``pdb_utils.residue_pdb``, and the record is
    left out if that result is empty.

    :param uniprots:
        Optional container of UniProt IDs to restrict the output to;
        ``None`` means no filtering.

    :return:
        ``None`` if the download gave no data. Otherwise nested dicts:
        UniProt ID -> PDB ID -> value of the ``id`` column -> list of
        ``intera.Residue`` objects.
    """

    url = urls.urls['catalytic_sites']['url']
    raw = curl.Curl(url, silent = False).result

    if raw is None:

        return None

    # only the PDB -> UniProt direction of the mapping is used here
    _, pdb_to_uniprot = pdb_input.pdb_chains()

    buff = StringIO()
    buff.write(raw)

    field_names = (
        'pdb',
        'id',
        'resname',
        'chain',
        'resnum',
        'chem_fun',
        'evidence',
    )
    table = inputs_common.read_table(
        cols = {name: idx for idx, name in enumerate(field_names)},
        fileObject = buff,
        sep = ',',
        hdr = 1,
    )

    def locate(record):
        """
        Gives the ``(UniProt ID, residue number)`` pair for one record,
        or ``None`` if the record has to be left out.
        """

        pdb_id = record['pdb']

        if pdb_id not in pdb_to_uniprot:

            return None

        chains = pdb_to_uniprot[pdb_id]
        chain_id = record['chain']

        if chain_id not in chains:

            return None

        chain_info = chains[chain_id]
        accession = chain_info['uniprot']

        if uniprots is not None and accession not in uniprots:

            return None

        shift = chain_info['offset']

        if shift is not None:

            return accession, int(record['resnum']) + shift

        # no offset known for this chain: ask for this single residue
        looked_up = pdb_utils.residue_pdb(
            pdb_id,
            chain_id,
            record['resnum'],
        )

        if len(looked_up) > 0:

            return accession, int(looked_up['UPCOUNT'])

        return None

    css = {}
    prg = progress.Progress(len(table), 'Processing catalytic sites', 11)

    for record in table:

        hit = locate(record)

        if hit is not None:

            accession, position = hit
            by_pdb = css.setdefault(accession, {})
            by_site = by_pdb.setdefault(record['pdb'], {})
            by_site.setdefault(record['id'], []).append(
                intera.Residue(
                    name = record['resname'],
                    number = position,
                    protein = accession,
                )
            )

        # the progress bar advances for every record, kept or not
        prg.step()

    prg.terminate()

    return css