Source code for pypath.inputs.pdb

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import re
import collections

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.internals.intera as intera
import pypath.utils.reflists as reflists



[docs]
def pdb_uniprot():
    """
    Builds a two-way mapping between UniProt and PDB identifiers.

    The table is downloaded from ``urls.urls['uniprot_pdb']['url']`` and
    processed line by line. A line whose first column is a four character
    code of digits and capital letters opens a new PDB record; any line
    with more than one column seen after that contributes UniProt IDs
    (taken from between parentheses) to the current record.

    Returns a tuple of two dictionaries, or ``(None, None)`` if the
    download gave no result:
    * UniProt to PDB: keys are UniProt IDs, values are sets of tuples of
      three elements: the lowercase PDB structure ID, the structure
      analysis method and the resolution (float, or None if the
      resolution column is ``-``)
    * PDB to UniProt: keys are lowercase PDB IDs, values are sets of
      UniProt IDs
    """

    url = urls.urls['uniprot_pdb']['url']
    download = curl.Curl(url, silent = False)
    content = download.result

    if content is None:
        return None, None

    # patterns used to normalize a line before cutting it into columns
    before_paren = re.compile(r'[ ]*\(')
    around_comma = re.compile('[ ]+,[ ]+')
    column_sep = re.compile('[ ]{2,}')
    pdb_code = re.compile(r'[0-9A-Z]{4}')

    uniprot_to_pdb = collections.defaultdict(set)
    pdb_to_uniprot = collections.defaultdict(set)
    current = None

    for line in content.split('\n'):

        # attach the parenthesised IDs to the preceding name, tighten the
        # commas, then split the columns at runs of two or more spaces
        line = before_paren.sub('(', line)
        line = around_comma.sub(',', line)
        fields = column_sep.split(line)
        first = fields[0]

        if len(first) == 4 and pdb_code.match(first):

            # first line of a new PDB record
            current = first.lower()

            if fields[2] == '-':
                resolution = None
            else:
                resolution = float(fields[2].replace(' A', ''))

            method = fields[1]

        if current is None or len(fields) <= 1:
            continue

        # with four or more columns the IDs are in the fourth one,
        # otherwise they are in the second one
        column = fields[3] if len(fields) >= 4 else fields[1]
        accessions = set()

        for item in column.split(','):

            if '(' in item:
                accessions.add(item.split('(')[1].replace(')', ''))

        pdb_to_uniprot[current].update(accessions)

        for accession in accessions:
            uniprot_to_pdb[accession].add((current, method, resolution))

    return dict(uniprot_to_pdb), dict(pdb_to_uniprot)




[docs]
def pdb_chains():
    """
    Amino acid chain level mapping between PDB and UniProt.

    The table is downloaded from ``urls.urls['pdb_chains']['url']``. Its
    first two lines are skipped and every tab separated row with at least
    nine fields is processed; shorter rows are ignored.

    Returns two dictionaries, or ``(None, None)`` if the download gave no
    result:
    * The first has UniProt IDs as keys and lists of dicts as values. Each
      of these dicts defines a mapping between UniProt and PDB amino acid
      chains with the chain identifier, PDB structure identifier and the
      start and end of the chain in the UniProt sequence and the PDB
      structure; the offset value is an integer if the PDB and the UniProt
      chain are the same length, otherwise None.
    * The second dict has PDB IDs as keys and dicts of chain mapping dicts
      as values, which are similar to the ones in the previous point, but
      here the chain identifiers are the keys. If the same PDB ID and
      chain occur in more than one row, the last row is kept here.
    """

    # everything except digits, dots and minus signs is removed from a
    # field before it is converted to an integer
    non_digit = re.compile(r'[^\d.-]+')

    def to_int(i):
        """
        Converts one position field to int; the literal 'None' gives None.
        """

        if i == 'None':

            return None

        return int(non_digit.sub('', i))


    c = curl.Curl(urls.urls['pdb_chains']['url'], silent = False)
    chains = c.result

    if chains is None:

        return None, None

    # the first two lines are not data rows; slicing (instead of deleting
    # them one by one) does not fail if the download is shorter than that
    rows = chains.replace('\r', '').split('\n')[2:]
    pdb_u = {}
    u_pdb = {}

    for row in rows:

        fields = row.split('\t')

        if len(fields) <= 8:

            continue

        pdb_id, chain_id, uniprot = fields[:3]

        # each position is parsed only once and reused in both dicts
        (
            chain_beg,
            chain_end,
            pdb_beg,
            pdb_end,
            uniprot_beg,
            uniprot_end,
        ) = [to_int(value) for value in fields[3:9]]

        # the offset is defined only if all four positions are known and
        # the two ranges have the same length
        same_length = (
            pdb_end is not None and
            pdb_beg is not None and
            uniprot_beg is not None and
            uniprot_end is not None and
            pdb_end - pdb_beg == uniprot_end - uniprot_beg
        )
        offset = uniprot_beg - pdb_beg if same_length else None

        pdb_u.setdefault(pdb_id, {})[chain_id] = {
            'uniprot': uniprot,
            'chain_beg': chain_beg,
            'chain_end': chain_end,
            'pdb_beg': pdb_beg,
            'pdb_end': pdb_end,
            'uniprot_beg': uniprot_beg,
            'uniprot_end': uniprot_end,
            'offset': offset,
        }

        u_pdb.setdefault(uniprot, []).append({
            'pdb': pdb_id,
            'chain': chain_id,
            'chain_beg': chain_beg,
            'chain_end': chain_end,
            'pdb_beg': pdb_beg,
            'pdb_end': pdb_end,
            'uniprot_beg': uniprot_beg,
            'uniprot_end': uniprot_end,
            'offset': offset,
        })

    return u_pdb, pdb_u




[docs]
def pdb_complexes(organism = None):
    """
    Extracts protein complex data from PDB. The complexes are returned in
    a dict with string keys and ``pypath.internals.intera.Complex``
    objects as values. These latter carry their constitution, stoichiometry
    and the PDB identifiers.

    The chain level mapping is obtained from ``pdb_chains``; structures
    with only one mapped chain are skipped. Complexes with the same string
    representation are merged by the ``+=`` operator of the ``Complex``
    objects.

    :param organism:
        If set, complexes are dropped if ``reflists.is_not`` gives a true
        value for their UniProt IDs and this organism. If None (or any
        other false value) no organism filtering is done.

    :return:
        A dict of complexes; an empty dict if ``pdb_chains`` could not
        retrieve its data.
    """

    complexes = {}

    # only the PDB to UniProt direction is used; indexing the returned
    # tuple right away avoids keeping a reference to the other dict
    chains_by_pdb = pdb_chains()[1]

    # `pdb_chains` returns (None, None) if the download gave no result
    if chains_by_pdb is None:

        return complexes

    for pdb_id, chains in iteritems(chains_by_pdb):

        uniprots = tuple(chain['uniprot'] for chain in chains.values())

        if len(uniprots) == 1:

            continue

        # if the organism set and any of the UniProt IDs does not
        # belong to this organism we drop the complex
        if organism and reflists.is_not(uniprots, 'uniprot', organism):

            continue

        cplex = intera.Complex(
            components = uniprots,
            sources = 'PDB',
            ids = pdb_id,
        )
        key = str(cplex)

        if key in complexes:

            complexes[key] += cplex

        else:

            complexes[key] = cplex

    return complexes