# Source code for pypath.inputs.pisa

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems
from past.builtins import xrange, range

import os
import sys
import re
import collections

try:
    import cPickle as pickle
except:
    import pickle

import bs4

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.common as common
import pypath.share.cache as cache
import pypath.share.progress as progress
import pypath.utils.pdb as pdb_utils
import pypath.inputs.pdb as pdb_input
import pypath.internals.intera as intera


# One bond between two residues: for each of the two sides the UniProt ID,
# the chain identifier, the residue and its sequence number are stored.
PisaBond = collections.namedtuple(
    'PisaBond',
    'uniprot_1 chain_1 residue_1 seqnum_1 '
    'uniprot_2 chain_2 residue_2 seqnum_2',
)


def pisa_bonds(bonds, chains):
    """
    Collects the ``bond`` elements found under one parsed element into
    ``PisaBond`` records.

    To be refactored in the future. If you are interested in using this
    function, please contact the authors.

    Args
        bonds: A parsed element exposing ``find_all`` and ``find``; its
            ``bond`` descendants are processed.
        chains (dict): Chain identifiers as keys, UniProt IDs as values.

    Returns
        A list of ``PisaBond`` named tuples. Bonds where either chain has
        no (non-empty) UniProt ID in ``chains`` are left out.
    """

    not_numeric = re.compile(r'[^\d.-]+')

    def text_of(element, tag):
        # Text content of the first `tag` child of `element`.
        return element.find(tag).text

    def seqnum_of(element, tag):
        # Keep only digits, dots and dashes before converting to int.
        return int(not_numeric.sub('', text_of(element, tag)))

    def residue_of(element, tag):
        # Translate the residue name by `common.aaletters`; names missing
        # from that mapping are kept as they are.
        name = text_of(element, tag)
        return common.aaletters.get(name, name)

    records = []

    for element in bonds.find_all('bond'):

        num_a = seqnum_of(element, 'seqnum-1')
        num_b = seqnum_of(element, 'seqnum-2')
        res_a = residue_of(element, 'res-1')
        res_b = residue_of(element, 'res-2')
        chain_a = text_of(element, 'chain-1')
        chain_b = text_of(element, 'chain-2')
        uniprot_a = chains.get(chain_a)
        uniprot_b = chains.get(chain_b)

        # Both ends must be mapped to a UniProt ID.
        if not (uniprot_a and uniprot_b):

            continue

        records.append(
            PisaBond(
                uniprot_a,
                chain_a,
                res_a,
                num_a,
                uniprot_b,
                chain_b,
                res_b,
                num_b,
            )
        )

    return records


def _pisa_float(interface, tag):
    """
    Reads one numeric child element of a parsed ``interface`` element.

    Args
        interface: A parsed element exposing ``find``.
        tag (str): Name of the child element to read.

    Returns
        The text of the child element converted to float after removing
        everything matched by ``common.non_digit``; None if nothing is
        left after the removal.
    """

    value = common.non_digit.sub('', interface.find(tag).text)

    return float(value) if value else None


def pisa_interfaces(pdbs, return_unmapped = False):
    """
    To be refactored in the future. If you are interested in using this
    function, please contact the authors.

    Structures already present in the pickle cache (``pisa.pickle`` in the
    cache directory) are not queried again; the rest is downloaded in
    batches of 5 PDB IDs per request. The cache file is rewritten at the
    end of each call. Failed downloads are reported on standard output.

    Args
        pdbs (set): A set of PDB IDs to query.
        return_unmapped (bool): Return also a list of unmapped residues.
            In this case, a tuple is returned, its first element is a
            dict of interfaces, while its second element is a list of
            unmapped residues (normally empty, if all residues could be
            mapped between PDB and UniProt sequences).

    Returns
        A dict of dicts with interfaces. The upper level keys are PDB
        structure IDs, the lower level keys are tuples of UniProt IDs,
        the values are ``pypath.internals.intera.Interface`` objects.
    """

    # Keys are the bond type labels passed to `Interface.add_residues`,
    # values are the element names looked up in the PISA response.
    bond_types = {
        'hbonds': 'h-bonds',
        'sbridges': 'salt-bridges',
        'covbonds': 'cov-bonds',
        'ssbonds': 'ss-bonds',
    }

    interfaces = collections.defaultdict(dict)
    cachefile = os.path.join(cache.get_cachedir(), 'pisa.pickle')
    _, pdb_u = pdb_input.pdb_chains()

    if os.path.exists(cachefile):

        # Best effort: an unreadable cache is ignored and the data is
        # downloaded again.
        # NOTE(review): unpickling runs code embedded in the file; this
        # is acceptable only as long as the cache directory is trusted.
        try:

            with open(cachefile, 'rb') as fp:

                interfaces = pickle.load(fp)

        except Exception:

            pass

    errors = []
    unmapped_residues = []
    p = 5
    # Only the structures missing from the cache are downloaded.
    pdbs = sorted(set(pdbs) - set(interfaces))
    prg = progress.Progress(
        len(pdbs) / p,
        'Downloading data from PDBe PISA',
        1,
    )

    for i in xrange(0, len(pdbs), p):

        this_part = pdbs[i:i + p]
        url = urls.urls['pisa_interfaces']['url'] + ','.join(this_part)
        c = curl.Curl(url, cache = False)
        data = c.result

        if not data:

            errors.append('Could not download: \n\t\t%s' % url)
            # A failed batch still counts as a processed one.
            prg.step()
            continue

        soup = bs4.BeautifulSoup(data, 'html.parser')

        for pdb in soup.find_all('pdb_entry'):

            pdb_id = pdb.find('pdb_code').text.lower()
            interfaces[pdb_id] = {}
            resconv = pdb_utils.ResidueMapper()

            # Without chain to UniProt mapping no bond can be assigned
            # to a pair of proteins, the entry stays empty.
            if pdb_id not in pdb_u:

                continue

            chains = {
                chain: chain_data['uniprot']
                for chain, chain_data in iteritems(pdb_u[pdb_id])
            }

            for interface in pdb.find_all('interface'):

                for b, t in iteritems(bond_types):

                    bonds = interface.find(t)

                    if not bonds:

                        continue

                    for bond in pisa_bonds(bonds, chains):

                        uniprots = (bond.uniprot_1, bond.uniprot_2)

                        # The first bond of a protein pair creates the
                        # interface object with its overall parameters.
                        if uniprots not in interfaces[pdb_id]:

                            css = _pisa_float(interface, 'css')
                            area = _pisa_float(interface, 'int_area')
                            solv_en = _pisa_float(interface, 'int_solv_en')
                            stab_en = _pisa_float(interface, 'stab_en')

                            interfaces[pdb_id][uniprots] = intera.Interface(
                                uniprots[0],
                                uniprots[1],
                                source = 'PISA',
                                pdb = pdb_id,
                                css = css,
                                solv_en = solv_en,
                                area = area,
                                stab_en = stab_en,
                            )

                        res1 = resconv.get_residue(pdb_id, bond.seqnum_1)
                        res2 = resconv.get_residue(pdb_id, bond.seqnum_2)

                        if (
                            res1 and
                            res2 and
                            res1.uniprot == uniprots[0] and
                            res2.uniprot == uniprots[1]
                        ):

                            interfaces[pdb_id][uniprots].add_residues(
                                (res1.resnum, bond.residue_1, uniprots[0]),
                                (res2.resnum, bond.residue_2, uniprots[1]),
                                typ = b,
                            )

                        else:

                            unmapped_residues.append(
                                (
                                    pdb_id,
                                    bond.seqnum_1,
                                    bond.seqnum_2,
                                    uniprots[0],
                                    uniprots[1],
                                )
                            )

        prg.step()

    prg.terminate()

    with open(cachefile, 'wb') as fp:

        pickle.dump(interfaces, fp, 2)

    interfaces = dict(interfaces)

    if errors:

        sys.stdout.write(
            '\t:: Failed to download %u files of total %u:\n\n' % (
                len(errors),
                len(pdbs),
            )
        )

        for e in errors:

            sys.stdout.write('\t' + e + '\n')

        sys.stdout.flush()

    if return_unmapped:

        return interfaces, unmapped_residues

    return interfaces