Source code for pypath.utils.residues

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import re
try:
    import urllib2
except:
    import urllib.request as urllib2

import gzip
import bs4
try:
    from cStringIO import StringIO
except:
    try:
        from StringIO import StringIO
    except:
        from io import StringIO


class ResidueMapper(object):
    """
    Stores and serves the PDB --> UniProt residue level mapping.

    Mappings are downloaded on demand, one PDB entry at a time, and kept
    in ``self.mappers`` for further use. The class converts PDB residue
    numbers to the corresponding UniProt ones and the other way around.
    """

    # Python 2 has a separate ``unicode`` text type; on Python 3 the name
    # does not exist, so fall back to ``str`` only.
    try:
        _string_types = (str, unicode)  # noqa: F821
    except NameError:
        _string_types = (str,)

    # Matches every run of characters that is not a digit, a dot or a minus
    # sign; used to strip such characters before integer conversion.
    _non_digit = re.compile(r'[^\d.-]+')

    def __init__(self):
        """
        Set up the download URLs and empty caches. Performs no I/O.
        """

        # URL template; ``%s`` is replaced by the lower case PDB ID.
        self.url = 'http://pdb.org/pdb/rest/das/pdb_uniprot_mapping/alignment?query=%s'
        # Gzipped, tab separated table of PDB chain -- UniProt pairs.
        self.pdb_lst = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/'\
            'pdb_chain_uniprot.tsv.gz'
        self.uniprot_pdb = None
        self.clean()
        # PDB IDs whose mapping could not be downloaded.
        self.download_errors = []

    @classmethod
    def _to_int(cls, value):
        """
        Convert ``value`` to int after removing every character that is
        not a digit, a dot or a minus sign. Raises ``ValueError`` if the
        remainder is not a valid integer literal.
        """

        return int(cls._non_digit.sub('', value))

    @staticmethod
    def _find_segment(segments, resnum, start_key):
        """
        Find the segment which covers residue ``resnum``.

        :param dict segments: Segment dicts keyed by their end position.
        :param int resnum: The residue number to look up.
        :param str start_key: Key of the start position within a segment.

        :return: The segment with the smallest end position not smaller
            than ``resnum``, provided that its start is not greater than
            ``resnum``; otherwise ``None``.
        """

        candidate_ends = [end for end in segments if end >= resnum]

        if not candidate_ends:
            return None

        segment = segments[min(candidate_ends)]

        return segment if segment[start_key] <= resnum else None

    def load_mapping(self, pdb):
        """
        Download and store the residue mapping of one PDB entry.

        The download is attempted up to 5 times. If all attempts fail, the
        PDB ID is appended to ``self.download_errors`` and nothing is added
        to ``self.mappers``. On success ``self.mappers[pdb]`` will contain,
        in the same dict, one entry for each chain (segments keyed by PDB
        end position) and one for each UniProt ID (chains, then segments
        keyed by UniProt end position).

        :param str pdb: PDB ID; converted to lower case.
        """

        pdb = pdb.lower()
        url = self.url % pdb
        content = None

        for _ in range(5):
            try:
                response = urllib2.urlopen(url, timeout=60)
                try:
                    content = response.read()
                finally:
                    response.close()
                break
            except Exception:
                continue

        if content is None:
            # Without this return the parser below would be called on
            # ``None`` and raise.
            self.download_errors.append(pdb)
            return

        mapper = {}
        soup = bs4.BeautifulSoup(content)

        for block in soup.find_all('block'):
            seg = block.find_all('segment')
            chain = seg[0]['intobjectid'].split('.')[1]
            uniprot = seg[1]['intobjectid']
            pdbstart = self._to_int(seg[0]['start'])
            pdbend = self._to_int(seg[0]['end'])
            uniprotstart = self._to_int(seg[1]['start'])
            uniprotend = self._to_int(seg[1]['end'])

            mapper.setdefault(chain, {})[pdbend] = {
                'uniprot': uniprot,
                'pdbstart': pdbstart,
                'uniprotstart': uniprotstart,
                'uniprotend': uniprotend
            }

            by_chain = mapper.setdefault(uniprot, {})
            by_chain.setdefault(chain, {})[uniprotend] = {
                'pdbstart': pdbstart,
                'pdbend': pdbend,
                'uniprotstart': uniprotstart
            }

        self.mappers[pdb] = mapper

    def chains(self, chains):
        """
        Normalise a chain selection.

        :param chains: A single chain ID as string, a list of chain IDs,
            or anything else (e.g. ``None``), which is returned unchanged.

        :return: A string is wrapped into a list; lists are returned with
            duplicates removed (the order is not preserved).
        """

        if isinstance(chains, self._string_types):
            chains = [chains]

        if isinstance(chains, list):
            chains = list(set(chains))

        return chains

    def pdb2uniprot(self, pdb, resnum, chains=None):
        """
        Translate a PDB residue number to UniProt residue numbers.

        :param str pdb: PDB ID; converted to lower case.
        :param int resnum: Residue number in the PDB structure.
        :param chains: Chain ID or list of chain IDs; ``None`` means all.

        :return: Dict keyed by chain ID, each value is a dict with the
            keys ``resnum`` (the UniProt residue number), ``offset`` and
            ``uniprot``. Empty if the residue is not covered by any
            segment or the mapping is not available.
        """

        chains = self.chains(chains)
        results = {}
        pdb = pdb.lower()

        if pdb not in self.mappers:
            self.load_mapping(pdb)

        for ch, segments in self.mappers.get(pdb, {}).items():
            # The mapper of a PDB entry holds chain and UniProt keys in the
            # same dict; only single character keys are handled as chains.
            if len(ch) != 1 or (chains is not None and ch not in chains):
                continue

            seg = self._find_segment(segments, resnum, 'pdbstart')

            if seg is None:
                continue

            offset = seg['uniprotstart'] - seg['pdbstart']
            results[ch] = {
                'resnum': resnum + offset,
                'offset': offset,
                'uniprot': seg['uniprot']
            }

        return results

    def uniprot2pdb(self, uniprot, resnum, chains=None, pdbs=None):
        """
        Translate a UniProt residue number to PDB residue numbers.

        :param str uniprot: UniProt ID.
        :param int resnum: Residue number in the UniProt sequence.
        :param chains: Chain ID or list of chain IDs; ``None`` means all.
        :param pdbs: PDB ID or list of PDB IDs to look at; if ``None``,
            all PDB entries listed for this UniProt ID in
            ``self.uniprot_pdb`` are used.

        :return: Dict of dicts keyed by PDB ID (as listed or as given by
            the caller) and chain ID; the innermost dicts have the keys
            ``resnum`` (the PDB residue number) and ``offset``.
        """

        chains = self.chains(chains)

        if self.uniprot_pdb is None:
            self.get_pdb_chains()

        results = {}

        # One UniProt ID can occur in several PDB entries, so first we
        # find out which PDB entries we should look at:
        if pdbs is None:
            pdbs = [
                record['pdb']
                for record in self.uniprot_pdb.get(uniprot, [])
            ]
        elif isinstance(pdbs, self._string_types):
            pdbs = [pdbs]

        pdbs = list(set(pdbs))

        # Now find the residue number in each of the PDB entries:
        for pdb in pdbs:
            # ``load_mapping`` stores the mappings under lower case keys,
            # hence the lookup must use the lower case ID as well.
            pdb_key = pdb.lower()

            if pdb_key not in self.mappers:
                self.load_mapping(pdb_key)

            by_chain = self.mappers.get(pdb_key, {}).get(uniprot)

            if not by_chain:
                continue

            for ch, segments in by_chain.items():
                if chains is not None and ch not in chains:
                    continue

                seg = self._find_segment(segments, resnum, 'uniprotstart')

                if seg is None:
                    continue

                offset = seg['pdbstart'] - seg['uniprotstart']
                results.setdefault(pdb, {})[ch] = {
                    'resnum': resnum + offset,
                    'offset': offset
                }

        return results

    def get_residue(self, ac, resnum, chains=None, pdbs=None):
        """
        Translate a residue number in either direction.

        If ``ac`` is 4 characters long (after stripping whitespace) it is
        handled as a PDB ID and ``pdb2uniprot`` is called; otherwise it is
        handled as a UniProt ID and ``uniprot2pdb`` is called.
        """

        if len(ac.strip()) == 4:
            return self.pdb2uniprot(ac, resnum, chains)

        return self.uniprot2pdb(ac, resnum, chains, pdbs)

    def clean(self):
        """
        Removes cached mappings, freeing up memory.
        """

        self.mappers = {}
        self.uniprot_pdb = None
        self.pdb_uniprot = None

    def get_pdb_chains(self):
        """
        Download the PDB chain -- UniProt table from ``self.pdb_lst``.

        Populates ``self.pdb_uniprot`` (dict of dicts keyed by PDB ID and
        chain ID) and ``self.uniprot_pdb`` (dict of lists of records keyed
        by UniProt ID). The ``offset`` of a record is the difference of
        the UniProt and PDB start positions if the PDB and UniProt ranges
        are of equal length, otherwise ``None``. The attributes are
        assigned only after the whole table has been processed.
        """

        # Local import: the module level imports provide only ``StringIO``,
        # which can not hold the binary gzip data on Python 3.
        from io import BytesIO

        response = urllib2.urlopen(self.pdb_lst)

        try:
            raw = response.read()
        finally:
            response.close()

        content = gzip.GzipFile(fileobj=BytesIO(raw), mode='rb').read()

        if not isinstance(content, str):
            # Python 3: the decompressed data is bytes.
            content = content.decode('utf-8')

        # The first two lines are not processed as data rows (presumably
        # a comment line and the column header -- confirm against the file).
        lines = content.replace('\r', '').split('\n')[2:]

        fields = (
            'chain_beg',
            'chain_end',
            'pdb_beg',
            'pdb_end',
            'uniprot_beg',
            'uniprot_end',
        )
        pdb_uniprot = {}
        uniprot_pdb = {}

        for line in lines:
            cols = line.split('\t')

            if len(cols) <= 8:
                continue

            pdb, chain, uniprot = cols[0], cols[1], cols[2]
            numbers = dict(
                (name, self._to_int(value))
                for name, value in zip(fields, cols[3:9])
            )

            pdb_length = numbers['pdb_end'] - numbers['pdb_beg']
            uniprot_length = numbers['uniprot_end'] - numbers['uniprot_beg']
            offset = (
                numbers['uniprot_beg'] - numbers['pdb_beg']
                if pdb_length == uniprot_length else
                None
            )

            pdb_uniprot.setdefault(pdb, {})[chain] = dict(
                numbers,
                uniprot=uniprot,
                offset=offset
            )
            uniprot_pdb.setdefault(uniprot, []).append(dict(
                numbers,
                pdb=pdb,
                chain=chain,
                offset=offset
            ))

        self.pdb_uniprot = pdb_uniprot
        self.uniprot_pdb = uniprot_pdb