#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#fromfuture.utilsimportiteritemsimportreimportcollectionsimportpypath.resources.urlsasurlsimportpypath.share.curlascurlimportpypath.internals.interaasinteraimportpypath.utils.reflistsasreflists
def pdb_uniprot():
    """
    Cross-references between UniProt accessions and PDB structures.

    Downloads the table referenced by ``urls.urls['uniprot_pdb']['url']``
    and parses it line by line. A line whose first field is a four
    character PDB code opens a new entry and provides the method and the
    resolution; any following line without such a code is processed as a
    continuation of the most recent entry, contributing further UniProt
    accessions to it.

    Returns a tuple of two dicts:

    * UniProt to PDB: UniProt accessions as keys, sets of tuples as values;
      each tuple is ``(pdb_id, method, resolution)`` where the PDB ID is
      lower case and the resolution is a float, or None if the table
      contains ``-`` in that field.
    * PDB to UniProt: lower case PDB IDs as keys, sets of UniProt
      accessions as values.

    If the download yields no data, ``(None, None)`` is returned.
    """

    download = curl.Curl(urls.urls['uniprot_pdb']['url'], silent=False)
    content = download.result

    if content is None:

        return None, None

    uniprot_to_pdb = collections.defaultdict(set)
    pdb_to_uniprot = collections.defaultdict(set)
    re_pdb_code = re.compile(r'[0-9A-Z]{4}')
    current_pdb = None

    for line in content.split('\n'):

        # normalise spacing around parentheses and commas first, so that
        # only the column separators (2+ spaces) remain as wide gaps
        line = re.sub(r'[ ]*\(', '(', line)
        line = re.sub('[ ]+,[ ]+', ',', line)
        fields = re.split('[ ]{2,}', line)
        first = fields[0]

        if len(first) == 4 and re_pdb_code.match(first):

            # a new PDB entry starts here; method and resolution remain
            # in effect for the continuation lines that follow
            current_pdb = first.lower()

            if fields[2] == '-':

                resolution = None

            else:

                resolution = float(fields[2].replace(' A', ''))

            method = fields[1]

        if current_pdb is None or len(fields) <= 1:

            # either still in the preamble or nothing to parse in the line
            continue

        # lines with at least four fields carry the accessions in the
        # fourth one, shorter (continuation) lines in the second one
        raw_accessions = fields[3] if len(fields) >= 4 else fields[1]
        accessions = set()

        for item in raw_accessions.split(','):

            if '(' in item:

                # the accession is the part enclosed in parentheses
                accessions.add(item.split('(')[1].replace(')', ''))

        pdb_to_uniprot[current_pdb].update(accessions)

        for accession in accessions:

            uniprot_to_pdb[accession].add((current_pdb, method, resolution))

    return dict(uniprot_to_pdb), dict(pdb_to_uniprot)
def pdb_chains():
    """
    Chain level mapping between PDB structures and UniProt sequences.

    Downloads the tab separated table referenced by
    ``urls.urls['pdb_chains']['url']``, drops its first two lines and
    processes each line that has at least nine fields.

    Returns a tuple of two dicts:

    * The first has UniProt IDs as keys and lists of dicts as values. Each
      dict describes one chain mapping with the keys ``pdb``, ``chain``,
      ``chain_beg``, ``chain_end``, ``pdb_beg``, ``pdb_end``,
      ``uniprot_beg``, ``uniprot_end`` and ``offset``.
    * The second has PDB IDs as keys and dicts as values; these are keyed
      by the chain identifier and hold dicts similar to the above, with
      ``uniprot`` in place of ``pdb`` and ``chain``. A later line for the
      same structure and chain replaces the earlier one here.

    The ``offset`` is ``uniprot_beg - pdb_beg`` if all four PDB and
    UniProt boundaries are available and the two ranges have the same
    length, otherwise None.

    If the download yields no data, ``(None, None)`` is returned.
    """

    # everything except digits, dots and dashes is stripped from the
    # coordinate fields before the integer conversion
    non_digit = re.compile(r'[^\d.-]+')

    def to_int(value):

        # the literal string 'None' stands for a missing coordinate
        return None if value == 'None' else int(non_digit.sub('', value))

    coord_keys = (
        'chain_beg',
        'chain_end',
        'pdb_beg',
        'pdb_end',
        'uniprot_beg',
        'uniprot_end',
    )

    download = curl.Curl(urls.urls['pdb_chains']['url'], silent=False)
    content = download.result

    if content is None:

        return None, None

    rows = content.replace('\r', '').split('\n')
    # the first two lines are not data rows
    del rows[0]
    del rows[0]

    by_pdb = {}
    by_uniprot = {}

    for row in rows:

        fields = row.split('\t')

        if len(fields) <= 8:

            continue

        pdb_id, chain_id, uniprot = fields[0], fields[1], fields[2]
        coords = [to_int(field) for field in fields[3:9]]

        chain_record = {'uniprot': uniprot}
        chain_record.update(zip(coord_keys, coords))

        pdb_beg = chain_record['pdb_beg']
        pdb_end = chain_record['pdb_end']
        uniprot_beg = chain_record['uniprot_beg']
        uniprot_end = chain_record['uniprot_end']

        boundaries_known = not any(
            bound is None
            for bound in (pdb_end, pdb_beg, uniprot_beg, uniprot_end)
        )

        # an offset makes sense only if both ranges span the same length
        if boundaries_known and pdb_end - pdb_beg == uniprot_end - uniprot_beg:

            offset = uniprot_beg - pdb_beg

        else:

            offset = None

        chain_record['offset'] = offset
        by_pdb.setdefault(pdb_id, {})[chain_id] = chain_record

        uniprot_record = {'pdb': pdb_id, 'chain': chain_id}
        uniprot_record.update(zip(coord_keys, coords))
        uniprot_record['offset'] = offset
        by_uniprot.setdefault(uniprot, []).append(uniprot_record)

    return by_uniprot, by_pdb
def pdb_complexes(organism=None):
    """
    Extracts protein complex data from PDB.

    The chain level PDB-UniProt mapping is retrieved by ``pdb_chains``,
    and one ``pypath.internals.intera.Complex`` object is created from the
    UniProt IDs of the chains of each PDB structure. Structures with only
    one chain in the mapping are skipped. Complexes with identical string
    representation are merged by the ``+=`` operator.

    :param organism:
        Optional organism identifier. If provided, structures are dropped
        based on ``reflists.is_not(uniprots, 'uniprot', organism)``.

    :return:
        A dict with the string representations of the complexes as keys
        and ``pypath.internals.intera.Complex`` objects as values. An
        empty dict if the chain mapping could not be retrieved.
    """

    complexes = {}

    # only the PDB keyed mapping is needed here
    _, chains_by_pdb = pdb_chains()

    if chains_by_pdb is None:

        # `pdb_chains` returns (None, None) if the download gave no data;
        # iterating over None would raise an AttributeError
        return complexes

    for pdb_id, chains in iteritems(chains_by_pdb):

        uniprots = tuple(chain['uniprot'] for chain in chains.values())

        if len(uniprots) == 1:

            continue

        # if the organism set and any of the UniProt IDs does not
        # belong to this organism we drop the complex
        if organism and reflists.is_not(uniprots, 'uniprot', organism):

            continue

        cplex = intera.Complex(
            components=uniprots,
            sources='PDB',
            ids=pdb_id,
        )
        key = str(cplex)

        if key in complexes:

            complexes[key] += cplex

        else:

            complexes[key] = cplex

    return complexes