#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

"""
This module provides classes to represent and handle
structural details of protein interactions,
i.e. residues, post-translational modifications,
short motifs, domains, domain-motifs and
domain-motif interactions, binding interfaces.
"""

from future.utils import iteritems
from past.builtins import xrange, range, reduce

import re
import sys
import importlib as imp
import collections
import itertools

# from pypath:
import pypath.share.common as common
import pypath_common._constants as _const
import pypath.utils.mapping as mapping
import pypath.core.evidence as evidence
import pypath.core.entity as entity
import pypath.utils.taxonomy as taxonomy

__all__ = [
    'Residue',
    'Ptm',
    'Motif',
    'Domain',
    'DomainDomain',
    'DomainMotif',
    'Interface',
]

# Python 2/3 compatibility: make sure the name `unicode` exists.
# Probing the name itself is used instead of `'unicode' in __builtins__`
# because `__builtins__` may be either a dict or a module depending on how
# the module is loaded, and a membership test on a module raises TypeError.
try:
    unicode
except NameError:
    unicode = str

# NOTE(review): not used in the visible part of this file; presumably the
# separator between component identifiers in complex names -- confirm.
COMPLEX_SEP = '_'
def __hash__(self):
    """
    Hashes the residue by its number, name and protein, the same
    fields that ``__eq__`` compares.
    """

    return hash((self.number, self.name, self.protein))


def __eq__(self, other):
    """
    Two residues are equal if their protein, number and name all match.

    Objects which do not carry these three attributes (e.g. ``None``)
    compare unequal instead of raising ``AttributeError``.
    """

    if not all(
        hasattr(other, attr)
        for attr in ('protein', 'number', 'name')
    ):

        return False

    return (
        self.protein == other.protein and
        self.number == other.number and
        self.name == other.name
    )


def __ne__(self, other):
    """
    Negation of ``__eq__``.
    """

    return not self.__eq__(other)


def __str__(self):
    """
    Human readable, one line description of the residue; marks
    mutated residues.
    """

    mutated_label = ' (mutated)' if self.mutated else ''

    return 'Residue %s-%u in protein %s-%u%s\n' % (
        self.name,
        self.number,
        self.protein.identifier,
        self.isoform,
        mutated_label,
    )


def __repr__(self):
    """
    Compact representation: protein label, isoform, residue name
    and number.
    """

    return '<Residue %s-%u:%s%u>' % (
        self.protein.label,
        self.isoform,
        self.name,
        self.number,
    )


def serialize(self):
    """
    Returns the residue as a string of its name followed by its number.
    """

    return '%s%u' % (self.name, self.number)


def in_isoform(self, isoform, seq=None):
    """
    Returns a new ``Residue`` for another isoform of the same protein if
    the sequence has that isoform and carries the same residue name at
    the same position; otherwise returns ``None``.

    isoform :
        The isoform to look up; passed to ``seq.has_isoform`` and
        ``seq.get``.
    seq :
        Sequence object; if not provided (or falsy), ``self.seq`` is used.
    """

    seq = seq or self.seq

    if seq and seq.has_isoform(isoform):

        if seq.get(self.number, isoform=isoform) == self.name:

            return Residue(
                number=self.number,
                name=self.name,
                protein=self.protein,
                id_type=self.id_type,
                isoform=isoform,
                mutated=self.mutated,
            )

    return None
def __init__(
        self,
        domain_a,
        domain_b,
        pdbs=None,
        sources=None,
        refs=None,
        contact_residues=None,
    ):
    """
    Represents an interaction between two domains.

    domain_a, domain_b :
        The two interacting domains; stored in this order in the
        ``domains`` list.
    pdbs :
        PDB structure(s), passed to ``add_pdbs``.
    sources :
        Data source(s), passed to ``add_sources``.
    refs :
        Reference(s), passed to ``add_refs``.
    contact_residues :
        Stored as provided, see the comment at the assignment.
    """

    self.domains = [domain_a, domain_b]

    # start with empty annotation sets, then fill them by the adder methods
    self.sources = set()
    self.refs = set()
    self.pdbs = set()

    self.add_sources(sources)
    self.add_refs(refs)
    self.add_pdbs(pdbs)

    # This can be found from 3DComplexes; floating point
    # numbers show the number of residues in contact. Other
    # two numbers in the tuple are the length of domain sequences.
    self.contact_residues = contact_residues
def __hash__(self):
    """
    Hashes the interaction by its two domains.

    The domains are read from ``self.domains``: this is the only place
    where the constructor stores them, there are no ``domain_a`` and
    ``domain_b`` attributes.
    """

    return hash((self.domains[0], self.domains[1]))


def __eq__(self, other):
    """
    Two instances are equal if all their attributes are equal.

    Objects without a ``__dict__`` (e.g. ``None``) compare unequal
    instead of raising ``AttributeError``.
    """

    return self.__dict__ == getattr(other, '__dict__', None)


def __ne__(self, other):
    """
    Negation of ``__eq__``.
    """

    return not self.__eq__(other)


def __contains__(self, other):
    """
    Tells if ``other`` is contained in any of the two domains
    (delegates to the ``in`` operator of the domain objects).
    """

    return other in self.domains[0] or other in self.domains[1]


def add_sources(self, source):
    """
    Adds one or more data sources.

    source :
        ``None`` (nothing happens), a single source name (any of the types
        in ``_const.CHAR_TYPES``) or an iterable of sources.
    """

    if source is None:

        return None

    elif type(source) in _const.CHAR_TYPES:

        self._add_source(source)

    else:

        for s in source:

            self._add_source(s)


def _add_source(self, source):
    """
    Adds a single data source to the ``sources`` set.
    """

    self.sources.add(source)


def add_refs(self, refs):
    """
    Adds references to the ``refs`` set by ``common.add_to_set``.
    """

    self.refs = common.add_to_set(self.refs, refs)


def add_pdbs(self, pdbs):
    """
    Adds PDB structures to the ``pdbs`` set by ``common.add_to_set``.
    """

    self.pdbs = common.add_to_set(self.pdbs, pdbs)


def serialize(self):
    """
    Returns the interaction as a pipe separated string of five fields:

        domain1|domain2|sources|references|pdb

    The last three fields are comma separated lists.
    """

    return '|'.join([
        self.domains[0].serialize(),
        self.domains[1].serialize(),
        ','.join(self.sources),
        ','.join(self.refs),
        ','.join(self.pdbs),
    ])


def __str__(self):
    """
    Multiline, human readable description of the interaction.
    """

    return (
        'Domain-domain interaction:\n'
        ' %s%s\n'
        ' Data sources: %s\n'
        ' References: %s\n'
        ' 3D structures: %s\n' % (
            self.domains[0].__str__(),
            self.domains[1].__str__(),
            ', '.join(self.sources),
            ', '.join(self.refs),
            ','.join(self.pdbs),
        )
    )
def key(self):
    """
    Returns a unique key of the relationship: a tuple of the protein of
    the domain, the protein of the PTM, the name and number of the
    modified residue and the modification type.
    """

    site = self.ptm
    residue = site.residue

    return (
        self.domain.protein,
        site.protein,
        residue.name,
        residue.number,
        site.typ,
    )
def get_line(self, resources_only_primary=False):
    """
    Returns a list intended to be a row in a data frame of
    enzyme-substrate relationships.

    Elements of the list:

    - enzyme
    - enzyme_genesymbol
    - substrate
    - substrate_genesymbol
    - isoforms
    - residue_type
    - residue_offset
    - modification
    - sources
    - references
    - curation_effort

    resources_only_primary : bool
        Passed to ``self.resources`` as ``only_primary``.
    """

    enzyme = self.domain.protein
    site = self.ptm
    substrate = site.protein

    # isoforms and resources are sorted and joined by semicolons
    isoforms = ';'.join('%u' % iso for iso in sorted(site.isoforms))
    resources = ';'.join(
        sorted(self.resources(only_primary=resources_only_primary))
    )

    return [
        enzyme.identifier,
        enzyme.label,
        substrate.identifier,
        substrate.label,
        isoforms,
        site.residue.name,
        '%u' % site.residue.number,
        site.typ,
        resources,
        self.references_by_resource_str(),
        self.evidences.count_curation_effort(),
    ]
def __init__(
        self,
        components,
        ncbi_tax_id=9606,
        name=None,
        ids=None,
        sources=None,
        interactions=None,
        references=None,
        proteins=None,
        attrs=None,
    ):
    """
    Represents a molecular complex.

    components : list,dict
        Either a list of identifiers or a dict with identifiers as keys
        and stoichiometric coefficients as values. List of identifiers
        also assumed to represent stoichiometry by repetition
        of identifiers.
    ncbi_tax_id : int
        NCBI taxonomy identifier of the complex. It implies all members
        of the complex belong to the same organism. Support for multi-
        organism complexes will be implemented in the future.
    name : str
        A custom name or identifier of the complex.
    ids : dict
        Identifiers. If ``sources`` is a set, list or tuple it should be
        a dict with database names as keys and set of identifiers as
        values. If ``sources`` is a string, it can be a set of
        identifiers or a single identifier.
    sources : set,str
        Database(s) the complex has been defined in.
    interactions : list,dict
        Interactions between the components of the complex. Either
        a list of tuples of component IDs or a dict with tuples as
        keys and custom interaction properties as values.
        Stored as provided.
    references :
        Reference(s); converted to a set by ``common.to_set``.
    proteins : list,dict
        Synonym for `components`, kept for compatibility. Used only
        if ``components`` is empty or ``None``.
    attrs : dict
        Custom attributes; copied into the ``attrs`` dict of the
        instance. Values other than dict are ignored.
    """

    components = components or proteins

    if isinstance(components, dict):

        # a dict already carries the stoichiometry
        self.components = components

    else:

        # repetitions in the list become stoichiometric coefficients
        self.components = dict(collections.Counter(components))

    # `proteins` is an alias of `components`
    self.proteins = self.components
    self.name = name

    self.ids = collections.defaultdict(set)
    self.add_ids(ids, source=sources)

    self.sources = common.to_set(sources)
    self.references = common.to_set(references)
    self.ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(ncbi_tax_id)

    self.attrs = {}

    if isinstance(attrs, dict):

        self.attrs.update(attrs)

    self.interactions = interactions
def merge(self, other):
    """
    Adds the annotations (sources, references, ids, attrs) of the other
    ``Complex`` instance to this one. If the other ``Complex`` has
    different components (i.e. ``self != other``) it does nothing.

    If this complex has no stoichiometry information (all coefficients
    are 1) but the other has, the components of the other are adopted.

    Attributes in ``attrs`` are merged key by key: new keys are added,
    existing dict or set values are updated from the other, any other
    existing value is left unchanged.

    other :
        Another ``Complex`` instance.
    """

    if self != other:

        return

    if (
        set(self.components.values()) == {1} and
        set(other.components.values()) != {1}
    ):

        # this complex has no stoichiometry information
        # but the other has
        self.components = other.components
        # `proteins` is an alias of `components` (see the constructor),
        # it must follow the replacement, otherwise it would keep
        # pointing to the outdated dict
        self.proteins = self.components

    self.sources.update(other.sources)
    self.references.update(other.references)
    self.add_ids(other.ids)

    for key, value in other.attrs.items():

        if key not in self.attrs:

            self.attrs[key] = value

        elif isinstance(self.attrs[key], (dict, set)):

            self.attrs[key].update(value)
def __init__(
        self,
        id_a,
        id_b,
        source,
        id_type='uniprot',
        pdb=None,
        css=None,
        stab_en=None,
        solv_en=None,
        area=None,
        isoform_a=1,
        isoform_b=1,
    ):
    """
    This class is to store residue level information of
    protein-protein interfaces.

    id_a, id_b :
        Identifiers of the two partners; used as keys of the
        per bond type residue lists.
    source :
        The data source.
    id_type : str
        Type of the identifiers.
    pdb, css, stab_en, solv_en, area :
        Stored as provided.
    isoform_a, isoform_b : int,str
        Isoform numbers; values other than ``int`` are cleaned by
        the ``non_digit`` regex and converted to ``int``.
    """

    self.source = source

    # NOTE(review): `non_digit` is expected to be a module level compiled
    # regex; its definition is not visible in this part of the file --
    # confirm that it exists.
    if type(isoform_a) is int:

        self.isoform_a = isoform_a

    else:

        self.isoform_a = int(non_digit.sub('', isoform_a))

    if type(isoform_b) is int:

        self.isoform_b = isoform_b

    else:

        self.isoform_b = int(non_digit.sub('', isoform_b))

    self.pdb = pdb
    self.id_a = id_a
    self.id_b = id_b
    self.id_type = id_type

    self.types = [
        'undefined',
        'hbonds',
        'sbridges',
        'ssbonds',
        'covbonds',
    ]

    # one attribute for each bond type, each holding a list of
    # residues for both partners
    for bond_type in self.types:

        self.__dict__[bond_type] = {id_a: [], id_b: []}

    self.area = area
    self.stab_en = stab_en
    self.solv_en = solv_en
    self.css = css
def add_residues(self, res_a, res_b, typ='undefined'):
    """
    Adds one pair of residues of type `typ`.

    res_a, res_b : tuple
        Tuples of residue number in sequence, residue type and,
        optionally, the protein the residue belongs to,
        e.g. ``(124, 'S')`` or ``(124, 'S', 'P00533')`` (Serine #124).
        If the third element is missing, ``id_a`` is used for `res_a`
        and ``id_b`` for `res_b`.
    typ : str
        One of the bond types listed in ``self.types``
        (undefined, hbonds, sbridges, ssbonds or covbonds).

    On wrong parameters an error message is written to stdout and
    nothing is added.
    """

    # `type(u'')` is `unicode` in Python 2 and `str` in Python 3
    text_types = (str, type(u''))

    valid = (
        # only the bond types are valid, not any attribute of the instance
        typ in self.types and
        all(
            isinstance(res, tuple) and
            len(res) >= 2 and
            isinstance(res[0], int) and
            isinstance(res[1], text_types)
            for res in (res_a, res_b)
        )
    )

    if not valid:

        sys.stdout.write(
            '\tWrong parameters for Interface.add_residues()\n'
        )
        return

    for partner, res in ((self.id_a, res_a), (self.id_b, res_b)):

        # the residue belongs to the partner under which it is stored,
        # unless the protein is explicitly provided
        protein = res[2] if len(res) > 2 else partner

        self.__dict__[typ][partner].append(
            Residue(res[0], res[1], protein, self.id_type)
        )
def numof_residues(self):
    """
    Returns the number of residue pairs by bond type, as a dict with
    the bond types as keys. The residues of partner ``id_a`` are
    counted.
    """

    return {
        bond_type: len(self.__dict__[bond_type][self.id_a])
        for bond_type in self.types
    }
def bond_types(self):
    """
    Returns the list of bond types present in this interface, i.e.
    those with at least one residue recorded for partner ``id_a``.
    """

    return [
        bond_type
        for bond_type in self.types
        if len(self.__dict__[bond_type][self.id_a]) > 0
    ]
def get_bonds(self, typ=None, mode=None):
    """
    Gives a generator to iterate through the bonds in this interface.

    typ : str,list
        One bond type or a list of bond types. If no type given,
        bonds of all types are returned. Types which are not attributes
        of the instance are skipped.
    mode : str
        If ``'dict'``, each bond is yielded as a dict with the residue
        objects under the partner identifiers and the bond type under
        the key ``'type'``. Otherwise tuples are yielded:
        (id_a, serialized residue a, id_b, serialized residue b, type).
    """

    if typ is None:

        typ = self.types

    if type(typ) is str:

        typ = [typ]

    for bond_type in typ:

        if bond_type not in self.__dict__:

            continue

        residues = self.__dict__[bond_type]

        # the residues of the two partners are paired by their index
        for idx, res_a in enumerate(residues[self.id_a]):

            res_b = residues[self.id_b][idx]

            if mode == 'dict':

                yield {
                    self.id_a: res_a,
                    self.id_b: res_b,
                    'type': bond_type,
                }

            else:

                yield (
                    self.id_a,
                    res_a.serialize(),
                    self.id_b,
                    res_b.serialize(),
                    bond_type,
                )
def serialize(self):
    """
    Returns the interface as a colon separated string:

        id_a-isoform_a:id_b-isoform_b:source:pdb:bonds

    where ``bonds`` is a colon separated list with one item for each
    bond type which has residues on both sides, in the format
    ``type:residues_a+residues_b``; the residues are comma separated
    and serialized by their own ``serialize`` method.
    """

    res = []

    for bond_type in self.types:

        side_a = self.__dict__[bond_type][self.id_a]
        side_b = self.__dict__[bond_type][self.id_b]

        if side_a and side_b:

            res.append(
                '%s:%s+%s' % (
                    bond_type,
                    # the lists have no `serialize` method,
                    # each residue has to be serialized one by one
                    ','.join(r.serialize() for r in side_a),
                    ','.join(r.serialize() for r in side_b),
                )
            )

    return (
        '%s-%u:%s-%u:%s:%s:%s' % (
            self.id_a,
            self.isoform_a,
            self.id_b,
            self.isoform_b,
            self.source,
            self.pdb,
            ':'.join(res),
        )
    )


def __str__(self):
    """
    Multiline, human readable summary of the interface: partners,
    PDB structure, data source, number of bonds by type and the
    energy, area and significance values (``n/a`` if not available).
    """

    nbonds = self.numof_residues()

    return (
        'Molecular interface between %s and %s,\n'
        'as observed in PDB structure %s\n\n'
        ' Data source: %s\n'
        ' Number of residues in contact: %u\n'
        ' Hydrogene bonds: %u\n'
        ' Covalent bonds: %u\n'
        ' Saltbridges: %u\n'
        ' S-S bonds: %u\n'
        ' Stable energy: %s\n'
        ' Solvation energy: %s\n'
        ' Surface area: %s\n'
        ' Complexation significance score: %s\n' % (
            self.id_a,
            self.id_b,
            self.pdb,
            self.source,
            sum(nbonds.values()),
            nbonds['hbonds'],
            nbonds['covbonds'],
            nbonds['sbridges'],
            nbonds['ssbonds'],
            'n/a' if self.stab_en is None else str(self.stab_en),
            'n/a' if self.solv_en is None else str(self.solv_en),
            'n/a' if self.area is None else str(self.area),
            'n/a' if self.css is None else str(self.css),
        )
    )


def __repr__(self):
    """
    Short representation: the two partners and the total number
    of bonds.
    """

    nbonds = self.numof_residues()

    return (
        'Interface [%s-%s, %u bonds]' % (
            self.id_a,
            self.id_b,
            sum(nbonds.values()),
        )
    )