#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#fromfuture.utilsimportiteritemsfrompast.builtinsimportxrange,rangeimportsysimportimportlibasimpimportitertoolsimportcollectionsimportpickleimporttracebackimportpandasaspdimportpypath.share.commonascommonimportpypath_common._constantsas_constimportpypath.utils.mappingasmappingimportpypath.utils.orthologyasorthologyimportpypath.inputs.uniprotasuniprot_inputimportpypath.internals.interaasinteraimportpypath.share.progressasprogressimportpypath.share.sessionassession_modimportpypath.utils.taxonomyastaxonomyimportpypath.inputsasinputsimportpypath.core.evidenceasevidenceimportpypath.core.entityasentityimportpypath.resourcesasresources
def __init__(
        self,
        input_param=None,
        input_method=None,
        ncbi_tax_id=None,
        trace=False,
        id_type_enzyme=None,
        id_type_substrate=None,
        name=None,
        allow_mixed_organisms=None,
        organisms_supported=False,
        **kwargs
    ):
    """
    Processes enzyme-substrate interaction data from one database and
    provides a generator to iterate over these interactions.

    :param input_param:
        Resource definition; kept on the instance and consulted later
        when the records are processed.
    :param str input_method:
        Either a method name in the ``inputs`` module or a database
        name e.g. `PhosphoSite` or a callable which returns data in
        list of dicts format.
    :param int ncbi_tax_id:
        NCBI Taxonomy ID used at the database lookups.
    :param bool trace:
        Keep data about ambiguous ID mappings and PTM data in mismatch
        with UniProt sequences.
    :param str id_type_enzyme:
        The ID type of the enzyme in the database.
    :param str id_type_substrate:
        The ID type of the substrate in the database.
    :param str name:
        Name of the resource.
    :param bool allow_mixed_organisms:
        If true, the sequences of human, mouse and rat (9606, 10090,
        10116) are loaded in addition to `ncbi_tax_id`.
    :param bool organisms_supported:
        Kept on the instance; presumably tells whether the input method
        accepts an organism argument -- confirm at the caller.
    :param **kwargs:
        Args to be forwarded to the input method.
    """

    if not hasattr(self, '_logger'):

        session_mod.Logger.__init__(self, name='enz_sub')

    # parameters, stored as they were received
    self.input_param = input_param
    self.input_method = input_method
    self.ncbi_tax_id = ncbi_tax_id
    self.trace = trace
    self.id_type_enzyme = id_type_enzyme
    self.id_type_substrate = id_type_substrate
    self.name = name
    self.allow_mixed_organisms = allow_mixed_organisms
    self.organisms_supported = organisms_supported

    # containers filled during processing
    self.mammal_taxa = {9606, 10090, 10116}
    self.nomatch = []
    self.kin_ambig = {}
    self.sub_ambig = {}

    self.setup()

    orthology.SequenceContainer.__init__(self)
    self.load_seq(self.ncbi_tax_id)

    if self.allow_mixed_organisms:

        for taxon in self.mammal_taxa:

            self.load_seq(taxon=taxon)

    orthology.Proteomes.__init__(self)

    self.set_inputargs(**kwargs)
    self.load_enz_sub()
def set_method(self):
    """
    Selects the input method.

    A callable `input_method` is kept as it is. Anything else is looked
    up by `inputs.get_method`; if that yields nothing, a placeholder
    returning an empty list is used. If the instance has no name yet,
    the name of the selected method becomes the name.
    """

    def empty_input(*args, **kwargs):

        return []

    method = self.input_method

    if not hasattr(method, '__call__'):

        # attempting to look up the method in the inputs module
        method = inputs.get_method(method) or empty_input

    self.input_method = method
    self.name = self.name or method.__name__
def set_inputargs(self, **inputargs):
    """
    Stores the keyword arguments which will be passed to the input
    method when the data is loaded.

    :param **inputargs:
        Arguments for the input method; kept as a dict in
        `self.inputargs`, replacing any previous value.
    """

    self.inputargs = inputargs
def load_data(self):
    """
    Calls the input method with the stored arguments and keeps the
    result in `self.data`. The call and the number of records obtained
    are logged.
    """

    method = self.input_method
    method_label = '%s.%s' % (method.__module__, method.__name__)

    self._log(
        'Calling `%s` with arguments %s.' % (
            method_label,
            str(self.inputargs),
        )
    )

    self.data = method(**self.inputargs)

    n_records = len(self.data)

    self._log(
        'Loaded data by `%s`, resulted %u records.' % (
            method_label,
            n_records,
        )
    )
def _phosphosite_setup(self):
    """
    Resource specific setup for PhosphoSite: `strict` defaults to
    `False` and the organism is translated by `taxonomy.taxids`.
    """

    if 'strict' not in self.inputargs:

        self.inputargs['strict'] = False

    # `organism` is set by `_organism_setup` only if the resource
    # supports organisms, hence it might be missing here
    organism = self.inputargs.get('organism')

    if organism in taxonomy.taxids:

        self.inputargs['organism'] = taxonomy.taxids[organism]


def _phosphoelm_setup(self):
    """
    Resource specific setup for phospho.ELM: for organisms other than
    human `ltp_only` defaults to `False`.
    """

    if self.ncbi_tax_id != 9606 and 'ltp_only' not in self.inputargs:

        self.inputargs['ltp_only'] = False


def _setup(self):
    """
    Runs the organism setup and then the resource specific setup method
    (`_<lowercase name>_setup`) if such a method exists.
    """

    setupmethod = '_%s_setup' % self.name.lower()

    self._organism_setup()

    if hasattr(self, setupmethod):

        getattr(self, setupmethod)()


def _organism_setup(self):
    """
    If the resource supports organisms, translates the taxon by
    `taxonomy.taxa` and passes it to the input method as `organism`.
    Finally loads the proteome of the organism.
    """

    if self.organisms_supported:

        if self.ncbi_tax_id in taxonomy.taxa:

            self.ncbi_tax_id = taxonomy.taxa[self.ncbi_tax_id]

        self.inputargs['organism'] = self.ncbi_tax_id

    self.load_proteome(self.ncbi_tax_id, False)


def _process(self, p):
    """
    Processes one record and yields the enzyme-substrate interactions
    built from it as `intera.DomainMotif` objects.

    :param dict p:
        One record from the input method. The keys `kinase`,
        `substrate`, `resaa` and `resnum` are read; `instance`,
        `start`, `end`, `substrate_isoform`, `substrate_refseq`,
        `typ`, `databases` and `references` are used if present.
        `kinase` is converted to a list and `typ` defaults to
        `phosphorylation`, both in place.
    """

    # human leukocyte antigenes result an
    # extremely high number of combinations
    if (
        not p['kinase'] or (
            isinstance(p['substrate'], str) and
            p['substrate'].startswith('HLA')
        )
    ):

        return

    if not isinstance(p['kinase'], list):

        p['kinase'] = [p['kinase']]

    kinase_ups = mapping.map_names(
        p['kinase'],
        self.id_type_enzyme,
        'uniprot',
        ncbi_tax_id=self.ncbi_tax_id,
    )

    substrate_ups_all = set()

    for sub_id_type in self.id_type_substrate:

        if isinstance(sub_id_type, (list, tuple)):

            sub_id_type, sub_id_attr = sub_id_type

        else:

            sub_id_attr = 'substrate'

        substrate_ups_all.update(
            set(
                mapping.map_name(
                    p[sub_id_attr],
                    sub_id_type,
                    'uniprot',
                    self.ncbi_tax_id,
                )
            )
        )

    # looking up sequences in all isoforms:
    substrate_ups = []

    for s in substrate_ups_all:

        if 'substrate_isoform' in p and p['substrate_isoform']:

            substrate_ups.append((s, p['substrate_isoform']))

        else:

            se = self.get_seq(s)

            if se is None:

                continue

            for isof in se.isoforms():

                if 'instance' in p and p['instance'] is not None:

                    if se.match(
                        p['instance'],
                        p['start'],
                        p['end'],
                        isoform=isof,
                    ):

                        substrate_ups.append((s, isof))

                else:

                    if se.match(
                        p['resaa'],
                        p['resnum'],
                        isoform=isof,
                    ):

                        substrate_ups.append((s, isof))

    if self.trace:

        if p['substrate'] not in self.sub_ambig:

            self.sub_ambig[p['substrate']] = substrate_ups

        for k in p['kinase']:

            if k not in self.kin_ambig:

                self.kin_ambig[k] = kinase_ups

        # generating report on non matching substrates
        if not substrate_ups:

            start = p.get('start')
            end = p.get('end')

            # `substrate_ups_all` contains plain identifiers, not
            # (identifier, isoform) pairs, hence no indexing here;
            # no isoform matched, so the isoform field is `None`
            for s in substrate_ups_all:

                se = self.get_seq(s)

                if se is None:

                    continue

                self.nomatch.append(
                    (
                        s,
                        None,
                        (
                            p.get('substrate_refseq', ''),
                            s,
                            p.get('instance'),
                            (
                                se.get(start, end)
                                if start is not None and end is not None
                                else None
                            ),
                        ),
                    )
                )

    # building objects representing the enzyme-substrate interaction(s)
    if 'typ' not in p:

        p['typ'] = 'phosphorylation'

    _resources = tuple(
        (
            self.input_param.get_via(db_name)
            if hasattr(self.input_param, 'get_via')
            else db_name
        )
        for db_name in p.get('databases', ())
    )

    _resources += (
        (self.name,)
        if isinstance(self.input_param, str)
        else (self.input_param,)
    )

    # collecting the evidences
    evidences = evidence.Evidences(
        evidence.Evidence(
            resource=_res,
            references=p.get('references'),
        )
        for _res in _resources
    )

    for s in substrate_ups:

        # building the objects representing the substrate
        se = self.get_seq(s[0])

        if se is None:

            continue

        res = intera.Residue(
            p['resnum'],
            p['resaa'],
            s[0],
            isoform=s[1],
            ncbi_tax_id=self.ncbi_tax_id,
        )

        # the motif boundaries are kept in local variables: writing
        # them back to the record would make the next isoforms reuse
        # the region of the first one and would change how the record
        # is matched if it is processed again
        start = p.get('start')
        end = p.get('end')
        instance = p.get('instance')

        if instance is None:

            reg = se.get_region(
                p['resnum'],
                start,
                end,
                isoform=s[1],
            )

            if reg is not None:

                start, end, instance = reg

        mot = intera.Motif(
            s[0],
            start,
            end,
            instance=instance,
            isoform=s[1],
            ncbi_tax_id=self.ncbi_tax_id,
        )

        ptm = intera.Ptm(
            s[0],
            motif=mot,
            residue=res,
            typ=p['typ'],
            evidences=evidences,
            isoform=s[1],
            ncbi_tax_id=self.ncbi_tax_id,
        )

        for k in kinase_ups:

            if (
                not self.allow_mixed_organisms and (
                    self.get_taxon(k) != self.ncbi_tax_id or
                    self.get_taxon(s[0]) != self.ncbi_tax_id
                )
            ):

                continue

            # the enzyme (kinase)
            dom = intera.Domain(
                protein=k,
                ncbi_tax_id=self.ncbi_tax_id,
            )

            dommot = intera.DomainMotif(
                domain=dom,
                ptm=ptm,
                evidences=evidences,
            )

            if hasattr(self.input_param, 'extra_attrs'):

                for attr, key in self.input_param.extra_attrs.items():

                    if key in p:

                        setattr(dommot, attr, p[key])

            yield dommot


def input_is(self, i, op='__eq__'):
    """
    Compares `i` to the lowercase name of the resource by calling the
    method `op` of `i`. False if the name is not a string type.
    """

    return (
        type(self.name) in _const.CHAR_TYPES and
        getattr(i, op)(self.name.lower())
    )


def __iter__(self):
    """
    Iterates through the enzyme-substrate interactions.
    """

    for p in self.data:

        for enz_sub in self._process(p):

            yield enz_sub


def __len__(self):
    """
    Number of raw records loaded; zero if no data has been loaded.
    """

    return len(self.data) if hasattr(self, 'data') else 0


def __repr__(self):

    return '<Enzyme-substrate processor: %u records>' % len(self)
def __init__(
        self,
        ncbi_tax_id,
        input_param=None,
        input_method=None,
        map_by_orthology_from=None,
        trace=False,
        id_type_enzyme=None,
        id_type_substrate=None,
        name=None,
        orthology_only_swissprot=True,
        ptm_orthology_strict=False,
        **kwargs
    ):
    """
    Unifies a `pypath.core.enz_sub.EnzymeSubstrateProcessor` and a
    `pypath.utils.orthology.PtmOrthology` object to build a set of
    enzyme-substrate interactions from a database and subsequently
    translate them by orthology to one different organism.
    Multiple organisms can be chosen as the source of the
    enzyme-substrate interactions. For example, if you want mouse
    interactions, you can translate them from human and from rat.
    To get the original mouse interactions themselves, use another
    instance of the `EnzymeSubstrateProcessor`.
    To have both the original and the orthology translated set,
    and also from multiple databases, moreover all these merged into
    a single set, use the `EnzymeSubstrateAggregator`.

    :param int ncbi_tax_id:
        The NCBI Taxonomy ID the interactions should be translated to.
    :param str input_method:
        Data source for `EnzymeSubstrateProcessor`.
    :param map_by_orthology_from:
        Source organism(s) as NCBI Taxonomy IDs; by default human,
        mouse and rat (9606, 10090, 10116). The target organism is
        always removed from the sources.
    :param bool orthology_only_swissprot:
        Use only SwissProt (i.e. not Trembl) at orthology translation.
    :param bool ptm_orthology_strict:
        Use only those homologous PTM pairs which are in PhosphoSite
        data, i.e. do not look for residues with same offset in
        protein sequence.
    :param **kwargs:
        Kept and passed to `EnzymeSubstrateProcessor` at iteration.

    See further options at `EnzymeSubstrateProcessor`.
    """

    if not hasattr(self, '_logger'):

        session_mod.Logger.__init__(self, name='enz_sub_orthology')

    self.target_taxon = ncbi_tax_id

    source_taxa = common.to_set(
        map_by_orthology_from or {9606, 10090, 10116}
    )
    # translating an organism to itself makes no sense
    source_taxa.discard(self.target_taxon)
    self.map_by_orthology_from = source_taxa

    self.input_param = input_param
    self.input_method = input_method
    self.trace = trace
    self.id_type_enzyme = id_type_enzyme
    self.id_type_substrate = id_type_substrate
    self.name = name
    self.ptmprocargs = kwargs

    orthology.PtmOrthology.__init__(
        self,
        target=ncbi_tax_id,
        only_swissprot=orthology_only_swissprot,
        strict=ptm_orthology_strict,
    )
def __iter__(self):
    """
    Iterates through enzyme-substrate interactions translated to
    another organism by orthology.

    For each source organism the processor part of the object is
    initialized again, hence the data of the previous organism is
    replaced.
    """

    for source_taxon in self.map_by_orthology_from:

        self._log(
            'Translating enzyme-substrate interactions '
            'from organism %u to %u.' % (
                source_taxon,
                self.target_taxon,
            )
        )

        self.set_default_source(source_taxon)

        processor_args = {
            'input_param': self.input_param,
            'input_method': self.input_method,
            'ncbi_tax_id': source_taxon,
            'trace': self.trace,
            'id_type_enzyme': self.id_type_enzyme,
            'id_type_substrate': self.id_type_substrate,
            'name': self.name,
            'allow_mixed_organisms': True,
        }

        EnzymeSubstrateProcessor.__init__(
            self,
            **processor_args,
            **self.ptmprocargs
        )

        self._log(
            'Enzyme-substrate interactions loaded from resource `%s` '
            'for organism %s, %u raw records.' % (
                self.name,
                source_taxon,
                len(self),
            )
        )

        for es in EnzymeSubstrateProcessor.__iter__(self):

            yield from self.translate(es)


def __repr__(self):

    source_taxa = ', '.join(
        str(tax) for tax in self.map_by_orthology_from
    )

    return (
        '<Enzyme-substrate orthology processor, '
        'target taxon: %u, source taxon(s): %s>' % (
            self.target_taxon,
            source_taxa,
        )
    )
def __init__(
        self,
        input_param=None,
        exclude=None,
        ncbi_tax_id=9606,
        map_by_orthology_from=None,
        trace=False,
        orthology_only_swissprot=True,
        ptm_orthology_strict=False,
        nonhuman_direct_lookup=True,
        inputargs=None,
        pickle_file=None,
    ):
    """
    Builds a database of enzyme-substrate interactions from multiple
    resources, or loads it from a pickle file.

    All arguments are stored as attributes of the same name, then
    `main` is called.

    :param input_param:
        Resource definitions; if empty, `set_inputs` obtains them from
        the resource controller.
    :param exclude:
        Stored on the instance.
    :param int ncbi_tax_id:
        NCBI Taxonomy ID of the organism the database is built for.
    :param map_by_orthology_from:
        Source organisms for orthology translation. If `None`, human,
        mouse and rat are used for non-human targets and nothing for
        human.
    :param bool trace:
        Passed to the processors.
    :param bool orthology_only_swissprot:
        Passed to the orthology processor.
    :param bool ptm_orthology_strict:
        Passed to the orthology processor.
    :param bool nonhuman_direct_lookup:
        For non-human organisms, load data directly from resources
        which support organisms.
    :param dict inputargs:
        Stored on the instance; defaults to an empty dict at build.
    :param str pickle_file:
        If provided, the database is loaded from this file instead of
        being built.
    """

    session_mod.Logger.__init__(self, name='enz_sub')

    # explicit assignments instead of copying `locals()`, which would
    # also create a `self.self` attribute referring to the object itself
    self.input_param = input_param
    self.exclude = exclude
    self.ncbi_tax_id = ncbi_tax_id
    self.map_by_orthology_from = map_by_orthology_from
    self.trace = trace
    self.orthology_only_swissprot = orthology_only_swissprot
    self.ptm_orthology_strict = ptm_orthology_strict
    self.nonhuman_direct_lookup = nonhuman_direct_lookup
    self.inputargs = inputargs
    self.pickle_file = pickle_file

    self.main()
def reload(self):
    """
    Reloads the module of the object and updates the class of the
    instance to the reloaded one.
    """

    modname = self.__class__.__module__
    mod = __import__(modname, fromlist=[modname.split('.')[0]])
    imp.reload(mod)
    new = getattr(mod, self.__class__.__name__)
    setattr(self, '__class__', new)


def main(self):
    """
    Loads the database from the pickle file if one is set, otherwise
    builds it.
    """

    if self.pickle_file:

        self.load_from_pickle(pickle_file=self.pickle_file)

    else:

        self.build()


def load_from_pickle(self, pickle_file=None):
    """
    Loads the enzyme-substrate interactions and the references from
    a pickle file.

    :param str pickle_file:
        Path to the pickle file; falls back to `self.pickle_file`
        if not provided.
    """

    # the file named in the argument has to be the one opened
    pickle_file = pickle_file or self.pickle_file

    self._log('Loading from file `%s`.' % pickle_file)

    # NOTE: unpickling executes code from the file, load only
    # pickle files from trusted sources
    with open(pickle_file, 'rb') as fp:

        self.enz_sub, self.references = pickle.load(fp)

    self.update_ptm_lookup_dict()


def save_to_pickle(self, pickle_file):
    """
    Saves the enzyme-substrate interactions and the references into
    a pickle file.

    :param str pickle_file:
        Path to the pickle file.
    """

    self._log('Saving to file `%s`.' % pickle_file)

    with open(pickle_file, 'wb') as fp:

        pickle.dump(
            obj=(
                self.enz_sub,
                self.references,
            ),
            file=fp,
        )


def build(self):
    """
    Builds the database: sets the defaults, collects the inputs, loads
    all resources and merges the redundant records.
    """

    self.inputargs = self.inputargs or {}

    if self.map_by_orthology_from is None:

        # by default nothing is translated to human
        self.map_by_orthology_from = (
            {9606, 10090, 10116}
            if self.ncbi_tax_id != 9606
            else set()
        )

    self.map_by_orthology_from = set(self.map_by_orthology_from)
    self.map_by_orthology_from.discard(self.ncbi_tax_id)

    self.set_inputs()
    self.build_list()
    self.unique()


def __iter__(self):
    """
    Iterates through all enzyme-substrate interactions.
    """

    for ptm in itertools.chain(*self.enz_sub.values()):

        yield ptm


def __len__(self):
    """
    Total number of enzyme-substrate interactions.
    """

    return sum(len(esub) for esub in self.enz_sub.values())


def __repr__(self):

    return '<Enzyme-substrate database: %s relationships>' % len(self)


def __getitem__(self, *args):
    """
    Accepts an (enzyme, substrate) pair, see `get_enzyme_substrate`.
    """

    args = args[0] if isinstance(args[0], tuple) else args

    return self.get_enzyme_substrate(*args)


def get_enzyme_substrate(self, enzyme, substrate):
    """
    Returns the list of interactions between an enzyme and a
    substrate, or `None` if the pair is not in the database.
    """

    key = (entity.Entity(enzyme), entity.Entity(substrate))

    return self.enz_sub.get(key)


def set_inputs(self):
    """
    If no input definitions are provided, collects them from the
    resource controller.
    """

    self.input_param = (
        self.input_param or
        resources.get_controller().collect_enzyme_substrate()
    )
def build_list(self):
    """
    Builds a full list of enzyme-substrate interactions from all the
    requested sources. This list might contain redundant elements
    which later will be merged by `unique`. This 'full list' is
    organised into a dict by pairs of proteins in order to make it
    more efficient to compile a unique set for each pair.

    The resource definitions in `self.input_param` are either dicts
    or objects with the attributes `name`, `input_method` and
    `organisms_supported`. A failure in one resource is logged and
    does not stop the loading of the others.
    """

    def extend_lists(enz_sub):

        for es in enz_sub:

            key = (es.domain.protein, es.ptm.protein)

            if key not in self.enz_sub:

                self.enz_sub[key] = []

            self.enz_sub[key].append(es)

            for ev in es.evidences:

                resource_key = (ev.resource.name, ev.resource.via)

                self.references[resource_key][es.key()].update(
                    ev.references
                )

    self._log(
        'Starting to build enzyme-substrate '
        'database for organism `%u`.' % self.ncbi_tax_id
    )

    self.enz_sub = {}
    self.references = collections.defaultdict(
        lambda: collections.defaultdict(set)
    )

    for input_param in self.input_param:

        is_dict = isinstance(input_param, dict)
        name = input_param['name'] if is_dict else input_param.name

        try:

            input_method = (
                input_param['input_method']
                if is_dict else
                input_param.input_method
            )

            self._log(
                'Loading enzyme-substrate interactions '
                'from resource `%s` by method `%s`.' % (
                    name,
                    input_method,
                )
            )

            args = input_param if is_dict else {'input_param': input_param}

            # dict definitions have keys, not attributes; a missing
            # key means the same as the default of the processor
            if (
                self.ncbi_tax_id == 9606 or (
                    self.nonhuman_direct_lookup and (
                        input_param.get('organisms_supported', False)
                        if is_dict else
                        input_param.organisms_supported
                    )
                )
            ):

                self._log(
                    'Loading enzyme-substrate interactions '
                    'for taxon `%u`.' % self.ncbi_tax_id
                )

                proc = EnzymeSubstrateProcessor(
                    ncbi_tax_id=self.ncbi_tax_id,
                    trace=self.trace,
                    **args
                )

                extend_lists(proc.__iter__())

            if self.map_by_orthology_from:

                source_taxons_str = ', '.join(
                    '%u' % tax for tax in self.map_by_orthology_from
                )

                self._log(
                    'Mapping `%s` by orthology from taxons %s to %u.' % (
                        input_method,
                        source_taxons_str,
                        self.ncbi_tax_id,
                    )
                )

                proc = EnzymeSubstrateOrthologyProcessor(
                    ncbi_tax_id=self.ncbi_tax_id,
                    map_by_orthology_from=self.map_by_orthology_from,
                    trace=self.trace,
                    orthology_only_swissprot=self.orthology_only_swissprot,
                    ptm_orthology_strict=self.ptm_orthology_strict,
                    **args
                )

                extend_lists(proc.__iter__())

                self._log(
                    'Finished translating `%s` by orthology '
                    'from %s to %u.' % (
                        input_method,
                        source_taxons_str,
                        self.ncbi_tax_id,
                    )
                )

            self._log(
                'Finished loading enzyme-substrate data '
                'from resource `%s`.' % name
            )

        except Exception as e:

            # deliberately broad: one failing resource should
            # not prevent loading the rest
            self._log('Failed to load resource `%s`.' % name)
            self._log_traceback()

            try:

                traceback.print_tb(
                    e.__traceback__,
                    file=sys.stdout,
                )

            except Exception:

                self._log('Failed handling exception.')
                self._log_traceback()

    # no more missing keys should be created silently
    self.references = dict(self.references)
    self.update_ptm_lookup_dict()

    self._log(
        'Finished building enzyme-substrate database '
        'for organism `%u`, resulted %u relationships.' % (
            self.ncbi_tax_id,
            len(self),
        )
    )
def unique(self):
    """
    Merges the redundant elements of the interaction list.
    Elements are redundant if they agree in all their attributes
    except the sources, references and isoforms.

    The lists are replaced pair by pair in `self.enz_sub`;
    `self.unique_list` is reset to an empty set.
    """

    self.unique_list = set()

    for pair in list(self.enz_sub):

        self.enz_sub[pair] = self.uniq_enz_sub(self.enz_sub[pair])
@staticmethod
def uniq_enz_sub(enz_sub):
    """
    Merges the equal elements of a list of enzyme-substrate
    interactions.

    Each element is merged (by the `merge` method) into every already
    kept element it is equal to; if there is none, the element itself
    is kept. Returns the list of the kept elements.
    """

    kept = []

    for candidate in enz_sub:

        found = False

        for known in kept:

            if candidate == known:

                known.merge(candidate)
                found = True

        if not found:

            kept.append(candidate)

    return kept


def make_df(self, tax_id=False, resources_only_primary=False):
    """
    Creates a data frame of the enzyme-substrate interactions in
    `self.df`.

    :param bool tax_id:
        Add a column `ncbi_tax_id` with the organism of the database.
    :param bool resources_only_primary:
        Passed to the `get_line` method of each interaction.
    """

    self._log('Creating enzyme-substrate interaction data frame.')

    hdr = [
        'enzyme',
        'enzyme_genesymbol',
        'substrate',
        'substrate_genesymbol',
        'isoforms',
        'residue_type',
        'residue_offset',
        'modification',
        'sources',
        'references',
        'curation_effort',
    ]

    dtypes = {
        'enzyme': 'category',
        'substrate': 'category',
        'isoforms': 'category',
        'residue_type': 'category',
        'residue_offset': 'int32',
        'modification': 'category',
        'sources': 'category',
        'references': 'category',
        'curation_effort': 'int32',
    }

    records = [
        dm.get_line(resources_only_primary=resources_only_primary)
        for dm in self
    ]

    df = pd.DataFrame(records, columns=hdr)
    df = df.astype(dtypes)

    self.df = df.loc[:, hdr]

    if tax_id:

        self.df['ncbi_tax_id'] = [self.ncbi_tax_id] * self.df.shape[0]

    self._log(
        'Created enzyme-substrate interaction data frame. '
        'Memory usage: %s.' % common.df_memory_usage(self.df)
    )


def export_table(self, fname):
    """
    Creates the data frame and writes it to a tab separated file.

    :param str fname:
        Path to the output file.
    """

    self.make_df()
    self.df.to_csv(fname, sep='\t', index=False)
def assign_to_network(self, pa):
    """
    Assigns enzyme-substrate interactions to the edges of a
    network in a py:class:``pypath.legacy.main.PyPath`` instance.

    The interactions are appended to the `ptm` edge attribute, which
    is created if it does not exist yet. Pairs without a corresponding
    edge in the network are skipped.

    :param pa:
        The network object; its `graph` attribute is expected to be
        an igraph graph.
    """

    pa.update_vname()

    if 'ptm' not in pa.graph.es.attributes():

        pa.graph.es['ptm'] = [[] for _ in pa.graph.es]

    for key, ptms in self.enz_sub.items():

        nodes = pa.get_node_pair(
            key[0],
            key[1],
            directed=pa.graph.is_directed(),
        )

        e = None

        if nodes:

            e = pa.graph.get_eid(
                nodes[0],
                nodes[1],
                error=False,
            )

        # edge ids start from zero; with `error = False`
        # a missing edge is reported as -1
        if isinstance(e, int) and e >= 0:

            if pa.graph.es[e]['ptm'] is None:

                pa.graph.es[e]['ptm'] = []

            pa.graph.es[e]['ptm'].extend(ptms)