#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

from future.utils import iteritems
from past.builtins import xrange, range

from typing import Literal

import os
import sys
import itertools
import functools
import collections
import importlib as imp
import re
import time
import datetime
import json
import pickle
import copy
import abc
import inspect
import types as _types

import timeloop
import pandas as pd

import pypath.utils.mapping as mapping
import pypath.share.common as common
import pypath_common._constants as _const
import pypath.internals.intera as intera
import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.inputs.uniprot as uniprot_input
import pypath.inputs.uniprot_db as uniprot_db
import pypath.inputs.homologene as homologene_input
import pypath.inputs.oma as oma_input
import pypath.inputs.biomart as biomart
import pypath.utils.seq as _se
import pypath.share.session as session
import pypath.share.settings as settings
import pypath.utils.taxonomy as taxonomy
import pypath.share.cache as cache_mod

# Module-wide scheduler for the periodic orthology table cleanup job;
# its own logger is set to a very high level.
_orthology_cleanup_timeloop = timeloop.Timeloop()
_orthology_cleanup_timeloop.logger.setLevel(9999)

# Module-level logger used by the table classes.
_logger = session.Logger(name='orthology')
_log = _logger._log
def __init__(
    self,
    cleanup_period: int = 10,
    lifetime: int = 300,
    **kwargs
):
    """
    Set up the manager state and schedule the periodic cleanup job.

    Args:
        cleanup_period:
            Interval in seconds between two runs of the cleanup job.
        lifetime:
            Stored on the instance; `_remove_expired` drops tables whose
            last use is older than this many seconds.
        kwargs:
            Default translation parameters. Only keys listed in
            `TRANSLATION_PARAM` are kept; missing ones are stored as
            None.
    """

    session.Logger.__init__(self, name='orthology')

    # Initialise the state *before* the background job is started: the
    # job calls `_remove_expired`, which reads `lifetime`, `tables` and
    # `expiry`, and would fail if it fired on a half-built instance.
    self.lifetime = lifetime
    self.tables = {}
    self.expiry = {}
    self._param = {
        key: kwargs.get(key)
        for key in self.TRANSLATION_PARAM
    }

    @_orthology_cleanup_timeloop.job(
        interval=datetime.timedelta(seconds=cleanup_period)
    )
    def _cleanup():

        self._remove_expired()

    _orthology_cleanup_timeloop.start(block=False)

    self._log('OrthologyManager has been created.')
# NOTE(review): `lru_cache` on a method keys on `self` and keeps the
# instance referenced by the cache; results are also served from the
# cache without touching the tables. Confirm this is intended.
@common.ignore_unhashable
@functools.lru_cache(maxsize=int(1e5))
def translate(
    self,
    identifiers: str | Iterable[str],
    target: str | int,
    source: str | int = 9606,
    id_type: str = 'uniprot',
    only_swissprot: bool = True,
    oma: bool = None,
    homologene: bool = None,
    ensembl: bool = None,
    oma_rel_type: (
        set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None
    ) = None,
    oma_score: float | None = None,
    ensembl_hc: bool = True,
    ensembl_types: (
        list[Literal['one2one', 'one2many', 'many2many']] | None
    ) = None,
    full_records: bool = False,
):
    """
    Translate one or more identifiers by orthologous gene pairs.

    The result is the union of the translations from every enabled
    resource.

    Args:
        identifiers:
            One or more identifiers of the source organism, of ID type
            `id_type`.
        target:
            Name or NCBI Taxonomy ID of the target organism.
        source:
            Name or NCBI Taxonomy ID of the source organism.
        id_type:
            The identifier type to use.
        only_swissprot:
            Use only SwissProt IDs.
        oma:
            Use orthology information from the Orthologous Matrix (OMA).
        homologene:
            Use orthology information from NCBI HomoloGene.
        ensembl:
            Use orthology information from Ensembl.
        oma_rel_type:
            Restrict OMA relations to certain types.
        oma_score:
            Lower threshold for the OMA similarity metric.
        ensembl_hc:
            Use only the high confidence orthology relations from
            Ensembl.
        ensembl_types:
            Ensembl orthology relation types to use. Possible values
            are `one2one`, `one2many` and `many2many`.
        full_records:
            Include not only the identifiers, but also some properties
            of the orthology relationships.

    Returns:
        Set of identifiers (or ortholog records if `full_records`) of
        orthologous genes or proteins in the target taxon.
    """

    target = taxonomy.ensure_ncbi_tax_id(target)
    source = taxonomy.ensure_ncbi_tax_id(source)

    # `locals()` is read here, while it still holds only the call
    # arguments: `_translation_param` looks parameters up by name.
    param = self._translation_param(locals())

    result = set()

    for resource, keys in self.RESOURCE_PARAM.items():

        if not param[resource]:

            continue

        table = self.which_table(
            target=target,
            source=source,
            only_swissprot=only_swissprot,
            id_type=id_type,
            resource=resource,
        )

        result.update(
            table.translate(
                identifiers,
                full_records=full_records,
                # only the parameters belonging to this resource
                **{k: v for k, v in param.items() if k in keys},
            )
        )

    return result
def get_dict(
    self,
    target: str | int,
    source: str | int = 9606,
    id_type: str = 'uniprot',
    only_swissprot: bool = True,
    oma: bool = None,
    homologene: bool = None,
    ensembl: bool = None,
    oma_rel_type: (
        set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None
    ) = None,
    oma_score: float | None = None,
    ensembl_hc: bool = True,
    ensembl_types: (
        list[Literal['one2one', 'one2many', 'many2many']] | None
    ) = None,
    full_records: bool = False,
) -> dict[str, set[OrthologBase]]:
    """
    Build a source-to-orthologs lookup dict from the enabled resources.

    Args:
        target:
            Name or NCBI Taxonomy ID of the target organism.
        source:
            Name or NCBI Taxonomy ID of the source organism.
        id_type:
            The identifier type to use.
        only_swissprot:
            Use only SwissProt IDs.
        oma:
            Use orthology information from the Orthologous Matrix (OMA).
        homologene:
            Use orthology information from NCBI HomoloGene.
        ensembl:
            Use orthology information from Ensembl.
        oma_rel_type:
            Restrict OMA relations to certain types.
        oma_score:
            Lower threshold for the OMA similarity metric.
        ensembl_hc:
            Use only the high confidence orthology relations from
            Ensembl.
        ensembl_types:
            Ensembl orthology relation types to use. Possible values
            are `one2one`, `one2many` and `many2many`.
        full_records:
            Include not only the identifiers, but also some properties
            of the orthology relationships.

    Returns:
        A dict with identifiers of the source organism as keys, and
        sets of their orthologs (merged over the resources) as values.
    """

    target = taxonomy.ensure_ncbi_tax_id(target)
    source = taxonomy.ensure_ncbi_tax_id(source)

    # parameters are looked up by name from the call arguments
    param = self._translation_param(locals())

    merged = collections.defaultdict(set)

    for resource, keys in self.RESOURCE_PARAM.items():

        if param[resource]:

            resource_args = {
                name: value
                for name, value in param.items()
                if name in keys
            }

            table = self.which_table(
                target=target,
                source=source,
                only_swissprot=only_swissprot,
                id_type=id_type,
                resource=resource,
            )

            by_source = table.asdict(
                full_records=full_records,
                **resource_args
            )

            for source_id, orthologs in by_source.items():

                merged[source_id].update(orthologs)

    return dict(merged)
def get_df(
    self,
    target: str | int,
    source: str | int = 9606,
    id_type: str = 'uniprot',
    only_swissprot: bool = True,
    oma: bool = None,
    homologene: bool = None,
    ensembl: bool = None,
    oma_rel_type: (
        set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None
    ) = None,
    oma_score: float | None = None,
    ensembl_hc: bool = True,
    ensembl_types: (
        list[Literal['one2one', 'one2many', 'many2many']] | None
    ) = None,
    full_records: bool = False,
    **kwargs
) -> pd.DataFrame:
    """
    Create a data frame for one source organism and ID type.

    The per-resource data frames of every enabled resource are
    concatenated.

    Args:
        target:
            Name or NCBI Taxonomy ID of the target organism.
        source:
            Name or NCBI Taxonomy ID of the source organism.
        id_type:
            The identifier type to use.
        only_swissprot:
            Use only SwissProt IDs.
        oma:
            Use orthology information from the Orthologous Matrix (OMA).
        homologene:
            Use orthology information from NCBI HomoloGene.
        ensembl:
            Use orthology information from Ensembl.
        oma_rel_type:
            Restrict OMA relations to certain types.
        oma_score:
            Lower threshold for the OMA similarity metric.
        ensembl_hc:
            Use only the high confidence orthology relations from
            Ensembl.
        ensembl_types:
            Ensembl orthology relation types to use. Possible values
            are `one2one`, `one2many` and `many2many`.
        full_records:
            Include not only the identifiers, but also some properties
            of the orthology relationships.
        kwargs:
            Ignored.

    Returns:
        A data frame with pairs of orthologous identifiers, in two
        columns: "source" and "target". Empty (with these two columns)
        if no resource is enabled.
    """

    target = taxonomy.ensure_ncbi_tax_id(target)
    source = taxonomy.ensure_ncbi_tax_id(source)

    # parameters are looked up by name from the call arguments
    param = self._translation_param(locals())

    result = []

    for resource, keys in self.RESOURCE_PARAM.items():

        if not param[resource]:

            continue

        table = self.which_table(
            target=target,
            source=source,
            only_swissprot=only_swissprot,
            id_type=id_type,
            resource=resource,
        )

        result.append(
            table.df(
                full_records=full_records,
                **{p: v for p, v in param.items() if p in keys}
            )
        )

    if not result:

        # `pd.concat` raises ValueError on an empty sequence; with no
        # resource enabled the honest answer is an empty table.
        return pd.DataFrame(columns=['source', 'target'])

    return pd.concat(result)
def translate_df(
    self,
    df: pd.DataFrame,
    target: str | int,
    source: str | int = 9606,
    cols: str | list[str] | dict[str, str] | None = None,
    id_type: str = 'uniprot',
    only_swissprot: bool = True,
    oma: bool = None,
    homologene: bool = None,
    ensembl: bool = None,
    oma_rel_type: (
        set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None
    ) = None,
    oma_score: float | None = None,
    ensembl_hc: bool = True,
    ensembl_types: (
        list[Literal['one2one', 'one2many', 'many2many']] | None
    ) = None,
    **kwargs: str | tuple[str, str]
) -> pd.DataFrame:
    """
    Translate columns in a data frame.

    Args:
        df:
            A data frame.
        target:
            Name or NCBI Taxonomy ID of the target organism.
        source:
            Name or NCBI Taxonomy ID of the source organism.
        cols:
            One or more columns to be translated. It can be a single
            column name, an iterable of column names or a dict where
            keys are column names and values are ID types. Except this
            last case, identifiers are assumed to be `id_type`.
        id_type:
            The default identifier type to use, will be used for all
            columns where ID type is not specified.
        only_swissprot:
            Use only SwissProt IDs.
        oma:
            Use orthology information from the Orthologous Matrix (OMA).
        homologene:
            Use orthology information from NCBI HomoloGene.
        ensembl:
            Use orthology information from Ensembl.
        oma_rel_type:
            Restrict OMA relations to certain types.
        oma_score:
            Lower threshold for the OMA similarity metric.
        ensembl_hc:
            Use only the high confidence orthology relations from
            Ensembl.
        ensembl_types:
            Ensembl orthology relation types to use. Possible values
            are `one2one`, `one2many` and `many2many`.
        kwargs:
            Same as providing a dict to ``cols``, but beware, keys
            (column names) can not match existing argument names of
            this function. If a column is given both ways, the entry
            in ``cols`` takes precedence.

    Returns:
        A data frame with the same column layout as the input, and the
        identifiers translated as demanded. Rows that could not be
        translated are omitted.
    """

    if not isinstance(cols, dict):

        cols = {col: id_type for col in common.to_list(cols)}

    # Columns given as keyword arguments used to be merged into `kwargs`
    # and then never looked at; merge them into one mapping that drives
    # the translation (entries of `cols` win on a name clash).
    col_types = {**kwargs, **cols}

    for _id_type in set(col_types.values()):

        # one orthology data frame per ID type, from all enabled resources
        ortho_df = self.get_df(
            target=target,
            source=source,
            id_type=_id_type,
            only_swissprot=only_swissprot,
            oma=oma,
            homologene=homologene,
            ensembl=ensembl,
            oma_rel_type=oma_rel_type,
            oma_score=oma_score,
            ensembl_hc=ensembl_hc,
            ensembl_types=ensembl_types,
        )

        # The table instance is only the carrier of `translate_df`; the
        # translation data itself is overridden by `ortho_df`.
        table = self.which_table(
            target=target,
            source=source,
            only_swissprot=only_swissprot,
            id_type=_id_type,
            resource='oma',
        )

        df = table.translate_df(
            df=df,
            cols=[c for c, i in col_types.items() if i == _id_type],
            ortho_df=ortho_df,
        )

    return df
def _translation_param(self, loc: dict) -> dict:
    """
    Resolve which resources are enabled and their parameters.

    For every resource in `RESOURCE_PARAM`, and for each of its keys if
    the resource is enabled, the value is chosen by
    `common.first_value` from, in this order: the call arguments
    (`loc`), the instance defaults (`self._param`) and the module
    settings (`orthology_<resource>[_<key>]`).

    Args:
        loc:
            Call arguments by name, typically `locals()` of the caller.

    Returns:
        Dict with one entry for each resource and, for the enabled
        resources, one entry for each of their parameter keys.
    """

    resolved = {}

    for resource, keys in self.RESOURCE_PARAM.items():

        is_enabled = common.first_value(
            loc[resource],
            self._param[resource],
            settings.get(f'orthology_{resource}'),
        )
        resolved[resource] = is_enabled

        if not is_enabled:

            continue

        for key in keys:

            full_name = f'{resource}_{key}'
            resolved[key] = common.first_value(
                loc[full_name],
                self._param[full_name],
                settings.get(f'orthology_{full_name}'),
            )

    return resolved


def _remove_expired(self):
    """
    Drop the tables that have not been used for longer than `lifetime`.
    """

    # iterate over a snapshot, entries are deleted inside the loop
    for key, last_used in list(self.expiry.items()):

        if not time.time() - last_used > self.lifetime:

            continue

        if key not in self.tables:

            continue

        self._log(
            'Removing orthology table from taxon %u to %u '
            '(only SwissProt: %s; resource: %s; ID type: %s)' % key
        )

        del self.tables[key]
        del self.expiry[key]


def __del__(self):
    """
    Stop the module level cleanup scheduler if it can be stopped.
    """

    stoppable = hasattr(_orthology_cleanup_timeloop, 'stop')

    if stoppable:

        _orthology_cleanup_timeloop.stop()
def __init__(self, preload_seq=(), isoforms=True):
    """
    This is an object to store sequences of multiple organisms and
    select the appropriate one.

    Args:
        preload_seq:
            Iterable of taxa; `load_seq` is called for each of them.
            The default is an immutable empty tuple (instead of a
            shared mutable list).
        isoforms:
            Stored as `seq_isoforms` on the instance.
    """

    # initialise the logger only if a subclass has not done it already
    if not hasattr(self, '_logger'):

        session.Logger.__init__(self, name='orthology')

    self.seq_isoforms = isoforms

    for taxon in preload_seq:

        self.load_seq(taxon)
def __init__(
    self,
    target: str | int,
    source: str | int | None = 9606,
    id_type: str = 'uniprot',
    only_swissprot: bool = True,
    **kwargs
):
    """
    Orthology translation table between two organisms.

    This class translates between homologous UniProt IDs of two
    organisms based on NCBI HomoloGene and Ensembl data. In case of
    HomoloGene, the UniProt-UniProt translation table is created by
    translating the source organism UniProts to RefSeq and Entrez IDs,
    finding the homologues (orthologues) for these IDs, and then
    translating them to the target organism UniProt IDs. In case of
    Ensembl, we obtain data with Ensembl protein identifiers and
    translate those to UniProt.

    The constructor loads the proteome of the source organism and then
    the orthology data itself by calling `load`.

    Args:
        target:
            Name or NCBI Taxonomy ID of the target organism.
        source:
            Name or NCBI Taxonomy ID of the source organism.
        id_type:
            The identifier type to use.
        only_swissprot:
            Use only SwissProt IDs.
        kwargs:
            Resource specific parameters.
    """

    self.data = {}
    self.id_type = id_type

    # taxa are stored as NCBI Taxonomy IDs
    self.target = taxonomy.ensure_ncbi_tax_id(target)
    self.source = taxonomy.ensure_ncbi_tax_id(source)

    self._resource_l = self.resource.lower()

    Proteomes.__init__(self, only_swissprot=only_swissprot)
    self.load_proteome(self.source)

    # resource specific parameters become instance attributes
    self._set_param(kwargs, *self._param)

    self.load()
def translate(
    self,
    identifier: str | Iterable[str],
    full_records: bool = False,
    **kwargs
) -> set[str]:
    """
    Translate one or more identifiers of the source organism to their
    orthologues in the target organism.

    Args:
        identifier:
            One or more identifiers corresponding to the ID type and
            source organism of the instance. An object with a
            `components` attribute is treated as one single identifier.
        full_records:
            Include not only the identifiers, but also some properties
            of the orthology relationships.
        kwargs:
            Resource specific filtering criteria, passed to `match`
            (the same way as `asdict` does).

    Returns:
        A set of identifiers (or ortholog records if `full_records`) of
        orthologues in the target taxon; an empty set if nothing could
        be translated or no identifier was provided.
    """

    identifiers = (
        (identifier,)
            if hasattr(identifier, 'components') else
        common.to_list(identifier)
    )

    # Accumulating into one set (instead of `set.union(*...)`) also
    # works for zero identifiers, where `set.union()` raises TypeError.
    orthologs = set()

    for one_id in identifiers:

        orthologs.update(
            o
            for o in self.data.get(one_id, ())
            # the filtering criteria used to be silently ignored here
            if self.match(o, **kwargs)
        )

    if not full_records:

        return {o.id for o in orthologs}

    return orthologs
def asdict(
    self,
    full_records: bool = False,
    **kwargs
) -> dict[str, set[OrthologBase]]:
    """
    Export the translation table as a dictionary.

    Args:
        full_records:
            Keep the complete ortholog records instead of only their
            `id` attribute.
        kwargs:
            Resource specific filtering criteria, passed to `match`.

    Returns:
        A dict with identifiers of the source organism as keys, and
        sets of their orthologs as values. Every source identifier of
        the table is present, even if none of its orthologs passes the
        filter.
    """

    result = {}

    for source_id, orthologs in self.data.items():

        passing = (o for o in orthologs if self.match(o, **kwargs))

        if full_records:

            result[source_id] = set(passing)

        else:

            result[source_id] = {o.id for o in passing}

    return result
def df(
    self,
    full_records: bool = False,
    **kwargs
) -> pd.DataFrame:
    """
    Export the orthologous pairs as a data frame.

    Args:
        full_records:
            Include not only the identifiers, but also some properties
            of the orthology relationships; the fields of the ortholog
            records become columns, with `id` renamed to "target".
        kwargs:
            Resource specific filtering criteria.

    Returns:
        A data frame with pairs of orthologous identifiers, in two
        columns: "source" and "target".
    """

    _log(
        'Creating translation data frame between '
        f'organisms `{self.source}` and `{self.target}`, '
        f'ID type `{self.id_type}`.'
    )

    by_source = self.asdict(full_records=full_records, **kwargs)

    # one row for each source ID, the target column holds sets
    frame = pd.DataFrame(by_source.items(), columns=['source', 'target'])
    # one row for each pair; sources without orthologs are dropped
    frame = frame.explode('target', ignore_index=True)
    frame = frame.dropna().reset_index(drop=True)

    if full_records:

        # expand the ortholog records into columns of their own
        record_columns = pd.DataFrame(frame['target'].tolist())
        frame = pd.concat([frame['source'], record_columns], axis=1)
        frame = frame.rename(columns={'id': 'target'})

    return frame
def translate_df(
    self,
    df: pd.DataFrame,
    cols: str | list[str] | None = None,
    ortho_df: pd.DataFrame | None = None,
    **kwargs
):
    """
    Translate identifier columns of a data frame to the target organism.

    Each column is translated by an inner join against the table of
    orthologous pairs, hence rows without ortholog are lost and rows
    with multiple orthologs are multiplied.

    Args:
        df:
            A data frame.
        cols:
            One or more names of columns to be translated; passed
            through `common.to_list`.
        ortho_df:
            Override the translation data frame. If provided, the
            parameters in `kwargs` won't have an effect. Must have
            columns "source" and "target".
        kwargs:
            Resource specific translation parameters, passed to `df`.

    Returns:
        A data frame with the same column layout as the input, and the
        identifiers translated as demanded. Rows that could not be
        translated are omitted.
    """

    _log(
        f'Translating data frame column(s) from '
        f'organism `{self.source}` to `{self.target}`.'
    )

    if ortho_df is None:

        ortho_df = self.df(**kwargs)

    # temporary column names that should not clash with the user's
    pairs = ortho_df.rename(
        {
            'source': 'pypath_internal_source',
            'target': 'pypath_internal_target',
        },
        axis=1,
    )

    col_order = df.columns

    for col in common.to_list(cols):

        _log(
            f'Translating `{self.id_type}` IDs of organism `{self.source}` '
            f'in column `{col}` to organism `{self.target}`.'
        )

        lookup = pairs.rename({'pypath_internal_source': col}, axis=1)
        joined = df.merge(lookup, on=col, how='inner')
        # the translated IDs take the place of the original column
        without_original = joined.drop(col, axis=1)
        df = without_original.rename(
            {'pypath_internal_target': col},
            axis=1,
        )

    # restore the column layout of the input
    return df[col_order]
def _translation_param(self, loc: dict) -> dict:
    """
    Select the translation parameters from a dict of local variables.

    Args:
        loc:
            Variables by name; must contain every name listed in
            `OrthologyManager.TRANSLATION_PARAM`.

    Returns:
        Dict of the translation parameters.
    """

    return {
        name: loc[name]
        for name in OrthologyManager.TRANSLATION_PARAM
    }


def _set_param(self, loc: dict, *params: str):
    """
    Set resource specific parameters as instance attributes.

    Each value is chosen by `common.first_value` from `loc` and the
    `orthology_<resource>_<param>` module setting.

    Args:
        loc:
            Parameter values by name; missing names count as None.
        params:
            Names of the parameters to set.
    """

    for name in params:

        settings_key = f'orthology_{self._resource_l}_{name}'
        value = common.first_value(
            loc.get(name, None),
            settings.get(settings_key),
        )
        setattr(self, name, value)


def match(self, ortholog: OrthologBase, **kwargs) -> bool:
    """
    Check an ortholog against filtering criteria.

    This default implementation accepts every ortholog.
    """

    return True


def _from_pickle(self) -> bool:
    """
    Load the table from its pickle file if caching is enabled and the
    file exists.

    Returns:
        True if the data has been loaded from the cache.
    """

    use_cache = (
        settings.get('orthology_cache') and
        os.path.exists(self.pickle_path)
    )

    if not use_cache:

        return False

    # NOTE(review): unpickling executes arbitrary code; this is only
    # safe as long as the cache directory is trusted.
    with open(self.pickle_path, 'rb') as fp:

        self.data = pickle.load(fp)

    _log(
        'Orthology table from taxon %u to %u (only SwissProt: %s; '
        'resource: %s; ID type: %s) has been loaded from `%s`.' % (
            self.key + (self.pickle_path,)
        )
    )

    return True


def _to_pickle(self):
    """
    Save the table data into its pickle file.
    """

    with open(self.pickle_path, 'wb') as fp:

        pickle.dump(self.data, fp)

    _log(
        'Orthology table from taxon %u to %u (only SwissProt: %s; '
        'resource: %s; ID type: %s) has been saved to `%s`.' % (
            self.key + (self.pickle_path,)
        )
    )


@property
def key(self):
    """
    The key identifying this table: source, target, only SwissProt,
    resource and ID type.
    """

    return OrthologyTableKey(
        source=self.source,
        target=self.target,
        only_swissprot=self.only_swissprot,
        resource=self._resource_l,
        id_type=self.id_type,
    )


@property
def pickle_path(self):
    """
    Path of the pickle file in the cache directory; the file name is
    the MD5 hash of the JSON dump of the table key.
    """

    key_hash = common.md5(json.dumps(self.key))

    return os.path.join(cache_mod.get_cachedir(), f'{key_hash}.pickle')


def __len__(self):
    """
    Total number of orthology relationships in the table.
    """

    return sum(len(orthologs) for orthologs in self.data.values())


def __repr__(self):

    return (
        f'<{self.resource} Orthology table from {self.source} to '
        f'{self.target}: {self.id_type} IDs, {len(self)} relationships>'
    )
def load(self):
    """
    Load orthology data from NCBI HomoloGene.

    If the table is available from the cache, it is loaded from there.
    If `id_type` is one of the ID types handled directly here (gene
    symbol, RefSeq, Entrez, GI), the HomoloGene data is simply
    converted to ortholog records. For any other ID type, each UniProt
    ID of the source proteome is translated to gene symbol, RefSeq and
    Entrez; the orthologs found by these are translated to UniProt IDs
    of the target organism; and finally both sides are translated to
    the desired ID type. Only this latter kind of table is saved to
    the cache.
    """

    if self._from_pickle():

        return

    if self.id_type in ('genesymbol', 'refseq', 'refseqp', 'entrez', 'gi'):

        direct = homologene_input.homologene_dict(
            self.source,
            self.target,
            self.id_type,
        )

        self.data = {
            source_id: {HomologeneOrtholog(t) for t in target_ids}
            for source_id, target_ids in direct.items()
        }

        return

    # one HomoloGene lookup dict for each intermediate ID type
    hg = {
        id_type: homologene_input.homologene_dict(
            self.source,
            self.target,
            id_type,
        )
        for id_type in ('genesymbol', 'refseq', 'entrez')
    }

    _log(
        'Loading orthology data from NCBI HomoloGene '
        f'between organisms `{self.source}` and `{self.target}`.'
    )

    table = self.data = collections.defaultdict(set)

    for uniprot in self._proteomes[(self.source, self.only_swissprot)]:

        target_uniprots = set()

        for id_type, hgdata in hg.items():

            hg_source_ids = mapping.map_name(
                uniprot,
                'uniprot',
                id_type,
                ncbi_tax_id=self.source,
            )

            if not hg_source_ids:

                continue

            hg_target_ids = set()

            for hg_source_id in hg_source_ids:

                hg_target_ids |= hgdata.get(hg_source_id, set())

            if hg_target_ids:

                target_uniprots.update(
                    mapping.map_names(
                        hg_target_ids,
                        id_type,
                        'uniprot',
                        ncbi_tax_id=self.target,
                    )
                )

        if self.id_type != 'uniprot':

            source_ids = mapping.map_name(
                uniprot,
                'uniprot',
                self.id_type,
                ncbi_tax_id=self.source,
            )
            target_ids = mapping.map_names(
                target_uniprots,
                'uniprot',
                self.id_type,
                ncbi_tax_id=self.target,
            )

        else:

            source_ids = (uniprot,)
            target_ids = target_uniprots

        for source_id in source_ids:

            table[source_id].update(
                {HomologeneOrtholog(t) for t in target_ids}
            )

    # plain dict: no accidental key creation on lookups later
    self.data = dict(table)
    self._to_pickle()
def __init__(
    self,
    target: int | str,
    source: int | str = 9606,
    id_type: str = 'uniprot',
    only_swissprot: bool = None,
    hc: bool = None,
    types: list[Literal['one2one', 'one2many', 'many2many']] = None,
):
    """
    Orthology translation with Ensembl data.

    All arguments are handed over to `ProteinOrthology.__init__`.

    Args:
        target:
            Name or NCBI Taxonomy ID of the target organism.
        source:
            Name or NCBI Taxonomy ID of the source organism.
        id_type:
            The identifier type to use.
        only_swissprot:
            Use only SwissProt IDs.
        hc:
            Use only high confidence orthology relations from Ensembl.
        types:
            The Ensembl orthology relationship types to use. Possible
            values are `one2one`, `one2many` and `many2many`.
    """

    ProteinOrthology.__init__(
        self=self,
        target=target,
        source=source,
        id_type=id_type,
        only_swissprot=only_swissprot,
        hc=hc,
        types=types,
    )
def load(self):
    """
    Load orthology data from Ensembl (BioMart).

    If the table is available from the cache, it is loaded from there.
    If either organism has no Ensembl name, nothing is loaded. For
    the ID types that Ensembl provides directly (gene symbol, ENSP,
    ENSG) the identifiers are taken as they are; for any other ID type
    the Ensembl protein IDs of both sides are translated to UniProt
    and from there to the desired ID type. The resulted table is saved
    to the cache.
    """

    target_organism = taxonomy.ensure_ensembl_name(self.target)
    source_organism = taxonomy.ensure_ensembl_name(self.source)

    _log(
        'Loading orthology data from Ensembl '
        f'between organisms `{self.source}` and `{self.target}`.'
    )

    if self._from_pickle():

        return

    if not target_organism or not source_organism:

        _log(
            'No Ensembl orthology data available between '
            f'organisms `{self.source}` and `{self.target}`.'
        )
        return

    target_prefix = f'{target_organism}_homolog_'
    attr_conf = f'{target_prefix}orthology_confidence'
    attr_type = f'{target_prefix}orthology_type'

    ensembl_data = biomart.biomart_homology(
        source_organism=self.source,
        target_organism=self.target,
    )

    # record attributes carrying the IDs, for the directly available
    # ID types
    _id_types = {
        'target': {
            'genesymbol': f'{target_prefix}associated_gene_name',
            'ensp': f'{target_prefix}ensembl_peptide',
            'ensg': f'{target_prefix}ensembl_gene',
        },
        'source': {
            'genesymbol': 'external_gene_name',
            'ensp': 'ensembl_peptide_id',
            'ensg': 'ensembl_gene_id',
        },
    }

    # for all other ID types we start from the Ensembl protein IDs
    attr_tgt_id = _id_types['target'].get(
        self.id_type,
        f'{target_prefix}ensembl_peptide',
    )
    attr_src_id = _id_types['source'].get(
        self.id_type,
        'ensembl_peptide_id',
    )

    def _ortholog(record, target_id):
        # one ortholog record with the confidence and relation type
        return EnsemblOrtholog(
            id=target_id,
            hc=getattr(record, attr_conf) == '1',
            types=getattr(record, attr_type).split('_')[-1],
        )

    self.data = collections.defaultdict(set)

    if self.id_type in _id_types['target']:

        for r in ensembl_data:

            self.data[getattr(r, attr_src_id)].add(
                _ortholog(r, getattr(r, attr_tgt_id))
            )

    else:

        # This loop used to run also after the direct branch above,
        # feeding gene symbols or ENSG IDs into an ENSP-to-UniProt
        # mapping; it belongs only to the ID types that need mapping.
        for r in ensembl_data:

            ids = {}

            for side, attr_id in (
                ('source', attr_src_id),
                ('target', attr_tgt_id),
            ):

                uniprots = mapping.map_name(
                    getattr(r, attr_id),
                    'ensp',
                    'uniprot',
                    ncbi_tax_id=getattr(self, side),
                )

                ids[side] = mapping.map_names(
                    uniprots,
                    'uniprot',
                    self.id_type,
                    ncbi_tax_id=getattr(self, side),
                    uniprot_cleanup=False,
                )

            for s in ids['source']:

                self.data[s].update(
                    {_ortholog(r, t) for t in ids['target']}
                )

    self.data = dict(self.data)
    self._to_pickle()
def match(self, ortholog: OrthologBase, **kwargs) -> bool:
    """
    Tell if an ortholog passes the confidence and relation type filters.

    Args:
        ortholog:
            An ortholog record with `hc` and `types` attributes.
        kwargs:
            Override the filtering parameters of the instance: `hc` and
            `types`. None values are ignored, i.e. the instance level
            parameter is used.

    Returns:
        True if the ortholog meets the criteria.
    """

    overrides = {
        name: value
        for name, value in kwargs.items()
        if value is not None
    }

    hc = overrides['hc'] if 'hc' in overrides else self.hc
    types = overrides['types'] if 'types' in overrides else self.types

    # no `hc` requirement or a high confidence ortholog
    hc_ok = not hc or ortholog.hc

    # no type restriction or the type is among the allowed ones
    return hc_ok and (not types or ortholog.types in types)
def __init__(
    self,
    target: int | str,
    source: int | str = 9606,
    id_type: str = 'uniprot',
    only_swissprot: bool = None,
    rel_type: (
        set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None
    ) = None,
    score: float | None = None,
):
    """
    Orthology translation with OMA data.

    All arguments are handed over to `ProteinOrthology.__init__`.

    Args:
        target:
            Name or NCBI Taxonomy ID of the target organism.
        source:
            Name or NCBI Taxonomy ID of the source organism.
        id_type:
            The identifier type to use.
        only_swissprot:
            Use only SwissProt IDs.
        rel_type:
            Restrict relations to certain types.
        score:
            Lower threshold for similarity metric.
    """

    ProteinOrthology.__init__(
        self=self,
        target=target,
        source=source,
        id_type=id_type,
        only_swissprot=only_swissprot,
        rel_type=rel_type,
        score=score,
    )
def load(self):
    """
    Load orthology data from OMA.

    If the table is available from the cache, it is loaded from there.
    Otherwise the orthologous pairs between the source and target
    organisms are retrieved, grouped by the identifier of the source
    side, and the resulted table is saved to the cache.
    """

    _log(
        'Loading orthology data from OMA '
        f'between organisms `{self.source}` and `{self.target}`.'
    )

    if self._from_pickle():

        return

    pairs = oma_input.oma_orthologs(
        organism_a=self.source,
        organism_b=self.target,
        id_type=self.id_type,
    )

    by_source = self.data = collections.defaultdict(set)

    for pair in pairs:

        ortholog = OmaOrtholog(
            id=pair.b.id,
            score=pair.score,
            rel_type=pair.rel_type,
        )
        by_source[pair.a.id].add(ortholog)

    # plain dict: no accidental key creation on lookups later
    self.data = dict(by_source)
    self._to_pickle()
def match(self, ortholog: OrthologBase, **kwargs) -> bool:
    """
    Tell if an ortholog passes the score and relation type filters.

    Args:
        ortholog:
            An ortholog record with `score` and `rel_type` attributes.
        kwargs:
            Override the filtering parameters of the instance: `score`
            and `rel_type`. None values are ignored, i.e. the instance
            level parameter is used.

    Returns:
        True if the ortholog meets the criteria.
    """

    overrides = {
        name: value
        for name, value in kwargs.items()
        if value is not None
    }

    score = overrides['score'] if 'score' in overrides else self.score
    rel_type = (
        overrides['rel_type']
            if 'rel_type' in overrides else
        self.rel_type
    )

    # no threshold or the score reaches the threshold
    score_ok = score is None or ortholog.score >= score

    # no restriction or the relation type is among the allowed ones
    return score_ok and (not rel_type or ortholog.rel_type in rel_type)
def translate_site(
    self,
    protein: str | intera.Protein,
    res: str,
    offset: int,
    isoform: int = 1,
    typ: str = 'phosphorylation',
    source_organism: str | int | None = None,
) -> set[tuple]:
    """
    Translates one PTM site.

    First the PTM orthology data (`ptmortho`) is consulted. If it has
    nothing for the site and the instance is not `strict`, the protein
    is translated by orthology and the same residue is looked up in the
    sequences of the orthologs at the same offset and the two
    following ones.

    Args:
        protein:
            A protein identifier or an intera.Protein object.
        res:
            Single letter code of the residue.
        offset:
            Sequence offset of the site.
        isoform:
            Sequence isoform.
        typ:
            Modification type.
        source_organism:
            Name or NCBI Taxonomy ID of the source organism.

    Returns:
        A set of tuples with the identifier, isoform, residue, offset,
        taxon and modification type of the orthologous PTM sites.
    """

    source = self._get_source(source_organism)
    protein_id = getattr(protein, 'identifier', protein)
    sourceptm = (protein_id, isoform, res, offset, source, typ)

    # the protein is already from the target organism
    if self.get_taxon(protein_id) == self.target:

        return {sourceptm}

    # `ptmortho` is a dict of dicts: site -> taxon -> set of sites.
    # Formerly the whole inner dict was returned here instead of the
    # sites of the target taxon; a copy is returned, so callers can
    # not alter the stored data.
    result = set(self.ptmortho.get(sourceptm, {}).get(self.target, ()))

    if result or self.strict:

        return result

    tsubs = self.manager.translate(
        identifiers=protein_id,
        target=self.target,
        source=source,
        only_swissprot=self.only_swissprot,
        id_type=self.id_type,
        **self.orthology_args
    )

    for tsub in tsubs:

        se = self.get_seq(tsub, taxon=self.target)

        if se is None:

            continue

        for toffset in range(offset, offset + 3):

            for i in se.isoforms():

                tres = se.get(toffset, isoform=i)

                if tres == res:

                    result.add((
                        tsub,
                        i,
                        tres,
                        toffset,
                        self.target,
                        typ,
                    ))

            # stop widening the offset window once any site is found
            if result:

                break

    return result
def translate(self, x, return_strings=False, **kwargs):
    """
    Translates anything: string notation, intera objects, tuples.

    Accepted inputs:

    - one PTM provided as tuple of (UniProt, amino acid, offset)
    - one PTM provided as string (e.g. `P00533_S231`)
    - instance from pypath.intera: DomainMotif, Domain or Ptm

    Args:
        x:
            The object to be translated.
        return_strings:
            Only for string input: convert the result to a list of
            strings in the same notation.
        kwargs:
            Passed to `translate_site` for tuple and string input;
            additional arguments can be isoform and typ (modification
            type).

    Returns:
        The translated sites or objects; an empty list if the input
        is of unknown type or the string could not be parsed.
    """

    if type(x) is tuple:

        return self.translate_site(*x, **kwargs)

    if type(x) in _const.CHAR_TYPES:

        parsed = self.reptm.match(x)

        if parsed is None:

            sites = []

        else:

            sites = self.translate_site(
                parsed[1],
                parsed[2],
                int(parsed[3]),
                **kwargs
            )

        if return_strings:

            # identifier, residue and offset, e.g. `P00533_S231`
            sites = [
                '%s_%s%u' % (site[0], site[2], site[3])
                for site in sites
            ]

        return sites

    if isinstance(x, intera.Ptm):

        return self.translate_ptm(x)

    if isinstance(x, intera.Domain):

        return self.translate_domain(x)

    if isinstance(x, intera.DomainMotif):

        return self.translate_domain_motif(x)

    return []
def ptm_orthology(self):
    """
    Load PTM orthology data from PhosphoSite.

    Creates an orthology translation dict of PTM sites based on the
    modification site tables from PhosphoSitePlus, one table for each
    modification type in `common.psite_mod_types`. Sites sharing a
    site group ID but belonging to different taxa are considered
    orthologous. The result is stored in `ptmortho`, a dict of dicts
    of sets: site -> taxon -> orthologous sites. All PTMs are
    represented by a tuple of the following 6 elements: UniProt ID,
    isoform (int), residue one letter code, residue number (int),
    NCBI Taxonomy ID (int), modification type.
    """

    self.ptmortho = {}

    nondigit = re.compile(r'[^\d]+')

    unknown_taxa = set()

    for typ in common.psite_mod_types:

        groups = {}

        url = urls.urls['psite_%s' % typ[0]]['url']
        c = curl.Curl(url, silent=False, large=True)

        data = c.result

        # the first four lines are skipped
        for _ in xrange(4):

            next(data)

        for line in data:

            fields = line.split('\t')

            if len(fields) < 10:

                continue

            # isoforms are denoted by a dash and a number suffix
            accession = fields[2].split('-')
            isoform = int(accession[1]) if len(accession) > 1 else 1
            uniprot = accession[0]

            # residue letter followed by the residue number
            aa = fields[4][0]
            num = int(nondigit.sub('', fields[4]))

            if fields[6] not in taxonomy.taxa:

                unknown_taxa.add(fields[6])
                continue

            tax = taxonomy.taxa[fields[6]]
            group = int(fields[5])

            groups.setdefault(group, set()).add(
                (uniprot, isoform, aa, num, tax, typ[1])
            )

        for sites in groups.values():

            # all ordered pairs of sites within the group
            for site1, site2 in itertools.permutations(sites, 2):

                # same taxon: not an orthologous pair
                if site1[4] == site2[4]:

                    continue

                (
                    self.ptmortho
                        .setdefault(site1, {})
                        .setdefault(site2[4], set())
                        .add(site2)
                )

    if len(unknown_taxa):

        self._log(
            'Unknown taxa encountered: %s' % (
                ', '.join(sorted(unknown_taxa))
            )
        )
def _get_source(self, source: str | int | None) -> int:
    """
    Returns the NCBI Taxonomy ID of the source taxon.

    Args:
        source:
            Name or NCBI Taxonomy ID of the source organism; if it can
            not be resolved, the `source` of the instance is used.

    Raises:
        ValueError: if neither the argument nor the instance provides
            a source taxon.
    """

    ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(source) or self.source

    if ncbi_tax_id:

        return ncbi_tax_id

    msg = (
        f'No source taxon provided (argument: `{source}`, '
        f'instance: `{self.source}`)'
    )
    self._log(msg)

    raise ValueError(msg)


def __len__(self):
    """
    Number of sites in the PTM orthology data; zero if it has not
    been loaded.
    """

    ptmortho = getattr(self, 'ptmortho', ())

    return len(ptmortho)


def __repr__(self):

    n_sites = len(self)

    return f'<PTM Orthology: {n_sites} sites>'
def init():
    """
    Initialize the orthology manager.

    Creates a new `OrthologyManager` instance and binds it to the name
    `manager` in the module namespace, replacing any previous one.
    """

    global manager

    manager = OrthologyManager()
def get_manager():
    """
    Access the orthology manager.

    Returns the orthology manager, an object which loads and unloads
    the orthology lookup tables as necessary, and provides the
    interface for querying the orthology data. Normally one instance
    of the manager belongs to the module; if it does not exist yet, it
    is created by `init` upon the first call.
    """

    namespace = globals()

    try:

        return namespace['manager']

    except KeyError:

        init()

    return namespace['manager']
def translate(
    identifiers: str | Iterable[str],
    target: str | int,
    source: str | int = 9606,
    id_type: str = 'uniprot',
    only_swissprot: bool = True,
    oma: bool = None,
    homologene: bool = None,
    ensembl: bool = None,
    oma_rel_type: (
        set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None
    ) = None,
    oma_score: float | None = None,
    ensembl_hc: bool = True,
    ensembl_types: (
        list[Literal['one2one', 'one2many', 'many2many']] | None
    ) = None,
    full_records: bool = False,
):
    """
    Translate one or more identifiers by orthologous gene pairs.

    Module level shortcut to the `translate` method of the orthology
    manager; all arguments are forwarded by keyword.

    Args:
        identifiers:
            One or more identifiers of the source organism, of ID type
            `id_type`.
        target:
            Name or NCBI Taxonomy ID of the target organism.
        source:
            Name or NCBI Taxonomy ID of the source organism.
        id_type:
            The identifier type to use.
        only_swissprot:
            Use only SwissProt IDs.
        oma:
            Use orthology information from the Orthologous Matrix (OMA).
        homologene:
            Use orthology information from NCBI HomoloGene.
        ensembl:
            Use orthology information from Ensembl.
        oma_rel_type:
            Restrict OMA relations to certain types.
        oma_score:
            Lower threshold for the OMA similarity metric.
        ensembl_hc:
            Use only the high confidence orthology relations from
            Ensembl.
        ensembl_types:
            Ensembl orthology relation types to use. Possible values
            are `one2one`, `one2many` and `many2many`.
        full_records:
            Include not only the identifiers, but also some properties
            of the orthology relationships.

    Returns:
        Set of identifiers of orthologous genes or proteins in the
        target taxon.
    """

    # keywords in the order of the signature
    return get_manager().translate(
        identifiers=identifiers,
        target=target,
        source=source,
        id_type=id_type,
        only_swissprot=only_swissprot,
        oma=oma,
        homologene=homologene,
        ensembl=ensembl,
        oma_rel_type=oma_rel_type,
        oma_score=oma_score,
        ensembl_hc=ensembl_hc,
        ensembl_types=ensembl_types,
        full_records=full_records,
    )
def get_dict(
    target: str | int,
    source: str | int = 9606,
    id_type: str = 'uniprot',
    only_swissprot: bool = True,
    oma: bool = None,
    homologene: bool = None,
    ensembl: bool = None,
    oma_rel_type: (
        set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None
    ) = None,
    oma_score: float | None = None,
    ensembl_hc: bool = True,
    ensembl_types: (
        list[Literal['one2one', 'one2many', 'many2many']] | None
    ) = None,
    full_records: bool = False,
) -> dict[str, set[OrthologBase]]:
    """
    Create a dictionary for one source organism and ID type.

    Module level shortcut to the `get_dict` method of the orthology
    manager; all arguments are forwarded by keyword.

    Args:
        target:
            Name or NCBI Taxonomy ID of the target organism.
        source:
            Name or NCBI Taxonomy ID of the source organism.
        id_type:
            The identifier type to use.
        only_swissprot:
            Use only SwissProt IDs.
        oma:
            Use orthology information from the Orthologous Matrix (OMA).
        homologene:
            Use orthology information from NCBI HomoloGene.
        ensembl:
            Use orthology information from Ensembl.
        oma_rel_type:
            Restrict OMA relations to certain types.
        oma_score:
            Lower threshold for the OMA similarity metric.
        ensembl_hc:
            Use only the high confidence orthology relations from
            Ensembl.
        ensembl_types:
            Ensembl orthology relation types to use. Possible values
            are `one2one`, `one2many` and `many2many`.
        full_records:
            Include not only the identifiers, but also some properties
            of the orthology relationships.

    Returns:
        A dict with identifiers of the source organism as keys, and
        sets of their orthologs as values.
    """

    return get_manager().get_dict(
        target=target,
        source=source,
        id_type=id_type,
        only_swissprot=only_swissprot,
        oma=oma,
        homologene=homologene,
        ensembl=ensembl,
        oma_rel_type=oma_rel_type,
        oma_score=oma_score,
        ensembl_hc=ensembl_hc,
        ensembl_types=ensembl_types,
        full_records=full_records,
    )
def get_df(
    target: str | int,
    source: str | int = 9606,
    id_type: str = 'uniprot',
    only_swissprot: bool = True,
    oma: bool = None,
    homologene: bool = None,
    ensembl: bool = None,
    oma_rel_type: (
        set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None
    ) = None,
    oma_score: float | None = None,
    ensembl_hc: bool = True,
    ensembl_types: (
        list[Literal['one2one', 'one2many', 'many2many']] | None
    ) = None,
    full_records: bool = False,
    **kwargs
) -> pd.DataFrame:
    """
    Create a data frame for one source organism and ID type.

    Module level shortcut to the `get_df` method of the orthology
    manager; the named arguments are forwarded by keyword.

    Args:
        target:
            Name or NCBI Taxonomy ID of the target organism.
        source:
            Name or NCBI Taxonomy ID of the source organism.
        id_type:
            The identifier type to use.
        only_swissprot:
            Use only SwissProt IDs.
        oma:
            Use orthology information from the Orthologous Matrix (OMA).
        homologene:
            Use orthology information from NCBI HomoloGene.
        ensembl:
            Use orthology information from Ensembl.
        oma_rel_type:
            Restrict OMA relations to certain types.
        oma_score:
            Lower threshold for the OMA similarity metric.
        ensembl_hc:
            Use only the high confidence orthology relations from
            Ensembl.
        ensembl_types:
            Ensembl orthology relation types to use. Possible values
            are `one2one`, `one2many` and `many2many`.
        full_records:
            Include not only the identifiers, but also some properties
            of the orthology relationships.
        kwargs:
            Ignored, not even forwarded to the manager.

    Returns:
        A data frame with pairs of orthologous identifiers, in two
        columns: "source" and "target".
    """

    return get_manager().get_df(
        target=target,
        source=source,
        id_type=id_type,
        only_swissprot=only_swissprot,
        oma=oma,
        homologene=homologene,
        ensembl=ensembl,
        oma_rel_type=oma_rel_type,
        oma_score=oma_score,
        ensembl_hc=ensembl_hc,
        ensembl_types=ensembl_types,
        full_records=full_records,
    )
def translate_df(
    df: pd.DataFrame,
    target: str | int,
    source: str | int = 9606,
    cols: str | list[str] | dict[str, str] | None = None,
    id_type: str = 'uniprot',
    only_swissprot: bool = True,
    oma: bool = None,
    homologene: bool = None,
    ensembl: bool = None,
    oma_rel_type: (
        set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None
    ) = None,
    oma_score: float | None = None,
    ensembl_hc: bool = True,
    ensembl_types: (
        list[Literal['one2one', 'one2many', 'many2many']] | None
    ) = None,
    **kwargs: str | tuple[str, str]
) -> pd.DataFrame:
    """
    Translate columns in a data frame.

    Module level shortcut to the `translate_df` method of the
    orthology manager; all arguments, including the extra keyword
    arguments, are forwarded by keyword.

    Args:
        df:
            A data frame.
        target:
            Name or NCBI Taxonomy ID of the target organism.
        source:
            Name or NCBI Taxonomy ID of the source organism.
        cols:
            One or more columns to be translated. It can be a single
            column name, an iterable of column names or a dict where
            keys are column names and values are ID types. Except this
            last case, identifiers are assumed to be `id_type`.
        id_type:
            The default identifier type to use, will be used for all
            columns where ID type is not specified.
        only_swissprot:
            Use only SwissProt IDs.
        oma:
            Use orthology information from the Orthologous Matrix (OMA).
        homologene:
            Use orthology information from NCBI HomoloGene.
        ensembl:
            Use orthology information from Ensembl.
        oma_rel_type:
            Restrict OMA relations to certain types.
        oma_score:
            Lower threshold for the OMA similarity metric.
        ensembl_hc:
            Use only the high confidence orthology relations from
            Ensembl.
        ensembl_types:
            Ensembl orthology relation types to use. Possible values
            are `one2one`, `one2many` and `many2many`.
        kwargs:
            Same as providing a dict to ``cols``, but beware, keys
            (column names) can not match existing argument names of
            this function.

    Returns:
        A data frame with the same column layout as the input, and the
        identifiers translated as demanded. Rows that could not be
        translated are omitted.
    """

    return get_manager().translate_df(
        df=df,
        target=target,
        source=source,
        cols=cols,
        id_type=id_type,
        only_swissprot=only_swissprot,
        oma=oma,
        homologene=homologene,
        ensembl=ensembl,
        oma_rel_type=oma_rel_type,
        oma_score=oma_score,
        ensembl_hc=ensembl_hc,
        ensembl_types=ensembl_types,
        **kwargs
    )