#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems
from past.builtins import xrange, range

import os
import sys
import importlib as imp
import re
from collections import Counter, OrderedDict
import numpy as np
import itertools

try:
    import cPickle as pickle
except ImportError:
    # only a missing `cPickle` should trigger the fallback
    import pickle

import pypath.share.cache as cache
import pypath.inputs.go as go_input
import pypath.share.progress as progress
import pypath.share.common as common
from pypath.share.common import *
import pypath.share.session as session_mod
import pypath.share.settings as settings

# this is for GO terms parsing:
# tokens of an expression built from GO accessions and lowercase operators
_reexprterm = re.compile(r'and|or|not|\(|\)|GO:[0-9]{7}')

# tokens of an expression built from GO term names and uppercase operators
_reexprname = re.compile(
    r'(?!\s)'  # no space at the beginning
    r'(?:AND|OR|NOT|\(|\)|'  # either AND, OR, NOT or parentheses
    # or something else (words with spaces); `\s{2,}` stops a name at
    # two or more consecutive whitespaces
    r'(?:(?!OR|AND|NOT|\s{2,})(?:[-\w: ]))+)'
    r'(?<!\s)'  # no space at the end
)

# accessions of the root terms of the three ontology aspects
ROOT_NODES = {
    'cellular_component': 'GO:0005575',
    'biological_process': 'GO:0008150',
    'molecular_function': 'GO:0003674',
}

ROOT_ACS = set(ROOT_NODES.values())
def __init__(
        self,
        terms=None,
        ancestors=None,
        descendants=None,
        aspect=None,
        term=None,
        name=None,
    ):
    """
    Loads data about Gene Ontology terms and their relations.

    Each argument is stored on the instance under a ``_<name>_provided``
    attribute before the loading procedure is started.

    :param terms: Terms data to use, stored as ``_terms_provided``.
    :param ancestors: Ancestors graph, stored as ``_ancestors_provided``.
    :param descendants: Descendants graph, stored as
        ``_descendants_provided``.
    :param aspect: Term to aspect mapping, stored as ``_aspect_provided``.
    :param term: Name to term mapping, stored as ``_term_provided``.
    :param name: Term to name mapping, stored as ``_name_provided``.
    """

    session_mod.Logger.__init__(self, name='go')

    provided = (
        ('terms', terms),
        ('ancestors', ancestors),
        ('descendants', descendants),
        ('aspect', aspect),
        ('term', term),
        ('name', name),
    )

    for label, value in provided:
        setattr(self, '_%s_provided' % label, value)

    self._load()
def reload(self):
    """
    Reloads the object from the module level.

    The module defining the class of this object is imported and
    reloaded, then the class of the instance is replaced by the
    freshly loaded class of the same name.
    """

    cls = self.__class__
    module_name = cls.__module__
    top_level = module_name.split('.')[0]

    module = __import__(module_name, fromlist=[top_level])
    imp.reload(module)

    self.__class__ = getattr(module, cls.__name__)
def_load(self):self._log('Populating Gene Ontology: ontology.')self._load_terms()self._load_tree()self._set_aspect()self._set_name()self._set_term()# delattr(self, '_terms')self._log('Gene Ontology: ontology populated.')def_load_terms(self):self._terms=self._terms_providedorgo_input.go_terms_quickgo()def_load_tree(self):self._log('Gene Ontology: building the ontology tree.')self.ancestors=(self._ancestors_providedorself._merge_aspects(go_input.go_ancestors_quickgo()))self.descendants=(self._descendants_providedorself._merge_aspects(go_input.go_descendants_quickgo()))def_set_aspect(self):self.aspect=(self._aspect_providedordict((term,asp)forasp,termsiniteritems(self._terms)forterminterms.keys()))def_set_name(self):self._log('Collecting short names of GO terms.')self.name=(self._name_providedordict(iforiiinself._terms.values()foriiniteritems(ii)))def_set_term(self):self.term=(self._term_providedordict(reversed(i)foriiniteritems(self.name)))
def is_term(self, term):
    """
    Tells if ``term`` is a GO accession number, i.e. a key of the
    ``name`` dict.
    """

    known_terms = self.name

    return term in known_terms
def is_name(self, name):
    """
    Tells if ``name`` is a GO term name, i.e. a key of the ``term``
    dict.
    """

    known_names = self.term

    return name in known_names
def get_name(self, term):
    """
    For a GO accession number returns the name of the term.
    If ``term`` is already a GO term name returns it unchanged.
    Returns ``None`` if it is neither a known name nor a known
    accession.
    """

    if self.is_name(term):
        return term

    if term in self.name:
        return self.name[term]

    return None
def get_term(self, name):
    """
    For a GO term name returns its GO accession number.
    If ``name`` is a GO accession returns it unchanged.
    If the lookup gives ``None`` a message is sent to the log and
    ``None`` is returned.
    """

    if self.is_term(name):
        result = name
    elif name in self.term:
        result = self.term[name]
    else:
        result = None

    if result is None:
        self._log('Could not find GO term name: `%s`.' % name)

    return result
def terms_to_names(self, terms):
    """
    For a list of GO terms returns a list of tuples with the terms
    and their names.
    """

    pairs = []

    for term in terms:
        pairs.append((term, self.get_name(term)))

    return pairs
def terms_to_names_aspects(self, terms):
    """
    For a list of GO terms returns a list of tuples with the terms,
    their names and the ontology aspect.
    """

    triplets = []

    for term in terms:
        triplets.append(
            (term, self.get_name(term), self.get_aspect(term))
        )

    return triplets
def names_to_terms(self, names):
    """
    For a list of GO names returns a list of tuples with the terms
    and their names, the term being the first element.
    """

    pairs = []

    for name in names:
        pairs.append((self.get_term(name), name))

    return pairs
def names_to_terms_aspects(self, names):
    """
    For a list of GO names returns a list of tuples with the terms,
    their names and ontology aspects.
    """

    triplets = []

    for name in names:
        triplets.append(
            (self.get_term(name), name, self.aspect_from_name(name))
        )

    return triplets
def aspect_from_name(self, name):
    """
    Tells about a Gene Ontology term name which aspect does it belong to.
    Returns ``None`` if the name could not be translated to a term.
    """

    term = self.get_term(name)

    if not term:
        return None

    return self.get_aspect(term)
def subgraph_nodes(
        self,
        direction,
        terms,
        relations=None,
        include_seed=True,
    ):
    """
    Returns a set of all nodes either in the subgraph of ancestors
    or descendants of a single term or a set of terms.

    The graph is walked iteratively and each term is expanded only once,
    hence shared ancestors or descendants are not traversed repeatedly
    and cycles can not cause endless recursion.

    :param str direction:
        Possible values: `ancestors` or `descendants`. The attribute of
        this name is used as the graph.
    :param str,set terms:
        A single term or an iterable of terms, the seeds of the subgraph.
    :param set relations:
        Only edges of these relation types are followed. By default
        ``self.all_relations``.
    :param bool include_seed:
        Include ``terms`` in the subgraph or only the related nodes.
        A seed reachable from another seed is part of the result anyways.

    :return: Set of the terms in the subgraph.
    """

    relations = relations or self.all_relations

    if isinstance(terms, str):
        terms = {terms}

    graph = getattr(self, direction)
    seeds = set(terms)
    subgraph = set(seeds) if include_seed else set()
    # terms which have their neighbours processed already
    expanded = set()
    stack = list(seeds)

    while stack:
        term = stack.pop()

        if term in expanded:
            continue

        expanded.add(term)

        if term not in graph:
            # root terms are expected to have no ancestors
            if term not in ROOT_ACS:
                self._log(
                    'GO term without known %ss: `%s`.' % (direction, term)
                )

            continue

        for related, relation in graph[term]:
            if relation not in relations:
                continue

            subgraph.add(related)

            if related not in expanded:
                stack.append(related)

    return subgraph
def get_all_ancestors(self, terms, relations=None, include_seed=True):
    """
    Returns a set of all ancestors of a single term or a set of terms.

    ``terms`` is first converted by ``set_of_terms``, the rest of the
    arguments are passed to ``subgraph_nodes``.
    """

    seeds = self.set_of_terms(terms)

    query = {
        'direction': 'ancestors',
        'terms': seeds,
        'relations': relations,
        'include_seed': include_seed,
    }

    return self.subgraph_nodes(**query)
def get_all_descendants(
        self,
        terms,
        relations=None,
        include_seed=True,
    ):
    """
    Returns a set of all descendants of a single term or a set of terms.

    ``terms`` is first converted by ``set_of_terms``, the rest of the
    arguments are passed to ``subgraph_nodes``.
    """

    seeds = self.set_of_terms(terms)

    query = {
        'direction': 'descendants',
        'terms': seeds,
        'relations': relations,
        'include_seed': include_seed,
    }

    return self.subgraph_nodes(**query)
def get_aspect(self, term):
    """
    For a GO term tells which aspect does it belong to.
    Returns `None` if the term is not in the ontology.
    """

    aspects = self.aspect

    return aspects[term] if term in aspects else None
def all_from_aspect(self, aspect):
    """
    Returns the set of all GO terms of one aspect.
    """

    selected = set()

    for term, asp in self.aspect.items():
        if asp == aspect:
            selected.add(term)

    return selected
def is_root(self, term):
    """
    Tells if a term is the root of the graph i.e. it has no ancestors.

    A term listed in ``ancestors`` is a root if its set of ancestors is
    empty. A term missing from ``ancestors`` is a root only if it is
    known from ``descendants``; unknown terms are not roots.

    :param str term: A GO term accession.

    :return: bool.
    """

    if term in self.ancestors:
        # the term is a root only if nothing is above it
        return not self.ancestors[term]

    return term in self.descendants
def is_leaf(self, term):
    """
    Tells if a term is a leaf of the graph i.e. it has no descendants.

    A term listed in ``descendants`` is a leaf if its set of descendants
    is empty. A term missing from ``descendants`` is a leaf only if it
    is known from ``ancestors``; for unknown terms ``False`` is returned
    instead of raising ``KeyError``.

    :param str term: A GO term accession.

    :return: bool.
    """

    if term in self.descendants:
        # the term is a leaf only if nothing is below it
        return not self.descendants[term]

    return term in self.ancestors
def lowest(self, terms, *args):
    """
    From a set of terms returns the lowest level ones, removing all
    which are parents of some others in the set.

    Delegates to ``flatten`` with its default settings.
    """

    flattened = self.flatten(terms, *args)

    return flattened
def highest(self, terms, *args):
    """
    From a set of terms returns the highest level ones, removing all
    which are descendants of some others in the set.

    Delegates to ``flatten`` with ``lowest = False``.
    """

    flattened = self.flatten(terms, *args, lowest=False)

    return flattened
def flatten(self, terms, *args, lowest=True):
    """
    Returns a set of terms by removing either all redundant ancestors or
    descendants from the provided set terms. By removing the ancestors
    you get the lowest level set of terms, by removing the descendants
    the result will be the highest level non-redundant terms.

    :param terms:
        A single term or name or an iterable of these; further ones
        can be passed as positional arguments.
    :param bool lowest:
        If ``True`` the ancestors are removed, keeping the lowest level
        terms; if ``False`` the descendants are removed, keeping the
        highest level terms.

    :return: Set of terms; empty set if no terms provided.
    """

    terms = self.set_of_terms(terms, *args)
    method = self.get_all_ancestors if lowest else self.get_all_descendants

    # accumulating into an empty set works also if `terms` is empty
    redundant = set()

    for term in terms:
        redundant |= method(term, include_seed=False)

    return terms - redundant
def set_of_terms(self, terms_names, *args):
    """
    Converts anything to a set of terms. ``terms_names`` can be either
    a single term or name or an iterable of terms and names.

    Delegates to ``set_of`` with its default target.
    """

    as_terms = self.set_of(terms_names, *args)

    return as_terms
def set_of_names(self, terms_names, *args):
    """
    Converts anything to a set of names. ``terms_names`` can be either
    a single term or name or an iterable of terms and names.

    Delegates to ``set_of`` with ``to_terms = False``.
    """

    as_names = self.set_of(terms_names, *args, to_terms=False)

    return as_names
def set_of(self, terms_names, *args, to_terms=True):
    """
    Converts anything to a set of terms or names. ``terms_names`` can be
    either a single term or name or an iterable of terms and names.
    Further terms or names can be passed as positional arguments.

    The input is never modified: a new set is built even if
    ``terms_names`` is already a set.

    :param bool to_terms:
        The target identifier type is `term`; if ``False`` the target
        will be `name`.

    :return:
        Set of the values returned by ``get_term`` or ``get_name``
        for each item.
    """

    if isinstance(terms_names, str):
        items = {terms_names}
    else:
        # always a copy, the set of the caller must stay intact
        items = set(terms_names)

    items.update(args)

    method = self.get_term if to_terms else self.get_name

    return {method(item) for item in items}
def __init__(
        self,
        organism=9606,
        ontology=None,
        pickle_file=None,
        use_pickle_cache=True,
    ):
    """
    For one organism loads Gene Ontology annotations, in addition it
    accepts or creates a ``GeneOntology`` object.

    If the pickle cache load hook reports success, nothing else is
    loaded. Otherwise the annotations are loaded and processed and
    finally the pickle cache save hook is called.

    :param int organism: Identifier of the organism.
    :param ontology: A ``GeneOntology`` object; created if not provided.
    :param str pickle_file: Path of the pickle file.
    :param bool use_pickle_cache: Whether to attempt loading from pickle.
    """

    session_mod.Logger.__init__(self, name='go')

    self.organism = organism
    self._pickle_file = pickle_file
    self._use_pickle_cache = use_pickle_cache

    loaded_from_cache = self._pickle_cache_load_hook()

    if loaded_from_cache:
        return

    self.ontology = ontology or GeneOntology()

    self._log(
        'Populating Gene Ontology: annotations for organism `%u`.' % organism
    )

    annot = go_input.go_annotations_goa(organism=organism)

    # one attribute for each aspect: `c`, `f` and `p`
    for aspect in ('C', 'F', 'P'):
        setattr(self, aspect.lower(), annot[aspect])

    self._ancestors_annotate()
    self._merge_annotations()
    self._pickle_cache_save_hook()
def reload(self):
    """
    Reloads the object from the module level.

    Imports and reloads the module where the class of this instance is
    defined, then switches the instance to the new class object.
    """

    current_class = self.__class__
    modname = current_class.__module__
    root_package = modname.split('.')[0]

    mod = __import__(modname, fromlist=[root_package])
    imp.reload(mod)

    self.__class__ = getattr(mod, current_class.__name__)
def get_name(self, term):
    """
    For a GO accession number returns the name of the term.
    The lookup is done by the ``ontology`` object.
    """

    ontology = self.ontology

    return ontology.get_name(term)
def get_term(self, name):
    """
    For a GO term name returns its GO accession number.
    The lookup is done by the ``ontology`` object.
    """

    ontology = self.ontology

    return ontology.get_term(name)
def get_annot(self, uniprot, aspect):
    """
    For a UniProt ID returns its direct annotations from one aspect
    of Gene Ontology.
    Returns set.

    :param str uniprot: A UniProt ID.
    :param str aspect:
        Label of the aspect; its lowercase form is the name of the
        attribute holding the annotations.
    """

    annot = getattr(self, aspect.lower())

    if uniprot in annot:
        return annot[uniprot]

    return set()
def get_annots(self, uniprot):
    """
    For a UniProt ID returns its direct annotations from all aspects
    of Gene Ontology.
    Returns set.
    """

    annot = self.all

    if uniprot in annot:
        return annot[uniprot]

    return set()
def get_annot_ancestors(self, uniprot, aspect):
    """
    For a UniProt ID returns its annotations including lowest level
    terms and their ancestors from one aspect of Gene Ontology.
    Returns set.

    :param str uniprot: A UniProt ID.
    :param str aspect:
        Label of the aspect; the annotations are read from the
        attribute named by its lowercase form and the ``_full`` suffix.
    """

    attr = '%s_full' % aspect.lower()
    annot = getattr(self, attr)

    if uniprot in annot:
        return annot[uniprot]

    return set()
def get_annots_ancestors(self, uniprot):
    """
    For a UniProt ID returns its annotations including lowest level
    terms and their ancestors from all aspects of Gene Ontology.
    Returns set.
    """

    annot = self.all_full

    if uniprot in annot:
        return annot[uniprot]

    return set()
def has_term(self, uniprot, term):
    """
    Tells if an UniProt ID is annotated with a GO term.
    The annotations in ``all_full`` are considered.
    """

    annot = self.all_full

    if uniprot not in annot:
        return False

    return term in annot[uniprot]
def has_any_term(self, uniprot, terms):
    """
    Tells if an UniProt ID is annotated with any of a set of GO terms.
    The annotations in ``all_full`` are considered.

    :param str uniprot: A UniProt ID.
    :param set terms:
        GO terms; any iterable is accepted, a single string is
        considered to be one term.

    :return: bool.
    """

    if uniprot not in self.all_full:
        return False

    if isinstance(terms, str):
        # avoid iterating the characters of a single accession
        terms = {terms}

    return not self.all_full[uniprot].isdisjoint(terms)
def all_uniprots(self):
    """
    Returns all UniProt IDs having annotations, i.e. the union of the
    keys of the annotation dicts of each aspect in ``aspects``.
    """

    keys_by_aspect = [
        set(getattr(self, asp.lower()).keys())
        for asp in self.aspects
    ]

    return set.union(*keys_by_aspect)
def i_select_by_term(self, term, uniprots=None):
    """
    Accepts a list of UniProt IDs and one or more gene ontology terms
    and returns a set of indices of those UniProts which are annotated
    with any of the terms.
    If no UniProts given all annotated UniProts considered, in
    sorted order.

    :param str,set term:
        A single GO term or set of terms.
    """

    uniprots = uniprots or sorted(self.all_uniprots())

    # a set is tested by `has_any_term`, anything else by `has_term`
    if isinstance(term, set):
        method = self.has_any_term
    else:
        method = self.has_term

    selected = set()

    for i, uniprot in enumerate(uniprots):
        if method(uniprot, term):
            selected.add(i)

    return selected
def select_by_name(self, name, uniprots=None, return_uniprots=False):
    """
    Accepts a list of UniProt IDs and one or more gene ontology names
    and returns the UniProts which are annotated with any of the names.
    If no UniProts given all annotated UniProts returned.

    The names are translated to terms by the ontology and the selection
    is done by ``select``.

    :param str,set name:
        A single GO term name or set of names.
    :param bool return_uniprots:
        By default returns list of indices; if ``True`` returns a set of
        the selected UniProt IDs.
    """

    if isinstance(name, str):
        term = self.ontology.get_term(name)
    else:
        term = set()

        for term_name_pair in self.ontology.names_to_terms(name):
            term.add(term_name_pair[0])

    return self.select(
        term,
        uniprots=uniprots,
        return_uniprots=return_uniprots,
    )
def select_by_term(self, term, uniprots=None):
    """
    Accepts a list of UniProt IDs and one or more gene ontology terms
    and returns the UniProts which are annotated with any of the terms.
    If no UniProts given all annotated UniProts returned.

    :param str,set term:
        A single GO term or set of terms.

    :return: Set of the selected elements of ``uniprots``.
    """

    uniprots = uniprots or sorted(self.all_uniprots())

    indices = list(self.i_select_by_term(term, uniprots))
    selected = np.array(uniprots)[indices]

    return set(selected)
def expr_names_to_terms(self, expr):
    """
    Processes an expression built by names to expressions of terms.

    Tokens starting with `GO:` are kept as they are, operators and
    parentheses are converted to lowercase, anything else is translated
    by ``get_term``.

    :arg str expr:
        An expression using Gene Ontology names, parentheses and logical
        operators.

    :return: List of tokens.
    """

    operators = {'(', ')', 'AND', 'OR', 'NOT'}
    tokens_terms = []

    for token in _reexprname.findall(expr):
        token = token.strip()

        if not token:
            continue

        if token[:3] == 'GO:':
            converted = token
        elif token in operators:
            converted = token.lower()
        else:
            converted = self.get_term(token)

        tokens_terms.append(converted)

    return tokens_terms
def select_by_expr(
        self,
        expr,
        uniprots=None,
        return_uniprots=False,
    ):
    """
    Selects UniProts based on an expression of Gene Ontology terms.
    Operator precedence not considered, please use parentheses.
    Return indices of the selected elements in the ``uniprots`` list
    or the set of selected UniProt IDs.

    If any part of the expression can not be translated to a GO
    accession, a message with the original expression is sent to the
    log and an empty set is returned.

    :param str expr:
        An expression of Gene Ontology terms and names. E.g.
        ``'(GO:0005576 and not GO:0070062) or GO:0005887'``. Parentheses
        and operators ``and``, ``or`` and ``not`` can be used.
        Another example:
        ``hormone binding AND (cell surface OR GO:0009897)``.
    :param bool return_uniprots:
        By default returns list of indices; if ``True`` returns a set of
        the selected UniProt IDs.
    """

    expr_terms = self.expr_names_to_terms(expr)

    # the failed translations are `None` among the tokens, they never
    # appear in the result of the selection
    if any(token is None for token in expr_terms):
        self._log(
            'Could not process Gene Ontology expression: `%s`. '
            'Please check if the expression consists only of '
            'GO terms and/or ACs, the operators `AND`, `OR` and '
            '`NOT`, and braces. Whitespaces and newlines are OK. '
            'If you think the expression is correct please open '
            'an issue for `pypath`.' % (expr,),
            -9,
        )

        return set()

    return self.select_by_expr_terms(
        expr=expr_terms,
        uniprots=uniprots,
        return_uniprots=return_uniprots,
    )
def select_by_expr_terms(
        self,
        expr,
        uniprots=None,
        return_uniprots=False,
    ):
    """
    Selects UniProts based on an expression of Gene Ontology terms.
    Operator precedence not considered, please use parentheses.
    Return indices of the selected elements in the ``uniprots`` list
    or the set of selected UniProt IDs.

    Parentheses can be nested, and ``not`` negates the term or the
    parenthesized sub-expression right after it. Unbalanced closing
    parentheses and unknown tokens are ignored. If any of the tokens
    is ``None`` a message is sent to the log and an empty set returned.

    :param str,list expr:
        An expression of Gene Ontology terms. E.g.
        ``'(GO:0005576 and not GO:0070062) or GO:0005887'``. Parentheses
        and operators ``and``, ``or`` and ``not`` can be used.
        Alternatively a list of tokens.
    :param list uniprots:
        List of UniProt IDs; if not provided, all annotated UniProts
        considered, in sorted order.
    :param bool return_uniprots:
        By default returns list of indices; if ``True`` returns a set of
        the selected UniProt IDs. UniProt IDs are returned also if no
        ``uniprots`` provided.
    """

    ops = {
        'and': 'intersection',
        'or': 'union',
    }

    # if no UniProts provided does not make sense to return indices
    return_uniprots = return_uniprots or uniprots is None
    uniprots = uniprots or sorted(self.all_uniprots())

    if isinstance(expr, str):
        # tokenizing expression if it is a string
        # (method is recursive)
        expr = _reexprterm.findall(expr)

    if any(e is None for e in expr):
        self._log(
            'Could not process Gene Ontology expression: `%s`. '
            'Please check if the expression consists only of '
            'GO terms and/or ACs, the operators `AND`, `OR` and '
            '`NOT`, and braces. Whitespaces and newlines are OK. '
            'If you think the expression is correct please open '
            'an issue for `pypath`.' % (expr,),
            -9,
        )

        return set()

    # initial values
    result = set()
    stack = []
    depth = 0
    negate = False
    op = None

    for it in expr:
        # processing expression by tokens

        if depth:
            # we are in a sub-selection part,
            # keeping track of the nested parentheses
            if it == '(':
                depth += 1
            elif it == ')':
                depth -= 1

            if depth:
                # the sub-selection is not closed yet
                stack.append(it)
                continue

            # the outermost parenthesis closed: execute sub-selection
            this_set = self.select_by_expr_terms(
                expr=stack,
                uniprots=uniprots,
            )
            stack = []

        elif it.lower() == 'not':
            # negation applies to the next set;
            # a double negation cancels out
            negate = not negate
            continue

        elif it == '(':
            # start a new sub-selection
            depth = 1
            continue

        elif it[:3] == 'GO:':
            # get the selection by the single term method
            this_set = self.i_select_by_term(it, uniprots=uniprots)

        elif it.lower() in ops:
            # set the operator for use at the next operation
            op = ops[it.lower()]
            continue

        else:
            # unbalanced closing parenthesis or unknown token
            continue

        # we found a set, either from a term or from a sub-selection
        if negate:
            # take the inverse of the current set
            this_set = set(range(len(uniprots))) - this_set
            negate = False

        if op is not None:
            result = getattr(result, op)(this_set)
        else:
            # this normally happens only at the first set
            result = this_set

        op = None

    return self._uniprot_return(result, uniprots, return_uniprots)
def select(self, terms, uniprots=None, return_uniprots=False):
    """
    Retrieves the UniProt IDs annotated with any Gene Ontology terms or
    their descendants, or evaluates string expression
    (see ``select_by_expr``).
    Returns indices of the selected elements in the ``uniprots`` list
    or the set of selected UniProt IDs.

    A string is considered a single term only if it is exactly one
    GO accession (`GO:` followed by 7 digits); any other string,
    including the ones starting with an accession, is evaluated as an
    expression.

    :param str,set terms:
        A single GO term, a set of GO terms or an expression with
        GO terms. Other iterables of terms are converted to set.
    :param list uniprots:
        List of UniProt IDs; if not provided, all annotated UniProts
        considered, in sorted order.
    :param bool return_uniprots:
        By default returns list of indices; if ``True`` returns a set of
        the selected UniProt IDs. UniProt IDs are returned also if no
        ``uniprots`` provided.
    """

    import re

    return_uniprots = return_uniprots or uniprots is None
    uniprots = uniprots or sorted(self.all_uniprots())

    if isinstance(terms, str):
        single_term = re.fullmatch(r'\s*(GO:[0-9]{7})\s*', terms)

        if single_term:
            result = self.i_select_by_term(
                single_term.group(1),
                uniprots=uniprots,
            )
        else:
            # this is not an individual term but an expression
            result = self.select_by_expr(terms, uniprots=uniprots)

    else:
        # a collection of terms; lists and tuples are not hashable,
        # hence they could not be tested against the annotations
        if not isinstance(terms, set):
            terms = set(terms)

        result = self.i_select_by_term(terms, uniprots=uniprots)

    return self._uniprot_return(result, uniprots, return_uniprots)
def select_by_all(self, terms, uniprots=None, return_uniprots=False):
    """
    Selects the nodes annotated by all GO terms in ``terms``.

    Returns indices of the selected elements in the ``uniprots`` list
    or the set of selected UniProt IDs.

    :param list terms:
        List, set or tuple of GO terms. A single string is considered
        to be one term. If empty, nothing is selected.
    :param list uniprots:
        List of UniProt IDs; if not provided, all annotated UniProts
        considered, in sorted order.
    :param bool return_uniprots:
        By default returns list of indices; if ``True`` returns a set of
        the selected UniProt IDs. UniProt IDs are returned also if no
        ``uniprots`` provided.
    """

    return_uniprots = return_uniprots or uniprots is None
    uniprots = uniprots or sorted(self.all_uniprots())

    if isinstance(terms, str):
        terms = (terms,)

    # the selections must be indices within the same `uniprots` list
    # in order to be valid input for `_uniprot_return`
    selections = [
        self.i_select_by_term(term, uniprots=uniprots)
        for term in terms
    ]

    idx = set.intersection(*selections) if selections else set()

    return self._uniprot_return(idx, uniprots, return_uniprots)
def_uniprot_return(self,idx,uniprots,return_uniprots):ifreturn_uniprots:returnset(np.array(uniprots)[list(idx)])returnidxdef_pickle_cache_load_hook(self):ifnotself._use_pickle_cache:returnself._set_pickle_path()self.load_from_pickle()returnos.path.exists(self._pickle_file)def_pickle_cache_save_hook(self):self._set_pickle_path()self.save_to_pickle()def_set_pickle_path(self):self._pickle_file=(self._pickle_fileoros.path.join(cache.get_cachedir(),settings.get('go_pickle_cache_fname')%self.organism,))defsave_to_pickle(self,pickle_file=None):pickle_file=pickle_fileorself._pickle_fileifnotisinstance(pickle_file,str):self._log('Pickle file path must be a string: `%s`.'%str(pickle_file))returnself._log('Saving to pickle `%s`.'%pickle_file)withopen(pickle_file,'wb')asfp:pickle.dump(obj=(self.c_full,self.p_full,self.f_full,self.all_full,self.c,self.p,self.f,self.all,self.ontology._terms,self.ontology.ancestors,self.ontology.descendants,self.ontology.term,self.ontology.name,),file=fp,)self._log('Saved to pickle `%s`.'%pickle_file)defload_from_pickle(self,pickle_file=None):pickle_file=pickle_fileorself._pickle_fileifnotos.path.exists(pickle_file):self._log('Pickle file does not exist: `%s`.'%str(pickle_file))returnself._log('Loading from pickle `%s`.'%pickle_file)withopen(pickle_file,'rb')asfp:(self.c_full,self.p_full,self.f_full,self.all_full,self.c,self.p,self.f,self.all,ontology_terms,ontology_ancestors,ontology_descendants,ontology_term,ontology_name,)=pickle.load(fp)self.ontology=GeneOntology(terms=ontology_terms,ancestors=ontology_ancestors,descendants=ontology_descendants,term=ontology_term,name=ontology_name,)self._log('Loaded from pickle `%s`.'%pickle_file)
def __init__(
        self,
        categories,
        go_annot=None,
        ncbi_tax_id=9606,
    ):
    """
    Provides annotations by a custom set of GO terms or expressions
    built from multiple terms.

    :arg dict categories:
        A dict with custom category labels as keys and single GO terms
        or names or complex expressions as values.
        Alternatively a set of GO terms, in this case the term names
        will be used as labels.
    :arg pypath.go.GOAnnotation go_annot:
        A :class:``pypath.go.GOAnnotation`` object. If not provided,
        the module level database of the organism is used.
    :arg int ncbi_tax_id:
        The organism; it selects the annotation database if ``go_annot``
        is not provided.
    """

    session_mod.Logger.__init__(self, name='go')

    # the organism is forwarded, so non-default organisms are honoured
    self.go_annot = go_annot or get_db(organism=ncbi_tax_id)

    self._categories = categories
    self.process_categories()
def reload(self):
    """
    Reloads the object from the module level.

    The module of the class of this instance is imported and reloaded,
    then the new class of the same name becomes the class of the
    instance.
    """

    this_class = self.__class__
    modname = this_class.__module__
    package = modname.split('.')[0]

    mod = __import__(modname, fromlist=[package])
    imp.reload(mod)

    self.__class__ = getattr(mod, this_class.__name__)
def process_categories(self):
    """
    Translates GO term names listed in categories to GO terms ACs.

    The categories in ``_categories`` are converted to a dict of labels
    and terms or expressions, and assigned to both the ``_categories``
    and ``categories`` attributes:

    * A dict with only aspects as keys is merged across aspects: the
      sets are united and the dicts are merged into one dict.
    * A set (either provided or resulted from the previous step) is
      converted to a dict with names as keys and accessions as values.
    * Anything else, also an empty dict, is left unchanged.
    """

    categories = self._categories

    # if the categories are grouped by aspects
    # (an empty dict has nothing to merge)
    if (
        isinstance(categories, dict) and
        categories and
        not set(categories.keys()) - set(self.go_annot.aspects)
    ):
        by_aspect = list(categories.values())

        # the type of the first value decides how to merge
        if isinstance(by_aspect[0], set):
            categories = set().union(*by_aspect)

        elif isinstance(by_aspect[0], dict):
            categories = {
                label: value
                for one_aspect in by_aspect
                for label, value in one_aspect.items()
            }

    # if a set provided we use names as keys
    # and accessions as values
    if isinstance(categories, set):
        categories = {
            self.go_annot.get_name(cat): self.go_annot.get_term(cat)
            for cat in categories
        }

    self._categories = categories
    self.categories = categories
def get_annotation(self, category, uniprots=None):
    """
    For a category name returns a set of UniProt IDs annotated with the
    corresponding Gene Ontology terms or expression.

    :arg str category:
        The category name, should be a key in the ``categories`` dict.
    :arg set uniprots:
        A set or list of UniProt IDs. If ``None``, annotations based on
        all UniProts in GO annotation will be returned.
    """

    definition = self.categories[category]

    return self.go_annot.select(
        definition,
        uniprots=uniprots,
        return_uniprots=True,
    )
def get_annotations(self, uniprots=None):
    """
    Returns a dict with set of UniProt IDs for each category.

    :arg set uniprots:
        A set or list of UniProt IDs. If ``None``, annotations based on
        all UniProts in GO annotation will be returned.
    """

    annotations = {}

    for category in self.categories.keys():
        annotations[category] = self.get_annotation(
            category,
            uniprots=uniprots,
        )

    return annotations
def annotate(graph, organism=9606, aspects=('C', 'F', 'P')):
    """
    Adds Gene Ontology annotations to the nodes of a graph.

    A new vertex attribute `go` is created, for each vertex a dict with
    the aspects `C`, `F` and `P` as keys and sets as values. The sets
    of the requested aspects are taken from the annotations wherever
    the `name` of the vertex is annotated.

    :param igraph.Graph graph:
        Any ``igraph.Graph`` object with uniprot IDs
        in its ``name`` vertex attribute.
    :param int organism:
        Identifier of the organism.
    :param tuple aspects:
        The aspects to load; a list, a tuple or a single aspect.
    """

    if not isinstance(aspects, (list, tuple)):
        aspects = (aspects,)

    graph.vs['go'] = [
        {'C': set(), 'F': set(), 'P': set()}
        for _ in range(graph.vcount())
    ]

    # one dict of annotations by aspect, the same way as it is used
    # at loading the annotations database in this module
    annot = go_input.go_annotations_goa(organism=organism)

    prg = progress.Progress(graph.vcount(), 'Loading GO annotations', 9)

    for v in graph.vs:
        prg.step()

        for asp in aspects:
            if v['name'] in annot[asp]:
                v['go'][asp] = annot[asp][v['name']]

    prg.terminate()
# old name as synonymload_go=annotate
def init_db(organism=9606, pickle_file=None, use_pickle_cache=True):
    """
    Initializes or reloads the GO annotation database.
    The database will be assigned to the ``db`` attribute of this module,
    a dict with organisms as keys.

    :param int organism: Identifier of the organism.
    :param str pickle_file: Passed to ``GOAnnotation``.
    :param bool use_pickle_cache: Passed to ``GOAnnotation``.
    """

    module_globals = globals()

    # create the module level registry at first use
    if 'db' not in module_globals:
        module_globals['db'] = {}

    annotation = GOAnnotation(
        organism,
        pickle_file=pickle_file,
        use_pickle_cache=use_pickle_cache,
    )

    globals()['db'][organism] = annotation
def get_db(organism=9606, pickle_file=None, use_pickle_cache=True):
    """
    Retrieves the current database instance and initializes it if does
    not exist yet.

    :param int organism: Identifier of the organism.
    :param str pickle_file: Passed to ``init_db`` if called.
    :param bool use_pickle_cache: Passed to ``init_db`` if called.
    """

    # TODO: consider organism
    # TODO: delete the DB if not used in order to free memory
    # TODO: introduce pickle cache to make it load quicker

    available = 'db' in globals() and organism in globals()['db']

    if not available:
        init_db(
            organism,
            pickle_file=pickle_file,
            use_pickle_cache=use_pickle_cache,
        )

    return globals()['db'][organism]