#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#from__future__importannotationsfrompast.builtinsimportxrange,rangefromfuture.utilsimportiteritemsimportosimportreimportjsonimportcollectionsfromlxmlimportetreeimportpypath.resources.urlsasurlsimportpypath.share.curlascurlimportpypath.share.progressasprogressimportpypath.share.sessionassessionimportpypath.share.commonascommonimportpypath.utils.taxonomyastaxonomy_logger=session.Logger(name='uniprot_input')_log=_logger._logHEADER_ACCEPT_TSV={'Accept':'text/tsv'}HEADER_ACCEPT_JSON={'Accept':'application/json'}
def go_annotations_uniprot(organism=9606, swissprot='yes'):
    """
    Deprecated, should be removed soon.

    Retrieves a two column table (``id`` and ``go-id``) from the URL
    registered as ``uniprot_basic`` and converts it to a dict.

    :param int organism:
        NCBI Taxonomy ID; it is converted by ``int`` before being
        inserted into the query.
    :param str swissprot:
        Value for the ``reviewed`` query field. If ``None``, no
        ``reviewed`` clause is added to the query.

    :return:
        Dict with the first column of each row as keys and lists of the
        semicolon separated, whitespace stripped items of the second
        column as values. Rows with less than two tab separated fields
        are ignored.
    """

    if swissprot is None:
        reviewed_clause = ''
    else:
        reviewed_clause = ' AND reviewed:%s' % swissprot

    request_params = {
        'query': 'organism:%u%s' % (int(organism), reviewed_clause),
        'format': 'tab',
        'columns': 'id,go-id',
    }

    download = curl.Curl(
        urls.urls['uniprot_basic']['url'],
        get=request_params,
        silent=False,
    )

    annotations = {}

    for row in download.result.split('\n'):

        fields = row.split('\t')

        if len(fields) > 1:

            annotations[fields[0]] = [
                go_id.strip()
                for go_id in fields[1].split(';')
            ]

    return annotations
def go_annotations_goa(
        organism='human',
        evidence_codes=False,
    ):
    """
    Downloads GO annotation from UniProt GOA.

    Lines which are empty or start with ``!`` are skipped. From the
    remaining tab separated lines column 9 is used as the aspect,
    column 2 as the annotated entity and column 5 as the GO term.

    Args:
        organism: Organism name or NCBI Taxonomy ID; it is passed to
            ``taxonomy.ensure_common_name``.
        evidence_codes: Include evidence codes in the output. If true,
            the values are tuples of columns 5 and 7 instead of only
            column 5.

    Returns:
        Dict of dicts of sets: aspects (`C`, `P`, `F`) at the top level,
        annotated entities at the second level, sets of annotations as
        values.
    """

    common_name = taxonomy.ensure_common_name(organism)

    by_aspect = {
        aspect: collections.defaultdict(set)
        for aspect in ('C', 'P', 'F')
    }

    download = curl.Curl(
        urls.urls['goa']['ebi_url'] % (
            common_name.upper(),
            common_name.lower(),
        ),
        silent=False,
        large=True,
    )

    for raw_line in download.result:

        # empty lines and `!` prefixed lines carry no annotation
        if not raw_line or raw_line[0] == '!':

            continue

        fields = raw_line.strip().split('\t')
        entities = by_aspect[fields[8]][fields[1]]

        if evidence_codes:

            entities.add((fields[4], fields[6]))

        else:

            entities.add(fields[4])

    # plain dicts are returned instead of defaultdicts
    return {
        aspect: dict(entities)
        for aspect, entities in by_aspect.items()
    }
# synonym for the default methodgo_annotations=go_annotations_goa
def go_ancestors_goose(aspects=('C', 'F', 'P')):
    """
    Queries the ancestors of GO terms by AmiGO goose.

    The SQL query is read from the file ``goose_ancestors.sql`` in the
    ``common.DATA`` directory. If not all three aspects are requested,
    a ``WHERE`` clause restricting the term type is inserted into it.

    Returns dict of sets where keys are the values of the first column
    of the result (GO accessions) and values are sets of the values
    from the second column (their ancestors).

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    whitespace = re.compile(r'[\s\n]+')

    aspect_names = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }

    if set(aspects) == {'C', 'F', 'P'}:

        # all aspects requested: no filtering is necessary
        where_clause = ''

    else:

        conditions = [
            'term.term_type = "%s"' % aspect_names[code]
            for code in aspects
        ]
        where_clause = 'WHERE (%s)' % ' OR '.join(conditions)

    sql_file_path = os.path.join(common.DATA, 'goose_ancestors.sql')

    with open(sql_file_path, 'r') as sql_file:

        sql_template = sql_file.read()

    # the query goes into the URL, hence whitespace is collapsed
    sql = whitespace.sub(r' ', sql_template % where_clause).strip()

    download = curl.Curl(
        urls.urls['goose']['url'] % sql,
        silent=False,
        large=True,
    )

    ancestors = collections.defaultdict(set)

    for row in download.result:

        fields = row.strip().split('\t')
        ancestors[fields[0]].add(fields[1])

    return ancestors
def go_ancestors_quickgo(aspects=('C', 'F', 'P')):
    """
    Queries the ancestors of GO terms by QuickGO REST API.

    The descendants are retrieved by ``go_descendants_quickgo`` and the
    relationships are swapped by ``go_descendants_to_ancestors``.

    Returns the result of ``go_descendants_to_ancestors``: a dict with
    aspects as keys and dicts as values; in these dicts keys are GO
    accessions and values are sets of tuples of an ancestor and the
    relation type.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    return go_descendants_to_ancestors(
        go_descendants_quickgo(aspects=aspects)
    )
# synonym for the default methodgo_ancestors=go_ancestors_quickgo
def go_descendants_to_ancestors(desc):
    """
    Turns a dict of descendants to dict of ancestors by swapping the
    relationships. This way descendants will be the keys and their
    ancestors will be the values.

    :param dict desc:
        Dict of dicts: aspects at the top level, terms at the second
        level, and iterables of ``(descendant, relation)`` tuples as
        values.

    :return:
        Dict of dicts with the same top level keys; at the second level
        descendant terms are the keys and the values are sets of
        ``(ancestor, relation)`` tuples.
    """

    inverted = {}

    for aspect, children_by_term in desc.items():

        parents = collections.defaultdict(set)

        for parent, children in children_by_term.items():

            for child, relation in children:

                parents[child].add((parent, relation))

        # a plain dict is stored, not the defaultdict
        inverted[aspect] = dict(parents)

    return inverted
def go_descendants_goose(aspects=('C', 'F', 'P')):
    """
    Queries descendants of GO terms by AmiGO goose.

    IMPORTANT:
    This is not the preferred method any more to get descendants.
    Recently the preferred method to access GO annotations is
    ``pypath.inputs.go.go_descendants_quickgo``.
    The data in GO MySQL instances has not been updated since Dec 2016.
    Unfortunately the providers ceased to support MySQL, the most
    flexible and highest performance access to GO data. The replacement
    is Solr which is far from providing the same features as MySQL, for
    example it is unable to provide GO graph relationships. Other
    service is QuickGO which is up to date and has nice ways to query
    the ontology.

    The ancestors are retrieved by ``go_ancestors_goose`` and the
    mapping is inverted here.

    Returns dict of sets where keys are GO accessions and values are
    sets of their descendants.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    descendants = collections.defaultdict(set)
    ancestors_by_term = go_ancestors_goose(aspects=aspects)

    for term, term_ancestors in ancestors_by_term.items():

        for ancestor in term_ancestors:

            descendants[ancestor].add(term)

    return descendants
def go_descendants_quickgo(
        aspects=('C', 'F', 'P'),
        terms=None,
        relations=None,
        quickgo_download_size=500,
    ):
    """
    Queries descendants of GO terms by QuickGO REST API.

    Returns a dict with the requested aspects as keys. The values are
    dicts where keys are GO accessions and values are sets of tuples of
    a descendant accession and the relation type.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    :param dict terms:
        Dict of dicts with aspects as top level keys and GO accessions
        as second level keys, like the result from
        ``go_terms_quickgo``. If ``None`` or empty, ``go_terms_quickgo``
        will be called.
    :param tuple relations:
        Relation types to be queried. If ``None`` or empty, `is_a`,
        `part_of`, `occurs_in` and `regulates` are used.
    :param int quickgo_download_size:
        Number of terms in one request. If the response can not be
        decoded as JSON, the size is halved and the download continues
        from the failed chunk. Once the size would drop below 10 the
        download is abandoned and the data collected so far is returned.
    """

    def download_in_chunks(terms, chunk_size, target=None):
        """
        Downloads the descendants of ``terms`` in chunks of
        ``chunk_size`` and collects them into ``target``.
        """

        if target is None:

            target = collections.defaultdict(set)

        paginator = common.paginate(terms, chunk_size)

        for p, terms_part in enumerate(paginator):

            # NOTE(review): the whitespace around `=` in the query
            # string is kept as found -- confirm that this is intended.
            url = urls.urls['quickgo_rest']['desc'] % (
                ','.join(terms_part),
                '?relations = %s' % relations_part,
            )

            c = curl.Curl(
                url,
                req_headers=HEADER_ACCEPT_JSON,
                silent=True,
                large=True,
            )

            try:

                result = json.load(c.fileobj)

            except json.decoder.JSONDecodeError:

                # retrying from the first term of the failed chunk
                # with half as many terms in each request
                done = chunk_size * p
                remaining = terms[done:]
                new_chunk_size = chunk_size // 2

                if new_chunk_size < 10:

                    _log(
                        'Failed to download QuickGO, tried to decrease the '
                        'number of terms in each query, went below 10 terms '
                        'per query but still getting erroneous JSON. '
                        'This might be due to very slow network connection. '
                        'You might increase the timeout of CURL. '
                        'But then it will take forever.'
                    )

                    return target

                # the results collected so far are passed on, so the
                # successfully downloaded chunks are not lost
                return download_in_chunks(
                    terms=remaining,
                    chunk_size=new_chunk_size,
                    target=target,
                )

            for res in result['results']:

                if 'children' not in res:

                    continue

                target[res['id']].update(
                    (child['id'], child['relation'])
                    for child in res['children']
                )

        return target

    desc = {}

    terms = terms or go_terms_quickgo(aspects=aspects)
    relations = relations or ('is_a', 'part_of', 'occurs_in', 'regulates')

    # used by `download_in_chunks` above
    relations_part = ','.join(relations)

    for asp in aspects:

        desc[asp] = download_in_chunks(
            terms=list(terms[asp].keys()),
            chunk_size=quickgo_download_size,
        )

    return desc
# synonym for the default methodgo_descendants=go_descendants_quickgo
def go_terms_solr(aspects=('C', 'F', 'P')):
    """
    Queries GO terms by AmiGO Solr.

    The response is parsed as XML by ``lxml.etree.iterparse``; from each
    ``doc`` element the ``source``, ``annotation_class`` and
    ``annotation_class_label`` fields are read.

    Returns dict of dicts where upper level keys are one letter codes of
    the aspects `C`, `F` and `P` for cellular_component,
    molecular_function and biological_process, respectively. Lower
    level keys are GO accessions and values are names of the terms.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    before_separator = re.compile(r'[\s\n\r]+([&\?])')
    whitespace = re.compile(r'[\s\n\r]+')

    aspect_names = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    aspect_codes = {name: code for code, name in aspect_names.items()}

    terms = {code: {} for code in aspects}

    raw_query = (
        ' ?q = document_category:"ontology_class" AND idspace:GO AND '
        'is_obsolete:0 &rows = 9999999 &start = 0 '
        '&fl = annotation_class,annotation_class_label,source '
    )

    # whitespace preceding `&` and `?` is removed, all other
    # whitespace runs become one space
    query = before_separator.sub(r'\1', raw_query.strip())
    query = whitespace.sub(' ', query)

    # downloading data
    download = curl.Curl(
        urls.urls['golr']['url'] % query,
        silent=False,
        large=True,
    )

    # parsing XML by lxml.etree.iterparse
    events = etree.iterparse(download.fileobj, events=('start', 'end'))
    # the first event is discarded
    next(events)

    seen = []

    for event, element in events:

        if event == 'end' and element.tag == 'doc':

            code = aspect_codes[
                element.find('.//str[@name="source"]').text
            ]

            if code not in aspects:

                continue

            accession = element.find(
                './/str[@name="annotation_class"]'
            ).text
            label = element.find(
                './/str[@name="annotation_class_label"]'
            ).text

            terms[code][accession] = label

        seen.append(element)

        # removing used elements to keep memory low:
        # the oldest 500 are cleared once more than 1000 have piled up
        if len(seen) > 1000:

            for old in seen[:500]:

                old.clear()

            del seen[:500]

    # closing the XML
    download.fileobj.close()
    del download

    return terms
def go_terms_quickgo(aspects=('C', 'F', 'P')):
    """
    Queries GO terms by the QuickGO REST API.

    The pages are downloaded one by one; the number of pages is read
    from the ``pageInfo`` of each response. Records without aspect,
    obsolete terms and terms from aspects not requested are skipped.

    Return dict of dicts where upper level keys are one letter codes of
    the aspects `C`, `F` and `P` for cellular_component,
    molecular_function and biological_process, respectively. Lower
    level keys are GO accessions and values are names of the terms.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    aspect_names = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    aspect_codes = {name: code for code, name in aspect_names.items()}

    terms = {code: {} for code in aspects}
    url_template = urls.urls['quickgo_rest']['terms']

    # placeholder until the first response tells the real page count
    n_pages = 9999999
    page = 1

    progress_bar = progress.Progress(
        name='Downloading data from QuickGO',
        interval=1,
    )

    while page <= n_pages:

        download = curl.Curl(url_template % page, silent=True)
        content = json.loads(download.result)
        n_pages = content['pageInfo']['total']

        for record in content['results']:

            if 'aspect' not in record:

                continue

            code = aspect_codes[record['aspect']]

            if record['isObsolete'] or code not in aspects:

                continue

            terms[code][record['id']] = record['name']

        if progress_bar.total is None:

            progress_bar.set_total(n_pages)

        progress_bar.step()

        page += 1

    return terms
# synonym for the default methodgo_terms=go_terms_quickgo
def go_terms_goose(aspects=('C', 'F', 'P')):
    """
    Queries GO terms by AmiGO goose.

    The SQL query is read from the file ``goose_terms.sql`` in the
    ``common.DATA`` directory. If not all three aspects are requested,
    a ``WHERE`` clause restricting the term type is inserted into it.
    Rows whose second column is not the name of one of the three
    aspects are skipped.

    Return dict of dicts where upper level keys are one letter codes of
    the aspects `C`, `F` and `P` for cellular_component,
    molecular_function and biological_process, respectively. Lower
    level keys are GO accessions and values are names of the terms.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    whitespace = re.compile(r'[\s\n]+')

    aspect_names = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    aspect_codes = {name: code for code, name in aspect_names.items()}

    if set(aspects) == {'C', 'F', 'P'}:

        # all aspects requested: no filtering is necessary
        where_clause = ''

    else:

        conditions = [
            'term.term_type = "%s"' % aspect_names[code]
            for code in aspects
        ]
        where_clause = 'WHERE (%s)' % ' OR '.join(conditions)

    sql_file_path = os.path.join(common.DATA, 'goose_terms.sql')

    with open(sql_file_path, 'r') as sql_file:

        sql_template = sql_file.read()

    # the query goes into the URL, hence whitespace is collapsed
    sql = whitespace.sub(r' ', sql_template % where_clause).strip()

    download = curl.Curl(
        urls.urls['goose']['url'] % sql,
        silent=False,
        large=True,
    )

    terms = {'P': {}, 'C': {}, 'F': {}}

    for row in download.result:

        fields = row.strip().split('\t')
        code = aspect_codes.get(fields[1])

        if code is None:

            continue

        # third column is the key, first column is the value
        terms[code][fields[2]] = fields[0]

    return terms
def go_annotations_quickgo(
        organism=9606,
        aspects=('C', 'F', 'P'),
        relations=('is_a', 'part_of'),
    ):
    """
    Queries GO annotations by QuickGO REST API.

    IMPORTANT:
    Recently the preferred method to access GO annotations is
    ``pypath.inputs.go.go_annotations_goa``.
    Contrary to its name QuickGO is super slow, otherwise it should
    yield up to date data, identical to the GOA file.

    The result is downloaded page by page in TSV format. The first line
    of each page is considered a header and skipped. The download stops
    at the first page which can not be retrieved or contains no data
    rows.

    Returns annotations in dict of dicts of sets. The top level keys
    are aspects by their one letter codes. In the annotation dicts keys
    are the values from the 2nd column of the table and values are sets
    of the values from the 5th column.

    :param int organism:
        NCBI Taxonomy ID of one organism. Default is human (9606).
    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    :param tuple relations:
        Relation types: sent to the server as the usage relationships
        parameter, and rows with other value in their 4th column are
        discarded.
    """

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }

    annot = {a: collections.defaultdict(set) for a in aspects}

    url = urls.urls['quickgo_rest']['annot']

    aspects_part = ','.join(ontologies[a] for a in aspects)
    relations_part = ','.join(relations)

    page = 1

    # NOTE(review): the loop relies on pages beyond the last one being
    # empty or failing to download -- confirm against the QuickGO API.
    while True:

        this_url = url % (
            aspects_part, # aspect
            relations_part, # goUsageRelationships
            organism, # taxonId
            page,
        )

        c = curl.Curl(
            url=this_url,
            req_headers=HEADER_ACCEPT_TSV,
            silent=False,
            large=True,
        )

        lines = c.result

        # nothing retrieved, or not even a header row: we are done
        if lines is None or next(lines, None) is None:

            break

        n_rows = 0

        for line in lines:

            line = line.strip().split('\t')

            # blank or truncated lines carry no annotation
            if len(line) < 6:

                continue

            n_rows += 1

            if line[3] not in relations:

                continue

            # NOTE(review): presumably the 6th column is a one letter
            # aspect code matching the keys of `annot` -- verify.
            annot[line[5]][line[1]].add(line[4])

        # a page without data rows means we are past the last page
        if not n_rows:

            break

        page += 1

    return annot
def go_annotations_solr(
        organism=9606,
        aspects=('C', 'F', 'P'),
        references=False,
    ):
    """
    Queries GO annotations by AmiGO Solr.

    Before other methods have been provided to access GO.
    Now this is the preferred method to get annotations.
    Returns terms in dict of dicts and annotations in dict of dicts of
    sets. In both dicts the keys are aspects by their one letter codes.
    In the term dicts keys are GO accessions and values are their names
    (``None`` if the response contains no label for the term).
    In the annotation dicts keys are UniProt IDs and values are sets
    of GO accessions. Only records with ``UniProtKB:`` prefixed
    identifiers are used, the prefix is removed.

    :param int organism:
        NCBI Taxonomy ID of one organism. Default is human (9606).
    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    :param bool references:
        Retrieve the references (PubMed IDs) for the annotations.
        Currently not implemented: the field is requested from the
        server but not processed.
    """

    reamp = re.compile(r'[\s\n\r]+([&\?])')
    relin = re.compile(r'[\s\n\r]+')

    terms = {a: {} for a in aspects}
    annot = {a: collections.defaultdict(set) for a in aspects}

    # assembling the query
    if len(aspects) < 3:

        aspects_part = ' AND (%s)' % (
            ' OR '.join('aspect:%s' % a for a in aspects)
        )

    else:

        aspects_part = ''

    refs_part = ',reference' if references else ''

    # `annotation_class_label` is requested in order to be able to
    # provide the names of the terms.
    # NOTE(review): the whitespace around `=` in the query string is
    # kept as found -- confirm that this is intended.
    query = (
        ' ?q = taxon:"NCBITaxon:%u" AND type:protein AND '
        'document_category:annotation AND source:UniProtKB%s '
        '&rows = 9999999 &start = 0 '
        '&fl = bioentity,annotation_class,annotation_class_label,aspect%s '
    ) % (int(organism), aspects_part, refs_part)

    # whitespace preceding `&` and `?` is removed, all other
    # whitespace runs become one space
    query = relin.sub(' ', reamp.sub(r'\1', query.strip()))

    # downloading data
    url = urls.urls['golr']['url'] % query
    c = curl.Curl(url, silent=False, large=True)

    try:

        # parsing XML by lxml.etree.iterparse
        parser = etree.iterparse(c.fileobj, events=('start', 'end'))
        # the first event is discarded
        next(parser)

        used_elements = []

        for ev, elem in parser:

            if ev == 'end' and elem.tag == 'doc':

                id_ = elem.find('.//str[@name="bioentity"]').text

                if not id_.startswith('UniProtKB:'):

                    continue

                asp = elem.find('.//str[@name="aspect"]').text

                if asp not in aspects:

                    continue

                term = elem.find('.//str[@name="annotation_class"]').text
                name = elem.findtext(
                    './/str[@name="annotation_class_label"]'
                )
                id_ = id_[10:] # removing the `UniProtKB:` prefix

                # recording the term and adding it
                # to the annotation dict
                terms[asp][term] = name
                annot[asp][id_].add(term)

            used_elements.append(elem)

            # removing used elements to keep memory low:
            # the oldest 500 are cleared once more than 1000 piled up
            if len(used_elements) > 1000:

                for e in used_elements[:500]:

                    e.clear()

                del used_elements[:500]

    finally:

        # closing the XML, also if the parsing failed
        c.fileobj.close()

    return terms, annot
def go_annotations_goose(organism=9606, aspects=('C', 'F', 'P'), uniprots=None):
    """
    Queries GO annotations by AmiGO goose.

    IMPORTANT:
    This is not the preferred method any more to get terms and
    annotations. Recently the preferred method to access GO annotations
    is ``pypath.inputs.go.go_annotations_solr``.
    The data in GO MySQL instances has not been updated since Dec 2016.
    Unfortunately the providers ceased to support MySQL, the most
    flexible and highest performance access to GO data. The replacement
    is Solr which is far from providing the same features as MySQL.

    The SQL query is read from the file ``goose_annotations.sql`` in the
    ``common.DATA`` directory. Rows with less than six columns or with
    an unknown term type in the second column are skipped, the same way
    as ``go_terms_goose`` skips unknown term types.

    Returns terms in dict of dicts and annotations in dict of dicts of
    sets. In both dicts the keys are aspects by their one letter codes.
    In the term dicts keys are GO accessions and values are their
    names. In the annotation dicts keys are UniProt IDs and values are
    sets of GO accessions.

    :param int organism:
        NCBI Taxonomy ID of one organism. Default is human (9606).
    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    :param list uniprots:
        Optionally a list of UniProt IDs. If `None`, results for all
        proteins returned.
    """

    aspects_part = ''
    uniprot_part = ''
    respaces = re.compile(r'[\s\n]+')

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    ontol_short = {name: code for code, name in ontologies.items()}

    if set(aspects) != {'C', 'F', 'P'}:

        aspects_part = '(%s) AND' % (
            ' OR '.join(
                'term.term_type="%s"' % ontologies[asp]
                for asp in aspects
            )
        )

    if uniprots is not None:

        uniprot_part = 'dbxref.xref_key IN (%s) AND' % (
            ','.join('"%s"' % uniprot for uniprot in uniprots)
        )

    sql_path = os.path.join(common.DATA, 'goose_annotations.sql')

    with open(sql_path, 'r') as fp:

        query = fp.read()

    query = query % (organism, aspects_part, uniprot_part)
    # the query goes into the URL, hence whitespace is collapsed
    query = respaces.sub(r' ', query).strip()

    url = urls.urls['goose']['url'] % query
    c = curl.Curl(url, silent=False, large=True)

    terms = {'P': {}, 'C': {}, 'F': {}}
    annot = {
        'C': collections.defaultdict(set),
        'F': collections.defaultdict(set),
        'P': collections.defaultdict(set),
    }

    for l in c.result:

        l = l.strip().split('\t')

        # blank lines, truncated rows and rows of unknown term type
        # are ignored instead of crashing the whole processing
        if len(l) < 6 or l[1] not in ontol_short:

            continue

        aspect = ontol_short[l[1]]

        terms[aspect][l[2]] = l[0]
        annot[aspect][l[5]].add(l[2])

    return terms, annot
def get_go_desc(go_ids, organism=9606):
    """
    Deprecated, should be removed soon.

    Downloads a TSV table from the URL registered as ``quickgo_desc``
    and returns the set of the values from its second column. The
    first line is considered a header and skipped.

    :param go_ids:
        If list, tuple or set, its elements are sorted and joined by
        comma; otherwise it is inserted into the URL as it is.
    :param int organism:
        Inserted into the URL before the identifiers.
    """

    if type(go_ids) in (list, tuple, set):

        go_ids = ','.join(sorted(go_ids))

    download = curl.Curl(
        urls.urls['quickgo_desc']['url'] % (organism, go_ids),
        silent=False,
        large=True,
        req_headers=HEADER_ACCEPT_TSV,
    )

    # discarding the header row
    download.result.readline()

    return {
        row.split('\t')[1]
        for row in download.result
    }
def get_go_quick(
        organism=9606,
        slim=False,
        names_only=False,
        aspects=('C', 'F', 'P'),
    ):
    """
    Deprecated, should be removed soon.

    Loads GO terms and annotations from QuickGO.
    Returns a dict of 2 dicts under the keys `names` and `terms`.
    `terms` has the aspects `C`, `F` and `P` as keys, and below them
    dicts with sets of the values from the 5th column of the table by
    the values from its 2nd column; it remains empty if ``names_only``
    is true. `names` is always returned empty by this function.

    :param int organism:
        Inserted into the URL.
    :param bool slim:
        Append the ``goUsage`` slim parameter to the URL.
    :param bool names_only:
        Do not collect the annotations.
    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    aspect_names = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }

    terms = {
        code: collections.defaultdict(set)
        for code in ('C', 'F', 'P')
    }
    names = {}

    requested = sorted(aspect_names[code] for code in aspects)
    usage_param = '&goUsage = slim' if slim else ''

    download = curl.Curl(
        urls.urls['quickgo']['url'] % (
            organism,
            ','.join(requested),
            usage_param,
        ),
        silent=False,
        large=True,
        req_headers=HEADER_ACCEPT_TSV,
        keep_failed=True,
    )

    # discarding the header row
    next(download.result)

    for row in download.result:

        fields = row.split('\t')

        if names_only:

            continue

        terms[fields[5]][fields[1]].add(fields[4])

    return {'terms': terms, 'names': names}