Source code for pypath.inputs.go

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#  This file is part of the `pypath` python module
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#  Website:

from __future__ import annotations

from past.builtins import xrange, range
from future.utils import iteritems

import os
import re
import json
import collections

from lxml import etree

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.progress as progress
import pypath.share.session as session
import pypath.share.common as common
import pypath.utils.taxonomy as taxonomy

# Module-level logger; `_log` is the shortcut used by the functions below
# to emit log messages.
# NOTE(review): the logger is named 'uniprot_input' although this module
# handles Gene Ontology inputs -- looks like a copy-paste leftover; confirm
# before renaming, as the name presumably shows up in the log output.
_logger = session.Logger(name = 'uniprot_input')
_log = _logger._log

# HTTP request headers handed to `curl.Curl(req_headers = ...)` to ask the
# remote service for a TSV or a JSON response, respectively.
HEADER_ACCEPT_TSV = {'Accept': 'text/tsv'}
HEADER_ACCEPT_JSON = {'Accept': 'application/json'}

def go_annotations_uniprot(organism = 9606, swissprot = 'yes'):
    """
    Deprecated, should be removed soon.

    Queries the UniProt endpoint configured under ``uniprot_basic`` for the
    ``id`` and ``go-id`` columns of one organism.

    :param int organism:
        NCBI Taxonomy ID; converted to ``int`` before building the query.
    :param str swissprot:
        Value of the ``reviewed`` query field. If ``None`` the query is not
        restricted by review status.

    :return:
        Dict with the first column of each row as key and the list of
        stripped, semicolon separated items of the second column as value.
        Rows with a single column are ignored.
    """

    if swissprot is None:
        reviewed = ''
    else:
        reviewed = ' AND reviewed:%s' % swissprot

    params = {
        'query': 'organism:%u%s' % (int(organism), reviewed),
        'format': 'tab',
        'columns': 'id,go-id',
    }

    c = curl.Curl(
        urls.urls['uniprot_basic']['url'],
        get = params,
        silent = False,
    )

    result = {}

    for row in c.result.split('\n'):

        fields = row.split('\t')

        if len(fields) > 1:

            result[fields[0]] = [go.strip() for go in fields[1].split(';')]

    return result
def go_annotations_goa(
        organism = 'human',
        evidence_codes = False,
    ):
    """
    Downloads GO annotation from UniProt GOA.

    Args:
        organism: Organism name or NCBI Taxonomy ID.
        evidence_codes: Include evidence codes in the output.

    Returns:
        Dict of dicts of sets. Top level keys are the aspect codes `C`,
        `P` and `F`; second level keys are the values of the second column
        of the annotation file (the database object ID); the sets contain
        the GO IDs (fifth column) or, if ``evidence_codes`` is set,
        tuples of GO ID and evidence code (seventh column).
    """

    organism = taxonomy.ensure_common_name(organism)

    annot = dict(
        (asp, collections.defaultdict(set))
        for asp in ('C', 'P', 'F')
    )

    url = urls.urls['goa']['ebi_url'] % (organism.upper(), organism.lower())
    c = curl.Curl(url, silent = False, large = True)

    for line in c.result:

        line = line.strip()

        # blank (whitespace only) lines and `!` comment lines carry no data
        if not line or line[0] == '!':

            continue

        line = line.split('\t')

        # the aspect is the 9th column: shorter rows can not be processed
        if len(line) < 9:

            continue

        if evidence_codes:

            annot[line[8]][line[1]].add((line[4], line[6]))

        else:

            annot[line[8]][line[1]].add(line[4])

    # plain dicts are returned, so missing keys raise KeyError for the caller
    return {asp: dict(by_id) for asp, by_id in annot.items()}
# synonym for the default method of retrieving GO annotations
go_annotations = go_annotations_goa
def go_annotations_all(
        organism: int | str = 'human',
        fields: str | list[str] | None = None
    ) -> dict[str, set[tuple[str]]]:
    """
    Downloads a GOA annotation file and returns its records, optionally
    restricted to a subset of the columns.

    Args:
        organism: Organism name or NCBI Taxonomy ID; anything except ``*``
            is passed through ``taxonomy.ensure_common_name``. With ``*``
            (or if the conversion gives ``None``) the `uniprot_gcrp` file
            is requested instead of an organism specific one.
        fields: Name(s) of the columns to keep in the records; by default
            all columns listed in ``all_fields`` below.

    Returns:
        Dict with the `db_object_id` column as keys and sets of
        `GoAnnotation` named tuples as values.
    """

    if organism != '*':

        organism = taxonomy.ensure_common_name(organism)

    # column names, in the order they appear in the tab separated file
    all_fields = (
        'db',
        'db_object_id',
        'db_object_symbol',
        'qualifier',
        'go_id',
        'reference',
        'evidence_code',
        'with_or_from',
        'aspect',
        'db_object_name',
        'db_object_synonym',
        'db_object_type',
        'taxon_and_interacting_taxon',
        'date',
        'assigned_By',
        'annotation_extension',
        'gene_product_form_id'
    )

    fields = common.to_list(fields or all_fields)

    if organism in ('*', None):

        url_params = ('UNIPROT', 'uniprot_gcrp')

    else:

        url_params = (organism.upper(), organism.lower())

    url = urls.urls['goa']['ebi_url'] % url_params
    c = curl.Curl(url, silent = False, large = True)

    record = collections.namedtuple('GoAnnotation', fields)
    result = collections.defaultdict(set)

    for raw in c.result:

        if not raw.strip() or raw[0] == '!':

            continue

        row = dict(zip(all_fields, raw.strip().split('\t')))

        # columns missing from the row (or unknown names) become None
        result[row['db_object_id']].add(
            record(*(row.get(f, None) for f in fields))
        )

    return dict(result)
def go_ancestors_goose(aspects = ('C','F','P')):
    """
    Queries the ancestors of GO terms by AmiGO goose.

    Returns dict of sets where keys are GO accessions and values are sets
    of their ancestors (a ``collections.defaultdict`` of sets).

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    aspects_part = ''
    respaces = re.compile(r'[\s\n]+')

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }

    # a WHERE clause is only necessary if not all aspects are requested
    if set(aspects) != {'C', 'F', 'P'}:

        aspects_part = 'WHERE (%s)' % (
            ' OR '.join(
                'term.term_type = "%s"' % ontologies[asp]
                for asp in aspects
            )
        )

    sql_path = os.path.join(common.DATA, 'goose_ancestors.sql')

    # the SQL template is shipped with the module data and has one
    # placeholder for the aspects condition
    with open(sql_path, 'r') as fp:

        query = fp.read()

    query = query % aspects_part
    query = respaces.sub(r' ', query).strip()

    url = urls.urls['goose']['url'] % query

    c = curl.Curl(url, silent = False, large = True)

    ancestors = collections.defaultdict(set)

    for l in c.result:

        l = l.strip().split('\t')

        # blank or truncated rows have no term-ancestor pair
        if len(l) < 2:

            continue

        ancestors[l[0]].add(l[1])

    return ancestors
def go_ancestors_quickgo(aspects = ('C', 'F', 'P')):
    """
    Queries the ancestors of GO terms by QuickGO REST API.

    The descendants are retrieved by ``go_descendants_quickgo`` and the
    relationships are inverted by ``go_descendants_to_ancestors``, hence
    the result has the shape returned by the latter: a dict for each
    aspect where keys are GO accessions and values are sets of tuples of
    ancestor accession and relation.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    return go_descendants_to_ancestors(
        go_descendants_quickgo(aspects = aspects)
    )
# synonym for the default method of retrieving GO ancestors
go_ancestors = go_ancestors_quickgo
def go_descendants_to_ancestors(desc):
    """
    Inverts a descendants mapping into an ancestors mapping.

    :param dict desc:
        Dict of dicts of sets: aspects at the top level, ancestor terms at
        the second level, and sets of `(descendant, relation)` tuples as
        values.

    :return:
        Dict of dicts of sets with the same aspects at the top level,
        the descendant terms as second level keys and sets of
        `(ancestor, relation)` tuples as values.
    """

    ancestors = {}

    for asp, by_ancestor in desc.items():

        inverted = collections.defaultdict(set)

        for anc_term, children in by_ancestor.items():

            for des_term, rel in children:

                inverted[des_term].add((anc_term, rel))

        # plain dict in the result, just like the top level
        ancestors[asp] = dict(inverted)

    return ancestors
def go_descendants_goose(aspects = ('C','F','P')):
    """
    Queries descendants of GO terms by AmiGO goose.

    IMPORTANT:
    This is not the preferred method any more to get descendants.
    The preferred method is ``pypath.inputs.go.go_descendants_quickgo``.
    The data in GO MySQL instances has not been updated since Dec 2016.
    Unfortunately the providers ceased to support MySQL, the most flexible
    and highest performance access to GO data. The replacement is Solr
    which is far from providing the same features as MySQL, for example
    it is unable to provide GO graph relationships. Other service is
    QuickGO which is up to date and has nice ways to query the ontology.

    The result is built by inverting the output of
    ``go_ancestors_goose``: a dict of sets (``collections.defaultdict``)
    where keys are GO accessions and values are sets of their descendants.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    desc = collections.defaultdict(set)

    for term, term_ancestors in go_ancestors_goose(aspects = aspects).items():

        for ancestor in term_ancestors:

            desc[ancestor].add(term)

    return desc
def go_descendants_quickgo(
        aspects = ('C', 'F', 'P'),
        terms = None,
        relations = None,
        quickgo_download_size = 500,
    ):
    """
    Queries descendants of GO terms by QuickGO REST API.

    Returns a dict with the aspects as keys; for each aspect the value is
    a dict of sets where keys are GO accessions and the sets contain
    tuples of child accession and relation.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    :param dict terms:
        Dict of dicts with aspects as top level keys and GO accessions as
        second level keys, as returned by ``go_terms_quickgo``. If ``None``
        that function will be called.
    :param tuple relations:
        Relation types to query; by default `is_a`, `part_of`,
        `occurs_in` and `regulates`.
    :param int quickgo_download_size:
        Number of terms in one query. Upon JSON decoding failure the
        remaining terms are retried with half of this size.
    """

    def download_in_chunks(terms, chunk_size, target = None):
        """
        Downloads the children of ``terms``, ``chunk_size`` terms in one
        request, collecting them into ``target``.
        """

        if target is None:

            target = collections.defaultdict(set)

        paginator = common.paginate(terms, chunk_size)

        for p, terms_part in enumerate(paginator):

            # NOTE(review): the spaces around `=` end up in the URL query
            # string; looks unintended, confirm against the QuickGO API
            # and the URL template before changing
            url = urls.urls['quickgo_rest']['desc'] % (
                ','.join(terms_part),
                '?relations = %s' % relations_part,
            )

            c = curl.Curl(
                url,
                req_headers = HEADER_ACCEPT_JSON,
                silent = True,
                large = True,
            )

            try:

                result = json.load(c.fileobj)

            except json.decoder.JSONDecodeError:

                # retry everything from the failed chunk on with smaller
                # chunks, keeping the results collected so far
                done = chunk_size * p
                remaining = terms[done:]
                new_chunk_size = chunk_size // 2

                if new_chunk_size < 10:

                    _log(
                        'Failed to download QuickGO, tried to decrease the '
                        'number of terms in each query, went below 10 terms '
                        'per query but still getting erroneous JSON. '
                        'This might be due to very slow network connection. '
                        'You might increase the timeout of CURL. '
                        'But then it will take forever.'
                    )

                    return target

                return download_in_chunks(
                    terms = remaining,
                    chunk_size = new_chunk_size,
                    target = target,
                )

            for res in result['results']:

                if 'children' not in res:

                    continue

                target[res['id']].update(
                    (child['id'], child['relation'])
                    for child in res['children']
                )

        return target


    desc = {}

    terms = terms or go_terms_quickgo(aspects = aspects)
    relations = relations or ('is_a', 'part_of', 'occurs_in', 'regulates',)

    # read by `download_in_chunks` above as a closure variable
    relations_part = ','.join(relations)

    for asp in aspects:

        desc[asp] = download_in_chunks(
            terms = list(terms[asp].keys()),
            chunk_size = quickgo_download_size,
        )

    return desc
# synonym for the default method of retrieving GO descendants
go_descendants = go_descendants_quickgo
def go_terms_solr(aspects = ('C', 'F', 'P')):
    """
    Queries GO terms by AmiGO Solr.

    Returns dict of dicts where upper level keys are one letter codes of the
    aspects `C`, `F` and `P` for cellular_component, molecular_function and
    biological_process, respectively. Lower level keys are GO accessions
    and values are names of the terms.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    # whitespace in front of `&` and `?` is dropped, any other whitespace
    # sequence is collapsed into one space
    reamp = re.compile(r'[\s\n\r]+([&\?])')
    relin = re.compile(r'[\s\n\r]+')

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    ontol_short = dict(reversed(i) for i in ontologies.items())

    terms = dict((a, {}) for a in aspects)

    query = '''
        ?q = document_category:"ontology_class" AND
            idspace:GO AND
            is_obsolete:0
        &rows = 9999999
        &start = 0
        &fl = annotation_class,annotation_class_label,source
    '''
    query = reamp.sub(r'\1', query.strip())
    query = relin.sub(' ', query)

    # downloading data
    c = curl.Curl(
        urls.urls['golr']['url'] % query,
        silent = False,
        large = True,
    )

    # parsing XML by lxml.etree.iterparse
    parser = etree.iterparse(c.fileobj, events = ('start', 'end'))
    # the first event is consumed here and not processed further
    first_event = next(parser)
    seen = []

    for event, element in parser:

        if event == 'end' and element.tag == 'doc':

            aspect = ontol_short[element.find('.//str[@name="source"]').text]

            if aspect not in aspects:

                continue

            accession = element.find('.//str[@name="annotation_class"]').text
            label = element.find('.//str[@name="annotation_class_label"]').text

            terms[aspect][accession] = label

        seen.append(element)

        # removing used elements to keep memory low:
        # once above 1000, the 500 oldest are cleared and forgotten
        if len(seen) > 1000:

            for stale in seen[:500]:

                stale.clear()

            del seen[:500]

    # closing the XML
    c.fileobj.close()
    del c

    return terms
def go_terms_quickgo(aspects = ('C','F','P')):
    """
    Queries GO terms by the QuickGO REST API.

    Return dict of dicts where upper level keys are one letter codes of the
    aspects `C`, `F` and `P` for cellular_component, molecular_function and
    biological_process, respectively. Lower level keys are GO accessions
    and values are names of the terms. Obsolete terms and records without
    aspect are omitted.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    ontol_short = dict(reversed(i) for i in ontologies.items())

    result = dict((a, {}) for a in aspects)
    url = urls.urls['quickgo_rest']['terms']

    # placeholder until the first response tells the number of pages
    last_page = 9999999
    this_page = 1
    # the total of the progress bar is set once, after the first page
    total_known = False

    prg = progress.Progress(
        name = 'Downloading data from QuickGO',
        interval = 1,
    )

    while this_page <= last_page:

        page_url = url % this_page

        c = curl.Curl(page_url, silent = True)

        this_result = json.loads(c.result)
        last_page = this_result['pageInfo']['total']

        for res in this_result['results']:

            if 'aspect' not in res:

                continue

            # aspects with unknown names map to None and are skipped below
            asp = ontol_short.get(res['aspect'])

            if res['isObsolete'] or asp not in aspects:

                continue

            result[asp][res['id']] = res['name']

        if not total_known:

            prg.set_total(last_page)
            total_known = True

        prg.step()

        this_page += 1

    return result
# synonym for the default method of retrieving GO terms
go_terms = go_terms_quickgo
def go_terms_goose(aspects = ('C','F','P')):
    """
    Queries GO terms by AmiGO goose.

    Return dict of dicts where upper level keys are one letter codes of the
    aspects `C`, `F` and `P` for cellular_component, molecular_function and
    biological_process, respectively. Lower level keys are GO accessions
    and values are names of the terms. All three aspect keys are always
    present in the result.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    aspects_part = ''
    respaces = re.compile(r'[\s\n]+')

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    ontol_short = dict(reversed(i) for i in ontologies.items())

    # a WHERE clause is only necessary if not all aspects are requested
    if set(aspects) != {'C', 'F', 'P'}:

        aspects_part = 'WHERE (%s)' % (
            ' OR '.join(
                'term.term_type = "%s"' % ontologies[asp]
                for asp in aspects
            )
        )

    sql_path = os.path.join(common.DATA, 'goose_terms.sql')

    # the SQL template is shipped with the module data and has one
    # placeholder for the aspects condition
    with open(sql_path, 'r') as fp:

        query = fp.read()

    query = query % aspects_part
    query = respaces.sub(r' ', query).strip()

    url = urls.urls['goose']['url'] % query

    c = curl.Curl(url, silent = False, large = True)

    terms = {'P': {}, 'C': {}, 'F': {}}

    for l in c.result:

        l = l.strip().split('\t')

        # rows: name, aspect, accession; skip truncated rows and rows
        # with an aspect name we do not know
        if len(l) < 3 or l[1] not in ontol_short:

            continue

        aspect = ontol_short[l[1]]

        terms[aspect][l[2]] = l[0]

    return terms
def go_annotations_quickgo(
        organism = 9606,
        aspects = ('C','F','P'),
        relations = ('is_a', 'part_of'),
    ):
    """
    Queries GO annotations by QuickGO REST API.

    IMPORTANT:
    Recently the preferred method to access GO annotations is
    ``pypath.inputs.go.go_annotations_goa``.
    Contrary to its name QuickGO is super slow, otherwise it should yield
    up to date data, identical to the GOA file.

    Returns the annotations in a dict of dicts of sets. The top level keys
    are aspects by their one letter codes, the second level keys are the
    values of the second column of the TSV response and the sets contain
    the values of its fifth column.

    Pages are requested one after the other until a page is empty, i.e.
    it has no header or no data rows.

    :param int organism:
        NCBI Taxonomy ID of one organism. Default is human (9606).
    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    :param tuple relations:
        Passed to the query as usage relationships; also, only rows with
        one of these values in their fourth column are kept.
    """

    annot = dict((a, collections.defaultdict(set)) for a in aspects)

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }

    url = urls.urls['quickgo_rest']['annot']

    aspects_part = ','.join(ontologies[a] for a in aspects)
    relations_part = ','.join(relations)

    page = 1

    while True:

        this_url = url % (
            aspects_part, # aspect
            relations_part, # goUsageRelationships
            organism, # taxonId
            page,
        )

        c = curl.Curl(
            url = this_url,
            req_headers = HEADER_ACCEPT_TSV,
            silent = False,
            large = True
        )

        # nothing to iterate for this page (e.g. the download failed)
        if c.result is None:

            break

        lines = iter(c.result)

        # the header row; if even this is missing, we are past the last page
        if next(lines, None) is None:

            break

        n_rows = 0

        for line in lines:

            line = line.strip().split('\t')

            # blank or truncated rows can not be processed
            if len(line) < 6:

                continue

            n_rows += 1

            if line[3] not in relations:

                continue

            annot[line[5]][line[1]].add(line[4])

        # a page with header only means no more data
        if not n_rows:

            break

        page += 1

    return annot
def go_annotations_solr(
        organism = 9606,
        aspects = ('C', 'F', 'P'),
        references = False,
    ):
    """
    Queries GO annotations by AmiGO Solr.

    Returns terms in dict of dicts and annotations in dict of dicts of sets.
    In both dicts the keys are aspects by their one letter codes. In the
    term dicts keys are GO accessions and values are their names (``None``
    if the response carries no label for the term). In the annotation dicts
    keys are UniProt IDs and values are sets of GO accessions.

    :param int organism:
        NCBI Taxonomy ID of one organism. Default is human (9606).
    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    :param bool references:
        Retrieve the references (PubMed IDs) for the annotations.
        Currently not implemented: the field is requested but not
        processed.
    """

    # whitespace in front of `&` and `?` is dropped, any other whitespace
    # sequence is collapsed into one space
    reamp = re.compile(r'[\s\n\r]+([&\?])')
    relin = re.compile(r'[\s\n\r]+')

    terms = dict((a, {}) for a in aspects)
    annot = dict((a, collections.defaultdict(set)) for a in aspects)

    # assembling the query
    if len(aspects) < 3:

        aspects_part = ' AND (%s)' % (
            ' OR '.join('aspect:%s' % a for a in aspects)
        )

    else:

        aspects_part = ''

    refs_part = ',reference' if references else ''

    # `annotation_class_label` is requested in order to fill the term
    # names in the first element of the returned tuple
    query = '''
        ?q = taxon:"NCBITaxon:%u" AND
            type:protein AND
            document_category:annotation AND
            source:UniProtKB%s
        &rows = 9999999
        &start = 0
        &fl = bioentity,annotation_class,annotation_class_label,aspect%s
    ''' % (
        organism,
        aspects_part,
        refs_part
    )
    query = relin.sub(' ', reamp.sub(r'\1', query.strip()))

    # downloading data
    url = urls.urls['golr']['url'] % query
    c = curl.Curl(url, silent = False, large = True)

    # parsing XML by lxml.etree.iterparse
    parser = etree.iterparse(c.fileobj, events = ('start', 'end'))
    # the first event is consumed here and not processed further
    root = next(parser)
    used_elements = []

    for ev, elem in parser:

        if ev == 'end' and elem.tag == 'doc':

            id_ = elem.find('.//str[@name="bioentity"]').text

            if not id_.startswith('UniProtKB:'):

                continue

            asp = elem.find('.//str[@name="aspect"]').text

            if asp not in aspects:

                continue

            term = elem.find('.//str[@name="annotation_class"]').text
            label = elem.find('.//str[@name="annotation_class_label"]')
            id_ = id_[10:] # removing the `UniProtKB:` prefix

            # adding the term to the term and annotation dicts
            terms[asp][term] = None if label is None else label.text
            annot[asp][id_].add(term)

        used_elements.append(elem)

        # removing used elements to keep memory low:
        # once above 1000, the 500 oldest are cleared and forgotten
        if len(used_elements) > 1000:

            for e in used_elements[:500]:

                e.clear()

            del used_elements[:500]

    # closing the XML
    c.fileobj.close()
    del c

    return terms, annot
def go_annotations_goose(organism = 9606, aspects = ('C', 'F', 'P'), uniprots = None):
    """
    Queries GO annotations by AmiGO goose.

    IMPORTANT:
    This is not the preferred method any more to get terms and annotations.
    Recently the preferred method to access GO annotations is
    ``pypath.inputs.go.go_annotations_solr``.
    The data in GO MySQL instances has not been updated since Dec 2016.
    Unfortunately the providers ceased to support MySQL, the most flexible
    and highest performance access to GO data. The replacement is Solr
    which is far from providing the same features as MySQL.

    Returns terms in dict of dicts and annotations in dict of dicts of sets.
    In both dicts the keys are aspects by their one letter codes. In the
    term dicts keys are GO accessions and values are their names.
    In the annotation dicts keys are UniProt IDs and values are sets
    of GO accessions.

    :param int organism:
        NCBI Taxonomy ID of one organism. Default is human (9606).
    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    :param list uniprots:
        Optionally a list of UniProt IDs. If `None`, results for all proteins
        returned.
    """

    aspects_part = ''
    uniprot_part = ''
    respaces = re.compile(r'[\s\n]+')

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    ontol_short = dict(reversed(i) for i in ontologies.items())

    # a condition on aspects is only necessary if not all are requested
    if set(aspects) != {'C', 'F', 'P'}:

        aspects_part = '(%s) AND' % (
            ' OR '.join(
                'term.term_type="%s"' % ontologies[asp]
                for asp in aspects
            )
        )

    if uniprots is not None:

        # NOTE(review): the IDs are pasted into the SQL without escaping;
        # pass only trusted identifiers here
        uniprot_part = 'dbxref.xref_key IN (%s) AND' % (
            ','.join('"%s"' % uniprot for uniprot in uniprots)
        )

    sql_path = os.path.join(common.DATA, 'goose_annotations.sql')

    # the SQL template is shipped with the module data and has three
    # placeholders: organism, aspects and UniProt ID conditions
    with open(sql_path, 'r') as fp:

        query = fp.read()

    query = query % (organism, aspects_part, uniprot_part)
    query = respaces.sub(r' ', query).strip()

    url = urls.urls['goose']['url'] % query

    c = curl.Curl(url, silent = False, large = True)

    terms = {'P': {}, 'C': {}, 'F': {}}
    annot = {
        'C': collections.defaultdict(set),
        'F': collections.defaultdict(set),
        'P': collections.defaultdict(set),
    }

    for l in c.result:

        l = l.strip().split('\t')

        # the UniProt ID is the 6th column; skip truncated rows and rows
        # with an aspect name we do not know
        if len(l) < 6 or l[1] not in ontol_short:

            continue

        aspect = ontol_short[l[1]]

        terms[aspect][l[2]] = l[0]
        annot[aspect][l[5]].add(l[2])

    return terms, annot
def get_go_desc(go_ids, organism = 9606):
    """
    Deprecated, should be removed soon.

    Queries the endpoint configured under ``quickgo_desc`` and returns the
    set of the values in the second column of the TSV response, without
    the header row.

    :param go_ids:
        Either a ready, comma separated string of GO IDs or a list, tuple
        or set of them; the latter are sorted and joined by comma.
    :param int organism:
        Organism identifier inserted into the URL; by default 9606.
    """

    if isinstance(go_ids, (list, tuple, set, frozenset)):

        go_ids = ','.join(sorted(go_ids))

    url = urls.urls['quickgo_desc']['url'] % (organism, go_ids)

    c = curl.Curl(
        url,
        silent = False,
        large = True,
        req_headers = HEADER_ACCEPT_TSV,
    )

    # `iter` and `next` work both for file-like objects and generators
    lines = iter(c.result)
    # the header row
    _ = next(lines, None)

    rows = (l.split('\t') for l in lines)

    # rows without a second column (e.g. blank lines) are ignored
    return set(row[1] for row in rows if len(row) > 1)
def get_go_quick(
        organism = 9606,
        slim = False,
        names_only = False,
        aspects = ('C', 'F', 'P'),
    ):
    """
    Deprecated, should be removed soon.

    Loads GO annotations from QuickGO.

    Returns a dict with two items: under `terms` a dict of the three
    aspects, each a dict with sets of the fifth column values (GO IDs) by
    the second column values of the TSV response; under `names` a dict
    which is not populated by the current code. With ``names_only`` the
    rows are consumed but nothing is collected.
    """

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }

    terms = dict(
        (asp, collections.defaultdict(set))
        for asp in ('C', 'F', 'P')
    )
    names = {}

    slim_part = '&goUsage = slim' if slim else ''
    aspect_names = sorted(ontologies[a] for a in aspects)

    url = urls.urls['quickgo']['url'] % (
        organism,
        ','.join(aspect_names),
        slim_part,
    )

    c = curl.Curl(
        url,
        silent = False,
        large = True,
        req_headers = HEADER_ACCEPT_TSV,
        keep_failed = True,
    )

    # dropping the header row
    next(c.result)

    for row in c.result:

        fields = row.split('\t')

        if names_only:

            continue

        terms[fields[5]][fields[1]].add(fields[4])

    return {'terms': terms, 'names': names}
def get_goslim(url = None):
    """
    Collects the GO accessions from the `id:` lines of a GO slim file.

    :param str url:
        Location of the file; if not a string, the URL configured under
        ``goslim_gen`` is used.

    :return:
        List of the accessions (`GO:` followed by 7 digits) in the order
        they occur in the file.
    """

    rego = re.compile(r'GO:[0-9]{7}')

    if not isinstance(url, str):

        url = urls.urls['goslim_gen']['url']

    c = curl.Curl(url, silent = False)

    return [
        accession
        for line in c.result.split('\n')
        if line.startswith('id:')
        for accession in rego.findall(line)
    ]