Source code for pypath.inputs.go

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

from past.builtins import xrange, range
from future.utils import iteritems

import os
import re
import json
import collections

from lxml import etree

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.progress as progress
import pypath.share.session as session
import pypath.share.common as common
import pypath.utils.taxonomy as taxonomy

_logger = session.Logger(name = 'uniprot_input')
_log = _logger._log


HEADER_ACCEPT_TSV = {'Accept': 'text/tsv'}
HEADER_ACCEPT_JSON = {'Accept': 'application/json'}



[docs]
def go_annotations_uniprot(organism = 9606, swissprot = 'yes'):
    """
    Deprecated, should be removed soon.
    """

    rev = '' if swissprot is None \
        else ' AND reviewed:%s' % swissprot
    query = 'organism:%u%s' % (int(organism), rev)
    url = urls.urls['uniprot_basic']['url']
    get = {'query': query, 'format': 'tab', 'columns': 'id,go-id'}
    c = curl.Curl(url, get = get, silent = False)
    data = c.result

    return dict([(x[0], [go.strip() for go in x[1].split(';')])
                 for x in [x.split('\t') for x in data.split('\n')]
                 if len(x) > 1])




[docs]
def go_annotations_goa(
        organism = 'human',
        evidence_codes=False,
    ):
    """
    Downloads GO annotation from UniProt GOA.

    Args:
        organism:
            Organism name or NCBI Taxonomy ID.
        evidence_codes:
            Include evidence codes in the output.
    """

    organism = taxonomy.ensure_common_name(organism)

    annot = dict(
        (asp, collections.defaultdict(set))
        for asp in ('C', 'P', 'F')
    )

    url = urls.urls['goa']['ebi_url'] % (organism.upper(), organism.lower())
    c = curl.Curl(url, silent = False, large = True)

    for line in c.result:
        if not line or line[0] == '!':
            continue

        line = line.strip().split('\t')
        if evidence_codes:
            annot[line[8]][line[1]].add((line[4], line[6]))
        else:
            annot[line[8]][line[1]].add(line[4])

    return dict((k, dict(v)) for k, v in iteritems(annot))



# synonym for the default method
go_annotations = go_annotations_goa



[docs]
def go_annotations_all(
        organism: int | str = 'human',
        fields: str | list[str] | None = None
    ) -> dict[str, set[tuple[str]]]:

    if organism != '*':
        organism = taxonomy.ensure_common_name(organism)

    all_fields = (
        'db',
        'db_object_id',
        'db_object_symbol',
        'qualifier',
        'go_id',
        'reference',
        'evidence_code',
        'with_or_from',
        'aspect',
        'db_object_name',
        'db_object_synonym',
        'db_object_type',
        'taxon_and_interacting_taxon',
        'date',
        'assigned_By',
        'annotation_extension',
        'gene_product_form_id'
    )

    fields = fields or all_fields
    fields = common.to_list(fields)

    if organism in ('*', None):
        url = urls.urls['goa']['ebi_url'] % ('UNIPROT', 'uniprot_gcrp')
    else:
        url = urls.urls['goa']['ebi_url'] % (organism.upper(), organism.lower())

    c = curl.Curl(url, silent = False, large = True)

    result = collections.defaultdict(set)
    record = collections.namedtuple('GoAnnotation', fields)

    for line in c.result:

        if not line.strip() or line[0] == '!':
            continue

        line = dict(zip(all_fields, line.strip().split('\t')))
        result[line['db_object_id']].add(
            record(**dict(zip(fields, (line.get(f, None) for f in fields))))
        )

    return dict(result)




[docs]
def go_ancestors_goose(aspects = ('C','F','P')):
    """
    Queries the ancestors of GO terms by AmiGO goose.

    Returns dict of sets where keys are GO accessions and values are sets
    of their ancestors.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    aspects_part = ''
    respaces = re.compile(r'[\s\n]+')

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }

    if set(aspects) != {'C', 'F', 'P'}:

        aspects_part = 'WHERE (%s)' % (
            ' OR '.join(
                'term.term_type = "%s"' % ontologies[asp]
                for asp in aspects
            )
        )

    sql_path = os.path.join(common.DATA, 'goose_ancestors.sql')

    with open(sql_path, 'r') as fp:

        query = fp.read()

    query = query % aspects_part
    query = respaces.sub(r' ', query).strip()

    url = urls.urls['goose']['url'] % query

    c = curl.Curl(url, silent = False, large = True)

    ancestors = collections.defaultdict(set)

    for l in c.result:

        l = l.strip().split('\t')
        ancestors[l[0]].add(l[1])

    return ancestors




[docs]
def go_ancestors_quickgo(aspects = ('C', 'F', 'P')):
    """
    Queries the ancestors of GO terms by QuickGO REST API.

    Returns dict of sets where keys are GO accessions and values are sets
    of their ancestors.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    desc = go_descendants_quickgo(aspects = aspects)

    return go_descendants_to_ancestors(desc)



# synonym for the default method
go_ancestors = go_ancestors_quickgo



[docs]
def go_descendants_to_ancestors(desc):
    """
    Turns a dict of descendants to dict of ancestors by swapping the
    relationships. This way descendants will be the keys and their ancestors
    will be the values.
    """

    ancestors = {}

    for asp, dct in iteritems(desc):

        ancestors[asp] = collections.defaultdict(set)

        for anc_term, des in iteritems(dct):

            for des_term, rel in des:

                ancestors[asp][des_term].add((anc_term, rel))

        ancestors[asp] = dict(ancestors[asp])

    return ancestors




[docs]
def go_descendants_goose(aspects = ('C','F','P')):
    """
    Queries descendants of GO terms by AmiGO goose.

    IMPORTANT:
    This is not the preferred method any more to get descendants.
    Recently the preferred method to access GO annotations is
    ``pypath.inputs.go.go_descendants_quickgo``.
    The data in GO MySQL instances has not been updated since Dec 2016.
    Unfortunately the providers ceased to support MySQL, the most flexible
    and highest performance access to GO data. The replacement is Solr
    which is far from providing the same features as MySQL, for example
    it is unable to provide GO graph relationships. Other service is QuickGO
    which is up to date and has nice ways to query the ontology.

    Returns dict of sets where keys are GO accessions and values are sets
    of their descendants.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    desc = collections.defaultdict(set)

    anc = go_ancestors_goose(aspects = aspects)

    for term, ancs in iteritems(anc):
        for terma in ancs:
            desc[terma].add(term)

    return desc




[docs]
def go_descendants_quickgo(
        aspects = ('C', 'F', 'P'),
        terms = None,
        relations = None,
        quickgo_download_size = 500,
    ):
    """
    Queries descendants of GO terms by QuickGO REST API.

    Returns dict of sets where keys are GO accessions and values are sets
    of their descendants.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    :param dict terms:
        Result from ``go_terms_solr``. If ``None`` the method will be called.
    """


    def download_in_chunks(terms, chunk_size, target = None):

        target = target or collections.defaultdict(set)

        paginator = common.paginate(terms, chunk_size)

        for p, terms_part in enumerate(paginator):

            url = urls.urls['quickgo_rest']['desc'] % (
                ','.join(terms_part),
                '?relations = %s' % relations_part,
            )

            c = curl.Curl(
                url,
                req_headers = HEADER_ACCEPT_JSON,
                silent = True,
                large = True,
            )

            try:
                result = json.load(c.fileobj)

            except json.decoder.JSONDecodeError:

                done = chunk_size * p
                remaining = terms[done:]
                new_chunk_size = chunk_size // 2

                if new_chunk_size < 10:

                    _log(
                        'Failed to download QuickGO, tried to decrease the '
                        'number of terms in each query, went below 10 terms '
                        'per query but still getting erroneous JSON. '
                        'This might be due to very slow network connection. '
                        'You might increase the timeout of CURL. '
                        'But then it will take forever.'
                    )

                    return target

                return download_in_chunks(
                    terms = remaining,
                    chunk_size = new_chunk_size,
                    target = taret,
                )

            for res in result['results']:
                if 'children' not in res:
                    continue

                target[res['id']].update(
                    set(
                        (child['id'], child['relation'])
                        for child in res['children']
                    )
                )

        return target

    desc = {}

    terms = terms or go_terms_quickgo(aspects = aspects)
    relations = relations or ('is_a', 'part_of', 'occurs_in', 'regulates',)

    relations_part = ','.join(relations)

    for asp in aspects:
        desc[asp] = download_in_chunks(
            terms = list(terms[asp].keys()),
            chunk_size = quickgo_download_size,
        )

    return desc



# synonym for the default method
go_descendants = go_descendants_quickgo



[docs]
def go_terms_solr(aspects = ('C', 'F', 'P')):
    """
    Queries GO terms by AmiGO Solr.

    Returns dict of dicts where upper level keys are one letter codes of the
    aspects `C`, `F` and `P` for cellular_component, molecular_function and
    biological_process, respectively. Lower level keys are GO accessions
    and values are names of the terms.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    reamp = re.compile(r'[\s\n\r]+([&\?])')
    relin = re.compile(r'[\s\n\r]+')

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    ontol_short = dict(reversed(i) for i in ontologies.items())

    terms = dict((a, {}) for a in aspects)

    query = '''
        ?q = document_category:"ontology_class" AND
            idspace:GO AND
            is_obsolete:0
        &rows = 9999999
        &start = 0
        &fl = annotation_class,annotation_class_label,source
    '''

    query = relin.sub(' ', reamp.sub(r'\1', query.strip()))

    # downloading data
    url = urls.urls['golr']['url'] % query

    c = curl.Curl(url, silent = False, large = True)

    # parsing XML by lxml.etree.iterparse
    parser = etree.iterparse(c.fileobj, events = ('start', 'end'))
    root = next(parser)
    used_elements = []

    for ev, elem in parser:
        if ev == 'end' and elem.tag == 'doc':
            asp  = elem.find('.//str[@name="source"]').text
            asp  = ontol_short[asp]

            if asp not in aspects:
                continue

            term = elem.find('.//str[@name="annotation_class"]').text
            name = elem.find('.//str[@name="annotation_class_label"]').text

            terms[asp][term] = name

        used_elements.append(elem)

        # removing used elements to keep memory low
        if len(used_elements) > 1000:
            for _ in xrange(500):
                e = used_elements.pop(0)
                e.clear()

    # closing the XML
    c.fileobj.close()
    del c

    return terms




[docs]
def go_terms_quickgo(aspects = ('C','F','P')):
    """
    Queries GO terms by the QuickGO REST API.

    Return dict of dicts where upper level keys are one letter codes of the
    aspects `C`, `F` and `P` for cellular_component, molecular_function and
    biological_process, respectively. Lower level keys are GO accessions
    and values are names of the terms.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    ontol_short = dict(reversed(i) for i in ontologies.items())

    result = dict((a, {}) for a in aspects)
    url = urls.urls['quickgo_rest']['terms']
    last_page = 9999999
    this_page = 1
    prg = progress.Progress(
        name = 'Downloading data from QuickGO',
        interval = 1,
    )

    while this_page <= last_page:
        page_url = url % this_page

        c = curl.Curl(page_url, silent = True)

        this_result = json.loads(c.result)
        last_page = this_result['pageInfo']['total']


        for res in this_result['results']:
            if 'aspect' not in res:
                continue

            asp = ontol_short[res['aspect']]

            if res['isObsolete'] or asp not in aspects:
                continue

            result[asp][res['id']] = res['name']

        if prg.total is None:
            prg.set_total(last_page)

        prg.step()

        this_page += 1

    return result



# synonym for the default method
go_terms = go_terms_quickgo



[docs]
def go_terms_goose(aspects = ('C','F','P')):
    """
    Queries GO terms by AmiGO goose.

    Return dict of dicts where upper level keys are one letter codes of the
    aspects `C`, `F` and `P` for cellular_component, molecular_function and
    biological_process, respectively. Lower level keys are GO accessions
    and values are names of the terms.

    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    """

    aspects_part = ''
    respaces = re.compile(r'[\s\n]+')

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    ontol_short = dict(reversed(i) for i in ontologies.items())

    if set(aspects) != {'C', 'F', 'P'}:
        aspects_part = 'WHERE (%s)' % (
            ' OR '.join(
                'term.term_type = "%s"' % ontologies[asp]
                for asp in aspects
            )
        )

    sql_path = os.path.join(common.DATA, 'goose_terms.sql')

    with open(sql_path, 'r') as fp:
        query = fp.read()

    query = query % aspects_part
    query = respaces.sub(r' ', query).strip()

    url = urls.urls['goose']['url'] % query

    c = curl.Curl(url, silent = False, large = True)

    terms = {'P': {}, 'C': {}, 'F': {}}

    for l in c.result:
        l = l.strip().split('\t')

        if l[1] not in ontol_short:
            continue

        aspect = ontol_short[l[1]]
        terms[aspect][l[2]] = l[0]

    return terms




[docs]
def go_annotations_quickgo(
        organism = 9606,
        aspects = ('C','F','P'),
        relations = ('is_a', 'part_of'),
    ):
    """
    Queries GO annotations by QuickGO REST API.

    IMPORTANT:
    Recently the preferred method to access GO annotations is
    ``pypath.inputs.go.go_annotations_goa``.
    Contrary to its name QuickGO is super slow, otherwise it should yield
    up to date data, identical to the GOA file.

    Returns terms in dict of dicts and annotations in dict of dicts of sets.
    In both dicts the keys are aspects by their one letter codes.
    In the term dicts keys are GO accessions and values are their names.
    In the annotation dicts keys are UniProt IDs and values are sets
    of GO accessions.

    :param int organism:
        NCBI Taxonomy ID of one organism. Default is human (9606).
    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    :param list uniprots:
        Optionally a list of UniProt IDs. If `None`, results for all proteins
        returned.
    """

    annot = dict((a, collections.defaultdict(set)) for a in aspects)

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    ontol_short = dict(reversed(i) for i in ontologies.items())

    url = urls.urls['quickgo_rest']['annot']

    aspects_part = ','.join(ontologies[a] for a in aspects)
    relations_part = ','.join(relations)

    page = 1

    while True:

        this_url = url % (
            aspects_part, # aspect
            relations_part, # goUsageRelationships
            organism, # taxonId
            page,
        )

        c = curl.Curl(
            url = this_url,
            req_headers = HEADER_ACCEPT_TSV,
            silent = False,
            large = True
        )

        _ = next(c.result) # the header row

        for line in c.result:
            line = line.strip().split('\t')

            if line[3] not in relations:
                continue

            annot[line[5]][line[1]].add(line[4])

        page += 1

    return annot




[docs]
def go_annotations_solr(
        organism = 9606,
        aspects = ('C', 'F', 'P'),
        references = False,
    ):
    """
    Queries GO annotations by AmiGO Solr.

    Before other methods have been provided to access GO.
    Now this is the preferred method to get annotations.
    Returns terms in dict of dicts and annotations in dict of dicts of sets.
    In both dicts the keys are aspects by their one letter codes.
    In the term dicts keys are GO accessions and values are their names.
    In the annotation dicts keys are UniProt IDs and values are sets
    of GO accessions.

    :param int organism:
        NCBI Taxonomy ID of one organism. Default is human (9606).
    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    :param bool references:
        Retrieve the references (PubMed IDs) for the annotations.
        Currently not implemented.
    """

    reamp = re.compile(r'[\s\n\r]+([&\?])')
    relin = re.compile(r'[\s\n\r]+')

    annot = dict((a, collections.defaultdict(set)) for a in aspects)

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    ontol_short = dict(reversed(i) for i in ontologies.items())

    # assembling the query

    if len(aspects) < 3:
        aspects_part = ' AND (%s)' % (
            ' OR '.join('aspect:%s' % a for a in aspects)
        )

    else:
        aspects_part = ''

    refs_part = ',reference' if references else ''

    query = '''
        ?q = taxon:"NCBITaxon:%u" AND
            type:protein AND
            document_category:annotation AND
            source:UniProtKB%s
        &rows = 9999999
        &start = 0
        &fl = bioentity,annotation_class,aspect%s
    ''' % (
        organism,
        aspects_part,
        refs_part
    )

    query = relin.sub(' ', reamp.sub(r'\1', query.strip()))

    # downloading data
    url = urls.urls['golr']['url'] % query
    c = curl.Curl(url, silent = False, large = True)

    # parsing XML by lxml.etree.iterparse
    parser = etree.iterparse(c.fileobj, events = ('start', 'end'))
    root = next(parser)
    used_elements = []

    for ev, elem in parser:

        if ev == 'end' and elem.tag == 'doc':

            id_ = elem.find('.//str[@name="bioentity"]').text

            if not id_.startswith('UniProtKB:'):
                continue

            asp  = elem.find('.//str[@name="aspect"]').text

            if asp not in aspects:
                continue

            term = elem.find('.//str[@name="annotation_class"]').text
            id_  = id_[10:] # removing the `UniProtKB:` prefix

            # adding the term to the annotation dict
            annot[asp][id_].add(term)

        used_elements.append(elem)

        # removing used elements to keep memory low
        if len(used_elements) > 1000:

            for _ in xrange(500):

                e = used_elements.pop(0)
                e.clear()

    # closing the XML
    c.fileobj.close()
    del c

    return terms, annot




[docs]
def go_annotations_goose(organism = 9606, aspects = ('C', 'F', 'P'),
                         uniprots = None):
    """
    Queries GO annotations by AmiGO goose.

    IMPORTANT:
    This is not the preferred method any more to get terms and annotations.
    Recently the preferred method to access GO annotations is
    ``pypath.inputs.go.go_annotations_solr``.
    The data in GO MySQL instances has not been updated since Dec 2016.
    Unfortunately the providers ceased to support MySQL, the most flexible
    and highest performance access to GO data. The replacement is Solr
    which is far from providing the same features as MySQL.

    Returns terms in dict of dicts and annotations in dict of dicts of sets.
    In both dicts the keys are aspects by their one letter codes.
    In the term dicts keys are GO accessions and values are their names.
    In the annotation dicts keys are UniProt IDs and values are sets
    of GO accessions.

    :param int organism:
        NCBI Taxonomy ID of one organism. Default is human (9606).
    :param tuple aspects:
        GO aspects: `C`, `F` and `P` for cellular_component,
        molecular_function and biological_process, respectively.
    :param list uniprots:
        Optionally a list of UniProt IDs. If `None`, results for all proteins
        returned.
    """

    aspects_part = ''
    uniprot_part = ''
    respaces = re.compile(r'[\s\n]+')

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }
    ontol_short = dict(reversed(i) for i in ontologies.items())

    if set(aspects) != {'C', 'F', 'P'}:
        aspects_part = '(%s) AND' % (
            ' OR '.join(
                'term.term_type="%s"' % ontologies[asp]
                for asp in aspects
            )
        )

    if uniprots is not None:
        uniprot_part = 'dbxref.xref_key IN (%s) AND' % (
            ','.join('"%s"' % uniprot for uniprot in uniprots)
        )

    sql_path = os.path.join(common.DATA, 'goose_annotations.sql')

    with open(sql_path, 'r') as fp:
        query = fp.read()

    query = query % (organism, aspects_part, uniprot_part)
    query = respaces.sub(r' ', query).strip()

    url = urls.urls['goose']['url'] % query

    c = curl.Curl(url, silent = False, large = True)

    terms = {'P': {}, 'C': {}, 'F': {}}
    annot = {
        'C': collections.defaultdict(set),
        'F': collections.defaultdict(set),
        'P': collections.defaultdict(set),
    }

    for l in c.result:
        l = l.strip().split('\t')

        aspect = ontol_short[l[1]]

        terms[aspect][l[2]] = l[0]
        annot[aspect][l[5]].add(l[2])

    return terms, annot




[docs]
def get_go_desc(go_ids, organism = 9606):
    """
    Deprecated, should be removed soon.
    """

    go_ids = (
        ','.join(sorted(go_ids))
        if type(go_ids) in {list, tuple, set} else
        go_ids
    )

    url = urls.urls['quickgo_desc']['url'] % (organism, go_ids)
    c = curl.Curl(
        url,
        silent = False,
        large = True,
        req_headers = HEADER_ACCEPT_TSV,
    )
    _ = c.result.readline()

    return set(l.split('\t')[1] for l in c.result)




[docs]
def get_go_quick(
        organism = 9606,
        slim = False,
        names_only = False,
        aspects = ('C', 'F', 'P'),
    ):
    """
    Deprecated, should be removed soon.

    Loads GO terms and annotations from QuickGO.
    Returns 2 dicts: `names` are GO terms by their IDs,
    `terms` are proteins GO IDs by UniProt IDs.
    """

    ontologies = {
        'C': 'cellular_component',
        'F': 'molecular_function',
        'P': 'biological_process',
    }

    terms = {
        'C': collections.defaultdict(set),
        'F': collections.defaultdict(set),
        'P': collections.defaultdict(set),
    }

    names = {}
    aspects_param = ','.join(sorted(ontologies[a] for a in aspects))

    url = urls.urls['quickgo']['url'] % (
        organism,
        aspects_param,
        '&goUsage = slim' if slim else '',
    )

    c = curl.Curl(
        url,
        silent = False,
        large = True,
        req_headers = HEADER_ACCEPT_TSV,
        keep_failed = True,
    )

    _ = next(c.result)

    for l in c.result:

        l = l.split('\t')

        if not names_only:

            terms[l[5]][l[1]].add(l[4])

    return {'terms': terms, 'names': names}




[docs]
def get_goslim(url = None):

    rego = re.compile(r'GO:[0-9]{7}')
    url = (
        url
            if isinstance(url, str) else
        urls.urls['goslim_gen']['url']
    )
    c = curl.Curl(url, silent = False)
    data = c.result
    result = []

    for l in data.split('\n'):

        if l.startswith('id:'):

            result += rego.findall(l)

    return result