# Source code for pypath.inputs.ebi

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import json
import math
import collections

from typing import Callable, List, Optional, Union

import glom

import pypath.share.curl as curl
import pypath_common._constants as _const
import pypath.share.session as session
import pypath.inputs.common as inputs_common

# Module-level logger, created under the name 'inputs.ebi'.
_logger = session.Logger(name = 'inputs.ebi')
# Shortcut to the logger's `_log` method; used below to report
# download progress.
_log = _logger._log



# [docs]
def ebi_rest(
        url: str,
        qs: Optional[dict] = None,
        fields: Optional[inputs_common.GlomSpec] = None,
        paginate: bool = True,
        page_length: int = 500,
        size_param: str = 'size',
        page_param: str = 'offset',
        by_page: bool = True,
        page_field: inputs_common.GlomSpec = 'page.number',
        total_field: inputs_common.GlomSpec = 'page.totalPages',
        record_name: Optional[str] = None,
    ) -> List[tuple]:
    """
    Collects data from an EBI REST web service.

    Args
        url:
            The URL of the web service.
        qs:
            Query string parameters to be appended to the URL. The
            dict passed by the caller is not modified.
        fields:
            Glom spec of the fields to be extracted from the result.
        paginate:
            Retrieve all pages until the end (if False, only
            one page will be downloaded).
        page_length:
            Number of records on one page. Used only if `qs` does not
            already contain a value under `size_param`.
        size_param:
            Query string key for number of records per page.
        page_param:
            Query string key to request a specific page by.
        by_page:
            The pagination works by page numbers (True) or item
            numbers (False).
        page_field:
            Glom spec of the JSON field that contains the current
            page number.
        total_field:
            Glom spec of the JSON field that contains the total
            number of pages.
        record_name:
            Class name for the named tuples in the result. By default
            it is derived from the last segment of the URL.

    Details
        Read more about glom specs here:
        https://glom.readthedocs.io/en/latest/tutorial.html

        The download loop stops when pagination is disabled, a page
        is empty or could not be extracted, the last page has been
        reached according to `total_field`, or the number of records
        reaches the value of the `x-pagination-totalrecords` response
        header.

        The fields of the named tuples are the sorted union of the
        keys of all records; keys missing from a record are filled
        with `None`. If every value in every record is a list, the
        lists are treated as columns, and one named tuple is created
        for each row.

    Returns
        A list of named tuples with the requested fields.
    """

    result = []

    # work on a copy, so the dict owned by the caller is not altered
    qs = dict(qs or {})
    qs[size_param] = qs.get(size_param, page_length)

    # a page size provided in `qs` takes precedence over `page_length`;
    # the offsets and the page count have to be consistent with it
    try:

        page_size = int(qs[size_param])

    except (TypeError, ValueError):

        page_size = page_length

    page = 0
    totalrec = -1

    while True:

        page_url = '%s?%s' % (
            url,
            '&'.join(
                '{}={}'.format(*it)
                for it in qs.items()
            )
        )

        _log(
            'Downloading page %u (total: %s).' % (
                page + 1,
                'unknown'
                    if totalrec < 0 or page_size <= 0 else
                str(math.ceil(totalrec / page_size))
            )
        )

        c = curl.Curl(page_url)
        c.get_headers()
        headers = c.resp_headers_dict

        # keep the previous value if the header is not present
        totalrec = int(headers.get('x-pagination-totalrecords', totalrec))

        if not c.result:

            break

        res = inputs_common.json_read(c.result)

        # number of the next page: the one reported in the response,
        # or our own counter if the response does not tell, plus one
        page = glom.glom(
            res,
            (page_field, int),
            default = page,
        ) + 1

        # either a page number or an item offset is requested
        qs[page_param] = page if by_page else page * page_size

        # zero means the total number of pages is unknown
        total = glom.glom(res, (total_field, int), default = 0)

        if fields:

            res = inputs_common.json_extract(c.result, fields)

        res = res if isinstance(res, list) else [res]

        # an empty page also means the end: without this check the loop
        # would never finish if the service reports no totals
        if not res or res == [None] or res == [_const.GLOM_ERROR]:

            break

        result.extend(res)

        if (
            not paginate or
            (total and page > total) or
            (totalrec > 0 and len(result) >= totalrec)
        ):

            break

    record_name = (
        record_name or
        '%sRecord' % url.rsplit('/', maxsplit = 1)[-1].capitalize()
    )

    # the fields are the union of the keys across all records
    record = collections.namedtuple(
        record_name,
        sorted({key for section in result for key in section})
    )

    nested = all(
        isinstance(val, list)
        for section in result
        for val in section.values()
    )

    records = []

    for section in result:

        if nested:

            # each value is a column: one record is created for each row;
            # columns missing from this section are filled with None
            present = [
                section[f]
                for f in record._fields
                if f in section
            ]
            nrows = min(map(len, present), default = 0)

            records.extend(
                record(*values)
                for values in zip(
                    *(
                        section.get(f, [None] * nrows)
                        for f in record._fields
                    )
                )
            )

        else:

            # each section is one record; missing keys become None
            records.append(
                record(*(section.get(f) for f in record._fields))
            )

    return records