Source code for pypath.inputs.ebi
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of the `pypath` python module
#
# Copyright 2014-2023
# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
# Authors: see the file `README.rst`
# Contact: Dénes Türei (turei.denes@gmail.com)
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# https://www.gnu.org/licenses/gpl-3.0.html
#
# Website: https://pypath.omnipathdb.org/
#
from future.utils import iteritems
import json
import math
import collections
from typing import Callable, List, Optional, Union
import glom
import pypath.share.curl as curl
import pypath_common._constants as _const
import pypath.share.session as session
import pypath.inputs.common as inputs_common
# Module-level logger registered under the name 'inputs.ebi'; `_log` is
# a shortcut to its logging method, used by the functions of this module.
_logger = session.Logger(name = 'inputs.ebi')
_log = _logger._log
[docs]
def ebi_rest(
        url: str,
        qs: Optional[dict] = None,
        fields: Optional[inputs_common.GlomSpec] = None,
        paginate: bool = True,
        page_length: int = 500,
        size_param: str = 'size',
        page_param: str = 'offset',
        by_page: bool = True,
        page_field: inputs_common.GlomSpec = 'page.number',
        total_field: inputs_common.GlomSpec = 'page.totalPages',
        record_name: Optional[str] = None,
    ) -> List[tuple]:
    """
    Collects data from an EBI REST web service.

    Pages are downloaded one after the other until the service reports
    no more data (or only the first page if `paginate` is False), then
    the extracted fields are converted into named tuples.

    Args
        url:
            The URL of the web service.
        qs:
            Query string parameters to be appended to the URL. The
            dict is copied, the caller's object is never modified.
            Keys and values are inserted into the URL verbatim (no
            percent-encoding is applied).
        fields:
            Glom spec of the fields to be extracted from the result.
        paginate:
            Retrieve all pages until the end (if False, only
            one page will be downloaded).
        page_length:
            Number of records on one page. If `qs` already contains
            a value under `size_param`, that value is sent to the
            server and, if it is a positive integer, it is also used
            here for computing offsets.
        size_param:
            Query string key for number of records per page.
        page_param:
            Query string key to request a specific page by.
        by_page:
            The pagination works by page numbers (True) or item
            numbers (False).
        page_field:
            Glom spec of the JSON field that contains the current
            page number.
        total_field:
            Glom spec of the JSON field that contains the total
            number of pages.
        record_name:
            Class name for the named tuples in the result. If not
            provided, it is derived from the last segment of the URL.

    Details
        Read more about glom specs here:
        https://glom.readthedocs.io/en/latest/tutorial.html

        Each downloaded page contributes one or more "sections"
        (dicts). If every value in every section is a list, the
        sections are treated as column-wise data and transposed into
        one record per row; otherwise each section becomes one record.
        Fields absent from a section are filled with None.

    Returns
        A list of named tuples with the requested fields.
    """

    result = []

    # Work on a private copy: the paging parameters written below must
    # not leak into the dict owned by the caller.
    qs = dict(qs) if qs else {}
    qs.setdefault(size_param, page_length)

    # A page size given in `qs` is what the server actually receives, so
    # offsets and the logged page count must be derived from it.
    try:
        effective_size = int(qs[size_param])
    except (TypeError, ValueError):
        effective_size = 0

    if effective_size > 0:
        page_length = effective_size

    page = 0
    totalrec = -1

    while True:

        # NOTE: keys and values are joined as they are, without
        # percent-encoding; callers have to pass URL-safe values.
        page_url = '%s?%s' % (
            url,
            '&'.join(
                '{}={}'.format(*it)
                for it in qs.items()
            )
        )

        _log(
            'Downloading page %u (total: %s).' % (
                page + 1,
                'unknown'
                    if totalrec < 0 else
                str(math.ceil(totalrec / page_length))
            )
        )

        c = curl.Curl(page_url)
        # presumably this populates `resp_headers_dict` -- verify
        # against `pypath.share.curl`
        c.get_headers()
        headers = c.resp_headers_dict
        # keep the previous value if the header is absent
        totalrec = int(headers.get('x-pagination-totalrecords', totalrec))

        if not c.result:
            break

        res = inputs_common.json_read(c.result)

        # The next page to request is the one after the page reported by
        # the service; without such a field the local counter is used.
        page = glom.glom(
            res,
            (page_field, int),
            default = page,
        ) + 1
        # `True` multiplies as 1, hence with `by_page` the page index is
        # sent as it is, otherwise it is converted to an item offset.
        qs[page_param] = page * (by_page or page_length)
        total = glom.glom(res, (total_field, int), default = 0)

        if fields:
            res = inputs_common.json_extract(c.result, fields)

        res = res if isinstance(res, list) else [res]

        # nothing could be extracted from this page: we are over the end
        if res == [None] or res == [_const.GLOM_ERROR]:
            break

        result.extend(res)

        if (
            not paginate or
            (total and page > total) or
            (totalrec > 0 and len(result) >= totalrec)
        ):
            break

    if not record_name:

        # The last URL segment may contain characters not allowed in a
        # class name (e.g. `-` or `.`), these are replaced by underscores.
        stem = url.rsplit('/', maxsplit = 1)[-1].capitalize()
        record_name = '%sRecord' % ''.join(
            ch if ch.isalnum() or ch == '_' else '_'
            for ch in stem
        )

        # e.g. a name starting with a digit
        if not record_name.isidentifier():
            record_name = '_%s' % record_name

    record = collections.namedtuple(
        record_name,
        sorted({k for i in result for k in i.keys()})
    )

    nested = all(
        isinstance(val, list)
        for section in result
        for val in section.values()
    )

    if not nested:
        # one section is one record: wrap the values into one element
        # lists, so the transposition below handles both layouts
        result = [
            {k: [v] for k, v in section.items()}
            for section in result
        ]

    records = []

    for section in result:

        # `zip` stops at the shortest column; fields missing from this
        # section are padded with None up to that length, instead of
        # passing a non-iterable None to `zip`.
        n_rows = min((len(col) for col in section.values()), default = 0)
        columns = [
            section[f] if f in section else [None] * n_rows
            for f in record._fields
        ]
        records.extend(record(*values) for values in zip(*columns))

    return records