# Source code for pypath.inputs.cell

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import os

import pypath.share.curl as curl
import pypath.share.session as session


_logger = session.Logger(name = 'cell_input')
_log = _logger._log



# [docs]
def _cookies_from_headers(resp_headers) -> dict:
    """
    Collects cookies from the `Set-Cookie` lines of raw response headers.

    Args
        resp_headers:
            Iterable of raw header lines as bytes, e.g.
            `b'Set-Cookie: name=value; Path=/'`.

    Return
        A dict of cookie names and values. Attributes following the first
        semicolon (`Path`, `Expires`, etc.) are discarded. Header lines
        that carry no `name=value` pair are skipped. If a name occurs more
        than once, the last value is kept.
    """

    cookies = {}

    for header in resp_headers:

        if not header.lower().startswith(b'set-cookie'):

            continue

        # Separate the header name from its content at the first colon
        # only: colons are legal within cookie values, splitting at all
        # of them would truncate the value.
        _, _, content = header.decode().partition(':')
        pair = content.split(';', maxsplit = 1)[0].strip()
        name, separator, value = pair.partition('=')

        # Without `=` or without a name there is nothing we could send
        # back to the server, hence we ignore such lines.
        if separator and name:

            cookies[name] = value

    return cookies


def cell_supplementary(supp_url: str, article_url: str) -> str:
    """
    Downloads a supplementary material from the Cell journal webpage.

    If the file is not in the cache yet, first the article page is
    requested and up to three HTTP 302 redirects are followed manually,
    in order to collect the cookies set by the server. These cookies are
    then sent along with the request for the supplementary file.

    Args
        supp_url:
            URL of the supplementary material.
        article_url:
            URL of the article page.

    Return
        The path of the downloaded file.
    """

    user_agent = 'user-agent: curl/7.69.1'

    # A Curl object that performs no request, we use it only to find out
    # the path of the cache file that belongs to this URL.
    c_nocall = curl.Curl(
        supp_url,
        call = False,
        setup = False,
        process = False,
        silent = True,
    )
    c_nocall.get_cache_file_name()
    path = c_nocall.cache_file_name

    req_headers = []

    # The cookie collection happens only if the file is not available
    # in the cache.
    if not os.path.exists(path):

        cookies = {}
        init_url = article_url

        for step in range(3):

            c_init = curl.Curl(
                init_url,
                silent = True,
                large = True,
                cache = False,
                follow = False,
                req_headers = req_headers + [user_agent],
                bypass_url_encoding = True,
                retries = 1,
                empty_attempt_again = False,
            )

            cookies.update(_cookies_from_headers(c_init.resp_headers))
            # This cookie is never sent back to the server.
            _ = cookies.pop('__cflb', None)

            # The target of the redirect is the URL of the next request;
            # if more than one `Location` header is present, the last
            # one is used.
            for h in c_init.resp_headers:

                if h.lower().startswith(b'location'):

                    init_url = h.decode().split(':', maxsplit = 1)[1].strip()

            req_headers = (
                [
                    'Cookie: %s' % (
                        '; '.join(
                            '%s=%s' % cookie
                            for cookie in cookies.items()
                        )
                    )
                ]
                    if cookies else
                []
            )

            _log(
                'HTTP %u; location: `%s`, cookies: `%s`.' % (
                    c_init.status,
                    init_url,
                    req_headers[0] if req_headers else '',
                )
            )

            # Anything else than a redirect ends the cookie collection.
            if c_init.status != 302:

                break

    # The actual download, with the cookies collected above (no cookies
    # if the collection has been skipped).
    c_table = curl.Curl(
        supp_url,
        silent = False,
        large = True,
        empty_attempt_again = False,
        req_headers = req_headers + [user_agent],
    )
    path = c_table.cache_file_name
    # Only the path is returned, the open file object is not needed.
    c_table.fileobj.close()

    return path