# Source code for pypath.inputs.pharos

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

"""
Retrieve data from the NIH Pharos database.
"""

from __future__ import annotations

import json

from pypath.share.curl import Curl
from pypath.resources.urls import urls
import pypath.share.session as session

# Logger of this module; `_log` is the shortcut the functions below call
# to write their log messages.
_logger = session.Logger(name = 'pharos_input')
_log = _logger._log


# Names of the optional record types. Each of them is a boolean keyword
# argument of `pharos_targets`, and `_create_query_functions` generates one
# `pharos_<name>` function for each.
QUERY_TYPES = (
    'expression',
    'gtex',
    'orthologs',
    'ligands',
    'xrefs',
    'diseases',
)


# GraphQL query (`targetDetails`) sent by `pharos_targets`. The targets are
# requested page by page: `$chunk_size` is passed as `top` and `$step` as
# `skip`. Name, symbol and UniProt are always requested; each of the six
# `$get...` boolean variables switches one group of sub-records on or off
# by an `@include` directive.
PHAROS_QUERY = """
    query targetDetails(
        $chunk_size: Int!,
        $step: Int!,
        $getExpressions: Boolean!,
        $getGtex: Boolean!,
        $getOrthologs: Boolean!,
        $getLigands: Boolean!,
        $getXrefs: Boolean!,
        $getDiseases: Boolean!,
    ) {

        targets {

            targets(top: $chunk_size skip: $step) {

                name
                sym
                uniprot

                expressions(top: 10000) @include(if: $getExpressions) {

                    expid
                    type
                    tissue
                    value

                    uberon {
                        name
                        uid
                    }

                    pub {
                        pmid
                    }
                }

                gtex @include(if: $getGtex) {

                    tissue
                    tpm
                    log2foldchange

                    uberon {
                        name
                        uid
                    }
                }

                orthologs(top: 10000) @include(if: $getOrthologs) {
                    name
                    species
                    orid
                    dbid
                    geneid
                    source
                }

                ligands(top: 10000 isdrug: true) @include(if: $getLigands) {

                    ligid
                    name

                    synonyms {
                        name
                        value
                    }

                    activities(all: true) {
                        actid
                        type
                        moa
                        value

                        pubs {
                            pmid
                            __typename
                            }
                    }
                }

                xrefs(source: "Ensembl") @include(if: $getXrefs) {
                    name
                }

                diseases(top:10000) @include(if: $getDiseases) {

                    name
                    mondoID

                    dids {
                        id
                        dataSources
                        doName
                    }
                }
            }
        }
    }
    """



# [docs]
def pharos_general(
        query: str,
        variables: dict[str, bool] | None = None,
    ) -> dict:
    """
    Query the NIH Pharos database by GraphQL.

    Read about Pharos here: https://pharos.nih.gov/about

    Args:
        query:
            A GraphQL query.
        variables:
            Variables to retrieve. A dict of variable names and boolean values.

    Return:
        The JSON response parsed into a dict.
    """

    url = urls['pharos_api']['url']

    req_headers = {
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/json',
        'Connection': 'keep-alive',
        'DNT': '1',
        'Origin': 'https://pharos-api.ncats.io',
    }

    query_param = {'query': query}

    if variables:

        _log(
            'Querying Pharos, variables: '
             f'{", ".join(k for k, v in variables.items() if v)}'
        )
        query_param['variables'] = variables

    binary_data = json.dumps(query_param).encode('utf-8')

    c = Curl(
        url=url,
        req_headers=req_headers,
        binary_data=binary_data,
        compressed=True,
        compr='gz',
    )
    result = json.loads(c.result)

    result = result['data']

    return result




# [docs]
def pharos_targets(
        chunk_size: int = 100,
        expression: bool = False,
        gtex: bool = False,
        orthologs: bool = False,
        ligands: bool = False,
        xrefs: bool = False,
        diseases: bool = False,
    ) -> list:
    """
    Retrieve target records from the NIH Pharos database, page by page.

    The `PHAROS_QUERY` GraphQL query is sent repeatedly by `pharos_general`,
    each time asking for the next `chunk_size` records, until an empty page
    comes back. The complete data consists of thousands of chunks, the
    retrieval takes about half hour.

    Args:
        chunk_size:
            Records in one batch. Better stay 100 because higher numbers
            likely to cause timeout errors.
        expression:
            Include the `expressions` sub-records of each target.
        gtex:
            Include the `gtex` sub-records of each target.
        orthologs:
            Include the `orthologs` sub-records of each target.
        ligands:
            Include the `ligands` sub-records of each target.
        xrefs:
            Include the `xrefs` sub-records of each target.
        diseases:
            Include the `diseases` sub-records of each target.

    Return:
        Records as a list of dicts.
    """

    # values of the `@include` switches, the same for every page
    switches = {
        'getExpressions': expression,
        'getGtex': gtex,
        'getOrthologs': orthologs,
        'getLigands': ligands,
        'getXrefs': xrefs,
        'getDiseases': diseases,
    }
    records = []
    # number of records to skip, i.e. where the next page starts
    offset = 0

    while True:

        _log(f'Pharos query, chunk #{offset}')
        params = {'chunk_size': chunk_size, 'step': offset, **switches}
        page = pharos_general(PHAROS_QUERY, params)['targets']['targets']

        # an empty page means we are beyond the last record
        if not page:

            return records

        records.extend(page)
        offset += chunk_size



def _create_query_functions():
    """
    Create one `pharos_<qtype>` function for each item of `QUERY_TYPES`.

    Each generated function calls `pharos_targets` with only the keyword
    argument named by its own query type set to True, and is added to the
    global namespace of this module under the name `pharos_<qtype>`.
    """

    def make_query_func(qtype: str):
        """
        Build the query function of one single query type.

        A separate call of this factory is made for each function, so each
        closure keeps its own `qtype`. Closures created directly in the
        body of the loop would all share one variable, holding the last
        query type by the time any of the functions is called.
        """

        name = f'pharos_{qtype}'

        def query_func(chunk_size: int = 100) -> list:

            return pharos_targets(chunk_size = chunk_size, **{qtype: True})

        query_func.__name__ = name
        # keep tracebacks and `repr` in line with the public name
        query_func.__qualname__ = name
        query_func.__doc__ = f"""
            Retrieve `{qtype}` records from Pharos.

            Note: data retrieval might take about half an hour.

            Args:
                chunk_size:
                    Records in one batch. Better stay 100 because higher
                    numbers likely to cause timeout errors.

            Return:
                Records as a list of dicts.
            """

        return query_func

    for qtype in QUERY_TYPES:

        func = make_query_func(qtype)
        globals()[func.__name__] = func


# Runs at import time: adds the `pharos_<qtype>` functions, one for each
# item of `QUERY_TYPES`, to the namespace of this module.
_create_query_functions()