# Source code for pypath.inputs.kegg_api

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

import collections
import itertools
import csv
import re
import asyncio
import inspect

from concurrent.futures.thread import ThreadPoolExecutor

from abc import ABC, abstractmethod
from typing import Iterable, Literal

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.session as session
import pypath.share.common as common

_logger = session.Logger(name = 'kegg_api')
_log = _logger._log

_url = urls.urls['kegg_api']['url']


def _generate_relation_functions():
    """
    Creates the ``<source>_to_<target>`` relation functions of this module.

    One function is generated for every ordered pair of two different
    entity types out of disease, drug, gene and pathway (e.g.
    ``disease_to_drug`` and ``drug_to_disease``), and it is inserted into
    the globals of this module under that name. Each function calls
    ``_kegg_relations`` with its source and target entity types.
    Functions with gene on either side require one argument, ``organism``;
    the others take no arguments.
    """

    _entity_types = ('disease', 'drug', 'gene', 'pathway')

    def _make(source, target):
        # A factory function gives every generated function its own
        # `source` and `target`. Closing over the loop variables directly
        # would bind all functions to the values of the last iteration.

        if 'gene' in (source, target):

            def _relation_function(organism):

                return _kegg_relations(source, target, organism)

        else:

            def _relation_function():

                return _kegg_relations(source, target)

        name = f'{source}_to_{target}'
        synopsis = f'{source.capitalize()}-{target} relations from KEGG.'

        if 'gene' in (source, target):

            synopsis += (
                '\n\nArgs\n    organism:\n        Name of the organism. '
                'Gene relations are organism specific.\n'
            )

        _relation_function.__name__ = name
        _relation_function.__qualname__ = name
        _relation_function.__doc__ = synopsis

        return _relation_function

    for etypes in itertools.combinations(_entity_types, 2):

        for source, target in (etypes, etypes[::-1]):

            func = _make(source, target)
            globals()[func.__name__] = func


def drug_to_drug(
        drugs: list | tuple | None = None,
        join: bool = True,
        asynchronous: bool = False
    ) -> dict[str, tuple]:
    """
    Downloads drug-drug interaction data from KEGG database.

    Args
        drugs:
            Drug IDs as a list or a tuple. If empty or None, all the
            drugs listed by KEGG are queried.
        join:
            If True and drugs are provided, the IDs are joined by "+"
            and sent in one single query; according to the KEGG REST API
            documentation such a query returns the interactions among
            the listed drugs. If False, each drug is queried on its own,
            returning all its known interactions.
        asynchronous:
            Yet to be implemented. It is always switched on if no drugs
            are provided.

    Returns
        A dict with drug IDs as keys and drug-drug interactions as
        values. Each value is a named tuple with the type and the name
        of the drug, and a tuple of its interactions.
    """

    DrugToDrugInteraction = collections.namedtuple(
        'DrugToDrugInteraction',
        (
            'type',
            'name',
            'interactions',
        ),
    )

    Interaction = collections.namedtuple(
        'Interaction',
        (
            'type',
            'id',
            'name',
            'contraindication',
            'precaution',
        )
    )

    entry_types = {'d': 'drug', 'c': 'compound'}
    entry_dbs = {'drug': _Drug(), 'compound': _Compound()}

    def describe(entry_id):
        """
        Type, bare ID and name of one interaction partner.
        """

        # the first letter of the prefix tells the type of the entry
        type_ = entry_types.get(entry_id[:1].lower(), None)
        id_ = entry_id.split(':')[-1]
        # entries of unknown type have no name database to look up
        name = entry_dbs[type_].get(id_, None) if type_ in entry_dbs else None

        return type_, id_, name

    # `bool` instead of `len`: `drugs` is None by default
    join = join and bool(drugs)
    asynchronous = not drugs or asynchronous
    drugs = drugs or entry_dbs['drug'].data.keys()

    entries = _kegg_ddi(drugs, join = join, async_ = asynchronous)
    interactions = {}

    for entry in entries:

        if len(entry) < 3:

            # not a complete record: source, target and labels are needed
            continue

        source_type, source_id, source_name = describe(entry[0])
        target_type, target_id, target_name = describe(entry[1])
        labels = entry[2].split(',')

        interaction = Interaction(
            type = target_type,
            id = target_id,
            name = target_name,
            contraindication = 'CI' in labels,
            precaution = 'P' in labels,
        )

        if source_id not in interactions:

            interactions[source_id] = {
                'type': source_type,
                'name': source_name,
                'interactions': [],
            }

        interactions[source_id]['interactions'].append(interaction)

    return {
        key: DrugToDrugInteraction(
            value['type'],
            value['name'],
            tuple(value['interactions']),
        )
        for key, value in interactions.items()
    }
def _generate_conv_functions():
    """
    Creates the ID translation functions of this module.

    For each supported pair of a KEGG entity and a foreign ID type, two
    functions are generated, one for each direction, e.g.
    ``kegg_drug_to_chebi`` and ``chebi_to_kegg_drug``. The functions are
    inserted into the globals of this module. Each of them calls
    ``_kegg_conv``; the gene functions require one argument, ``organism``,
    the drug functions take no arguments.
    """

    _id_types = (
        ('drug', ('chebi',)),
        ('gene', ('ncbi-geneid', 'uniprot')),
    )

    labels = {
        'chebi': 'ChEBI',
        'ncbi-geneid': 'NCBI Gene',
        'uniprot': 'UniProt',
    }

    def _make(entity, source, target):
        # A factory function gives every generated function its own
        # `source` and `target`. Closing over the loop variables directly
        # would bind all functions to the values of the last iteration.

        pair = (source, target)
        # the prefix is removed from all IDs except the KEGG gene IDs
        splits = tuple(a != 'gene' for a in pair)

        if entity == 'gene':

            def _conv_function(organism):

                # KEGG genes are addressed by the organism
                dbs = tuple(organism if a == 'gene' else a for a in pair)

                return _kegg_conv(*dbs, *splits)

        else:

            def _conv_function():

                return _kegg_conv(*pair, *splits)

        name = (
            '_to_'.join(
                f'kegg_{a}' if a == entity else a
                for a in pair
            ).
            replace('-', '_')
        )

        synopsis = (
            'Translation dict between ' +
            ' and '.join(
                f'{labels.get(a, f"KEGG {a}")} IDs'
                for a in pair
            ) +
            '.'
        )

        if entity == 'gene':

            synopsis += (
                '\n\nArgs\n    organism:\n        Name of the '
                'organism. Gene relations are organism specific.\n'
            )

        _conv_function.__name__ = name
        _conv_function.__qualname__ = name
        _conv_function.__doc__ = synopsis

        return _conv_function

    for entity, id_types in _id_types:

        for id_type in id_types:

            # tuples instead of `reversed`: an iterator could be
            # consumed only once, but the pair is used several times
            for source, target in ((entity, id_type), (id_type, entity)):

                func = _make(entity, source, target)
                globals()[func.__name__] = func


def _kegg_general(
        operation: str,
        *arguments: str,
    ) -> list[list[str]]:
    """
    Sends one query to the KEGG API and splits the tabular response.

    Args
        operation:
            Name of the API operation; it is inserted into the module
            level URL template.
        arguments:
            Further elements of the URL path, appended in the order
            provided. None values are ignored.

    Returns
        One list for each non empty line of the response, containing
        its tab separated fields.
    """

    arguments = [str(arg) for arg in arguments if arg is not None]
    url = '/'.join([_url % operation] + arguments)

    c = curl.Curl(url = url, silent = True, large = False)
    # presumably `result` is None if the download failed: in that case
    # we return an empty list instead of raising an error
    content = getattr(c, 'result', None) or ''

    return [line.split('\t') for line in content.split('\n') if line]


async def _kegg_general_async(
        operation: str,
        *arguments: str,
    ) -> list[list[str]]:
    """
    Placeholder for the asynchronous variant of ``_kegg_general``.

    At the moment it calls the synchronous function.
    """

    #TODO Yet to be implemented
    # This function doesn't work but it better
    # stay so we can implement it without
    # changing the structure of the module

    return _kegg_general(operation, *arguments)


def _kegg_list(
        database: str,
        option: str | None = None,
        organism: str | int | None = None,
    ) -> list[list[str]]:
    """
    The `list` operation of the KEGG API.

    Args
        database:
            Name of the database to be listed.
        option:
            Added to the query only for the `brite` database.
        organism:
            Added to the query only for the `pathway` database.

    Returns
        The rows of the response as lists of fields.
    """

    args = ['list', database]

    if database == 'brite' and option is not None:

        args += common.to_list(option)

    elif database == 'pathway' and organism is not None:

        args += common.to_list(organism)

    return _kegg_general(*args)


def _kegg_conv(
        source_db: str,
        target_db: str,
        source_split: bool = False,
        target_split: bool = False,
    ) -> dict[str, set[str]]:
    """
    The `conv` operation of the KEGG API: ID translation.

    Args
        source_db:
            Database of the IDs to translate from.
        target_db:
            Database of the IDs to translate to.
        source_split:
            Remove the prefix (up to the colon) from the source IDs.
        target_split:
            Remove the prefix (up to the colon) from the target IDs.

    Returns
        A dict with source IDs as keys and sets of target IDs as values.
    """

    result = _kegg_general('conv', target_db, source_db)
    conversion_table = collections.defaultdict(set)

    for row in result:

        if len(row) < 2:

            # not a pair of IDs
            continue

        source, target = row[:2]
        source = source.split(':', maxsplit = 1)[-1] if source_split else source
        target = target.split(':', maxsplit = 1)[-1] if target_split else target
        conversion_table[source].add(target)

    return dict(conversion_table)


def _kegg_link(source_db: str, target_db: str) -> list[list[str]]:
    """
    The `link` operation of the KEGG API: cross-references between
    two databases.
    """

    return _kegg_general('link', target_db, source_db)


def _kegg_ddi(drug_ids: str | Iterable[str], join = True, async_: bool = False):
    """
    The `ddi` operation of the KEGG API: drug-drug interactions.

    Args
        drug_ids:
            One or more drug IDs.
        join:
            Join multiple IDs by "+", to send them in one single query.
        async_:
            Use the asynchronous implementation (which is only a
            placeholder at the moment).

    Returns
        The rows of the response(s) as lists of fields.
    """

    if join and not isinstance(drug_ids, str):

        drug_ids = '+'.join(common.to_list(drug_ids))

    if async_:

        # The coroutine is run in a separate thread, presumably because
        # `asyncio.run` can not be called from a thread that already has
        # a running event loop. The context manager makes sure the
        # worker thread is released afterwards.
        with ThreadPoolExecutor(max_workers = 1) as pool:

            return pool.submit(
                asyncio.run,
                _kegg_ddi_async(drug_ids),
            ).result()

    return _kegg_ddi_sync(drug_ids)


def _kegg_ddi_sync(drug_ids: str | Iterable[str]):
    """
    Queries the drug IDs one by one and concatenates the responses.
    """

    return list(
        itertools.chain.from_iterable(
            _kegg_general('ddi', drug_id)
            for drug_id in common.to_list(drug_ids)
        )
    )


async def _kegg_ddi_async(drug_ids):
    """
    Placeholder for the asynchronous variant of ``_kegg_ddi_sync``.
    """

    #TODO Yet to be implemented
    # This function doesn't work but it better
    # stay so we can implement it without
    # changing the structure of the module

    result = []

    for response in asyncio.as_completed([
        _kegg_general_async('ddi', drug_id)
        for drug_id in common.to_list(drug_ids)
    ]):

        the_response = await response
        result.extend(common.to_list(the_response))

    return result


def _kegg_relations(
        source_db: Literal['gene', 'pathway', 'disease', 'drug'],
        target_db: Literal['gene', 'pathway', 'disease', 'drug'],
        organism: str | None = None,
    ) -> list[tuple]:
    """
    Relations between two types of KEGG entities.

    Args
        source_db:
            Entity type on the source side.
        target_db:
            Entity type on the target side.
        organism:
            KEGG organism code; used only if one of the entity types
            is gene. In that case it defaults to human (`hsa`).

    Returns
        A list of pairs of `KeggEntry` named tuples, each pair consists
        of a source and a target entity.
    """

    if organism is None and 'gene' in (source_db, target_db):

        # human as a default, instead of triggering an error
        organism = 'hsa'

    l_organism = common.to_list(organism)
    data = {}

    # Only these datasets are organism specific. The other classes
    # either ignore or do not accept an organism argument.
    organism_specific = {'gene', 'ncbi', 'uniprot'}

    record = collections.namedtuple(
        'KeggEntry',
        (
            'id',
            'name',
            'type',
            'ncbi_gene_ids',
            'uniprot_ids',
            'chebi_ids',
        )
    )


    def get_data(name, cls_prefix = ''):
        # datasets are loaded upon first use and kept for later entries

        if name not in data:

            cls = f'_{cls_prefix}{name.capitalize()}'
            cls_args = l_organism if name in organism_specific else ()
            data[name] = globals()[cls](*cls_args)

        return data[name]


    def db(name):

        return get_data(name)


    def ids(name):

        return get_data(name, cls_prefix = 'KeggTo')


    def process(entry, type_):

        id_ = db(type_).proc_key(entry)
        name = db(type_).get(id_, None)
        ncbi = ids('ncbi').get(id_) if type_ == 'gene' else ()
        uniprot = ids('uniprot').get(id_) if type_ == 'gene' else ()
        chebi = ids('chebi').get(id_) if type_ == 'drug' else ()

        return record(
            id = id_,
            name = name,
            type = type_,
            ncbi_gene_ids = ncbi,
            uniprot_ids = uniprot,
            chebi_ids = chebi,
        )


    # in the query, genes are addressed by the organism
    args = [
        organism if db_name == 'gene' else db_name
        for db_name in (source_db, target_db)
    ]
    entries = _kegg_link(*args)

    interactions = [
        (process(e[0], source_db), process(e[1], target_db))
        for e in entries
        if len(e) > 1
    ]

    return interactions


class _KeggDatabase(ABC):
    """
    Base class for the listings of KEGG databases.

    The contents are downloaded at instantiation by ``_kegg_list`` and
    stored as a dict. The subclasses define how to process the keys and
    the values.
    """

    _data = None
    # the first argument(s) for `_kegg_list`, typically the database name
    _query_args = None


    def __init__(self, *args):

        self.load(*args)


    @abstractmethod
    def proc_key(self, entry):

        return entry


    @abstractmethod
    def proc_value(self, entry):

        return entry


    def load(self, *args):
        """
        Downloads the listing: keys are created from the first, values
        from the second field of each row.
        """

        entries = _kegg_list(*common.to_list(self._query_args), *args)

        self._data = {
            self.proc_key(entry[0]): self.proc_value(entry[1])
            for entry in entries
            if len(entry) > 1
        }


    def get(self, index, default = None):

        return self._data.get(index, default)


    def __getitem__(self, index):

        return self.get(index)


    @property
    def data(self):

        return self._data


class _Organism(_KeggDatabase):
    """
    The listing of the organisms.

    Keys are created from the second field of the rows, values from the
    first and the third.
    """

    _query_args = 'organism'


    def load(self, *args):

        entries = _kegg_list(*common.to_list(self._query_args), *args)

        self._data = {
            self.proc_key(entry[1]): self.proc_value(entry[0], entry[2])
            for entry in entries
            if len(entry) > 2
        }


    def proc_value(self, entry, *fields):

        # NOTE(review): `load` provides two fields for each value; here
        # we keep them as a tuple -- confirm if this is the intended
        # layout of the records
        return (entry,) + fields if fields else entry


    def proc_key(self, entry):

        return entry


class _Gene(_KeggDatabase):
    """
    The listing of the genes of one organism.

    Values are created from the last field of the rows, keeping only
    the part after the last semicolon.
    """

    def __init__(self, organism):

        super().__init__(organism)


    def load(self, *args):

        entries = _kegg_list(*common.to_list(self._query_args), *args)

        self._data = {
            self.proc_key(entry[0]): self.proc_value(entry[-1])
            for entry in entries
        }


    def proc_key(self, entry):

        return entry


    def proc_value(self, entry):

        return entry.rsplit(';', maxsplit = 1)[-1].strip(' ')


class _Pathway(_KeggDatabase):
    """
    The listing of the pathways.

    Keys consist of the `map` prefix and the first number found in
    the pathway ID.
    """

    _re_pathway = re.compile(r'\d+')
    _query_args = 'pathway'


    def proc_value(self, entry):

        return entry


    def proc_key(self, entry):

        pathway_id = self._re_pathway.search(entry)

        # is this correct?
        # there are pathway prefixes in KEGG other than "map"
        return f'map{pathway_id.group()}'


class _SplitDatabase(_KeggDatabase):
    """
    Base class for listings where the database prefix is removed from
    the IDs, e.g. `dr:D00001` becomes `D00001`.
    """

    def proc_key(self, entry):

        # `entry` is one single field, not a row; taking the last part
        # works both for IDs with and without prefix
        return entry.split(':')[-1]


    def proc_value(self, entry):

        return entry


class _Disease(_SplitDatabase):

    _query_args = 'disease'


class _Drug(_SplitDatabase):

    _query_args = 'drug'


class _Compound(_SplitDatabase):

    _query_args = 'compound'


class _ConversionTable:
    """
    ID translation table between two databases.

    The table is downloaded at instantiation by ``_kegg_conv``.

    Args
        id_types:
            The source and the target database.
        source_split:
            Remove the prefix from the source IDs.
        target_split:
            Remove the prefix from the target IDs.
    """

    def __init__(
            self,
            *id_types: str,
            source_split: bool = False,
            target_split: bool = False,
        ):

        self._id_types = id_types
        self._splits = {
            'source_split': source_split,
            'target_split': target_split,
        }
        # Each instance has its own table. A class level dict would be
        # shared by all the translation tables, and they would overwrite
        # each other's records.
        self._table = {}
        self.load()


    def load(self):

        self._table.update(_kegg_conv(*self._id_types, **self._splits))


    def get(self, index, default = None):

        return self._table.get(index, default)


    def __getitem__(self, index):

        return self._table.get(index, None)


    @property
    def table(self):

        return self._table


class _KeggToNcbi(_ConversionTable):

    def __init__(self, organism):

        super().__init__(organism, 'ncbi-geneid', target_split = True)


class _NcbiToKegg(_ConversionTable):

    def __init__(self, organism):

        super().__init__('ncbi-geneid', organism, source_split = True)


class _KeggToUniprot(_ConversionTable):

    def __init__(self, organism):

        super().__init__(organism, 'uniprot', target_split = True)


class _UniprotToKegg(_ConversionTable):

    def __init__(self, organism):

        super().__init__('uniprot', organism, source_split = True)


class _KeggToChebi(_ConversionTable):

    def __init__(self):

        super().__init__(
            'drug',
            'chebi',
            source_split = True,
            target_split = True,
        )


class _ChebiToKegg(_ConversionTable):

    def __init__(self):

        super().__init__(
            'chebi',
            'drug',
            source_split = True,
            target_split = True,
        )


_generate_relation_functions()
_generate_conv_functions()