Source code for pypath.utils.unichem

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

from future.utils import iteritems
from past.builtins import xrange, range

import json
import os
import sys
import textwrap

import bs4

import pypath.share.progress as progress
import pypath.share.curl as curl
import pypath.share.common as common
import pypath.inputs.unichem as unichem_input



[docs]
class Unichem(object):
    """
    Client for the UniChem drug compound identifier translation service
    (https://www.ebi.ac.uk/unichem/).
    """


[docs]
    def __init__(self):
        """
        Prints a short hint about `usage()` to the standard output, then
        sets up the identifier type lookup tables, the URL templates of
        the web services and an empty `result` dict.
        """

        hint = '\n\tType `Unichem_instance.usage()` to get help.\n\n'
        sys.stdout.write(hint)
        sys.stdout.flush()

        # URL templates used by the query methods of this class
        self.url_stem = 'https://www.ebi.ac.uk/unichem/rest/src_compound_id'
        self.inchi_stem = 'https://www.ebi.ac.uk/unichem/rest/inchikey/%s'
        self.chembl_url = (
            'http://www.ebi.ac.uk/chemblws/compounds/smiles/{0}.json'
        )
        self.cpd_search = 'http://www.ebi.ac.uk/unichem/rest/{0}/{1}/{2}{3}'

        # lookup table: UniChem source ID -> database name
        sources = unichem_input.unichem_sources()
        self.uc_dict = sources
        # the same table reversed: database name -> UniChem source ID
        self.name_dict = common.swap_dict(sources)

        # the outcome of the most recent query is kept here
        self.result = {}





[docs]
    def usage(self):
        """
        Prints usage information and examples to the standard output.
        """

        msg = '''
        List of identifier types can be read above.
        To query UniChem, give either names or numbers of the
        ID types you wish to translate from and to.
        E.g.
        >>> u = unichem.Unichem()
        >>> u.translate('pubchem', 'chembl', list_of_pubchems)

        For example, the PubChem CID of Aspirin is 2244. Translate it to
        ChEMBL:
        >>> u.translate('pubchem', 'chembl', '2244')
        >>> u.result
        {'2244': ['CHEMBL25']}

        We can translate multiple identifiers the same way:
        >>>
        >>> u.translate('pubchem', 'chembl', ['2244', '4091'])
        >>> u.result
        {'2244': ['CHEMBL25'], '4091': ['CHEMBL1431']}

        Additional ways of translation are from SMILEs to ChEMBL IDs, and
        from InChiKeys to any ID. These are translated not by the UniChem,
        but by the ChEMBL webservice.
        >>> u.translate('smiles', 'chembl', list_of_smiles)
        >>> u.translate('inchikey', 'chembl', list_of_inchikeys)

        Other option to search is connectivity search from UniChem.
        A-G parameters can be defined optionally. See description at
        https://www.ebi.ac.uk/unichem/info/widesearchInfo
        >>> u.connectivity_search(list_of_zincs, 'zinc', parameters=[1,0,0,0,0,1,0])

        InChiKeys can be used in connectivity search too:
        >>> u.connectivity_search(list_of_inchikeys, 'inchikey', parameters=[1,0,0,0,0,1,0])

        You can also call directly functions accessing ChEMBL webservice, with the
        same result as you would call `translate()` or `connectivity_search()`:
        >>> u.smiles2chembl(list_of_smiles)
        >>> u.inchikey2anything('chembl', list_of_inchikeys)

        Find the dict in `u.result`. Untranslated items have value `None`.
        Every call overwrites previous result!

        For an up to date list of identifier types see
        https://www.ebi.ac.uk/unichem/ucquery/listSources or
        call `Unichem.info(<source>)`:
        >>> Unichem.info('chembl')
        '''

        sys.stdout.write(os.linesep)

        id_types = sorted(
            self.uc_dict.items(),
            key = lambda x: int(x[0])
        )

        if len(id_types) % 2:

            id_types.append(('',) * 2)

        nrows = len(id_types) // 2

        for i in xrange(nrows):

            sys.stdout.write(
                ''.join((
                    ' ' * 8,
                    id_types[i][0].rjust(2),
                    ' ' * 3,
                    id_types[i][1].ljust(20),
                    id_types[i + nrows][0].rjust(2),
                    ' ' * 3,
                    id_types[i + nrows][1].ljust(20),
                    os.linesep,
                ))
            )

        sys.stdout.write(msg + os.linesep)
        sys.stdout.flush()




[docs]
    @staticmethod
    def info(source: int | str) -> None:
        """
        Print information about one source.

        Thin wrapper: the argument is handed over unchanged to the `info`
        function of `pypath.inputs.unichem`, and whatever that function
        returns is discarded. As a static method it can be called on the
        class itself, no instance is required.

        Args
            source (int,str): The numeric or string ID of one source.

        Returns
            None.
        """

        unichem_input.info(source)




[docs]
    def translate(self, source, target, lst):
        """
        Translate one drug compound identifier to another identifier type
        using the UniChem web service. For an up to date list of identifier
        types see https://www.ebi.ac.uk/unichem/ucquery/listSources.

        Two source types are handled by other methods of this class:
        `inchikey` is passed to `inchikey2anything` and `smiles` to
        `smiles2chembl` (in the latter case `target` is ignored).

        Args
            source (str,int): The source ID type, either as a string label
                or as a number, as used in UniChem.
            target (str,int): The target ID type, either as a string label
                or as a number, as used in UniChem.
            lst (str,set): One or more identifiers to translate. Items
                which are not strings are converted to strings for the
                query, but remain unchanged as keys of the result.

        Returns
            Returns None, the results are stored in the `result` attribute
            of this object: a dict with the queried identifiers as keys
            and lists of the translated identifiers as values. The list
            is empty if the download returned nothing.

        Raises
            KeyError: If a string label is not among the keys of
                `self.name_dict`.
        """

        lst = common.to_set(lst)
        self.result = {}

        if source == 'inchikey':

            self.inchikey2anything(target, lst)
            return None

        if source == 'smiles':

            self.smiles2chembl(lst)
            return None

        # numbers are used as they are, labels are looked up by name
        source = (
            str(source)
                if isinstance(source, int) else
            self.name_dict[source]
        )
        target = (
            str(target)
                if isinstance(target, int) else
            self.name_dict[target]
        )

        prg = progress.Progress(
            total=len(lst),
            name='Translating compound identifiers',
            interval=1,
        )

        for comp in lst:

            # `str.join` accepts only strings, while identifiers such as
            # PubChem CIDs might be provided as integers
            url = '/'.join([self.url_stem, str(comp), source, target])
            c = curl.Curl(url, large = False)
            result = c.result
            self.result[comp] = []

            if result is not None:

                self.result[comp].extend(
                    d['src_compound_id']
                    for d in json.loads(result)
                )

            prg.step()

        prg.terminate()




[docs]
    def inchikey2anything(self, target, lst):
        """
        Translate InChi keys to another identifier type. One query is
        sent for each key to the URL built from `self.inchi_stem`, and
        from the records in the response the ones belonging to the
        target ID type are kept.

        Args
            target (str,int): The target ID type, either as a string label
                or as a number, as used in UniChem.
            lst (str,set): One or more InChi keys.

        Returns
            Returns None, the results are stored in the `result` attribute
            of this object: a dict with the InChi keys as keys and lists
            of the translated identifiers as values. The list is empty if
            the download returned nothing or no record matched the target.

        Raises
            KeyError: If `target` is a string label which is not among
                the keys of `self.name_dict`.
        """

        lst = common.to_set(lst)

        self.result = {}
        # numbers are used as they are, labels are looked up by name
        target = (
            str(target)
                if isinstance(target, int) else
            self.name_dict[target]
        )
        prg = progress.Progress(
            total=len(lst),
            name='Translating InChi-Keys',
            interval=1,
        )

        for inchik in lst:

            url = self.inchi_stem % inchik
            c = curl.Curl(url, large = False)
            result = c.result
            # each queried key gets an entry even if the download failed,
            # the same way as in the other query methods of this class
            self.result[inchik] = []

            if result is not None:

                data = json.loads(result)
                self.result[inchik] = [
                    d['src_compound_id']
                    for d in data
                    if d['src_id'] == target
                ]

            prg.step()

        prg.terminate()




[docs]
    def smiles2chembl(self, smiles):
        """
        Translate SMILES to ChEMBL ID using the ChEMBL web service.

        The response is processed as JSON; if it can not be decoded as
        JSON, it is parsed as markup and the identifiers are collected
        from its `compound` elements. All compounds in the response are
        accepted, their SMILES are not compared to the queried one.

        Args
            smiles (str,list): One or more SMILES.

        Returns
            Returns None, the results are stored in the `result` attribute
            of this object: a dict with the SMILES as keys and lists of
            ChEMBL IDs as values. The list is empty if the download
            returned nothing.
        """

        smiles = common.to_set(smiles)

        self.result = {}
        prg = progress.Progress(
            total=len(smiles),
            name='Translating SMILEs',
            interval=1
        )

        for sml in smiles:

            url = self.chembl_url.format(sml)
            c = curl.Curl(url, large = False)
            result = c.result
            self.result[sml] = []

            if result is not None:

                try:

                    data = json.loads(result)

                except ValueError:

                    # Not JSON: fall back to markup parsing. The parser is
                    # named explicitly, otherwise the choice depends on
                    # the packages installed and bs4 emits a warning.
                    # This parser converts tag names to lowercase, hence
                    # the lowercase `chemblid`.
                    soup = bs4.BeautifulSoup(result, 'html.parser')
                    self.result[sml].extend(
                        compound.find('chemblid').text
                        for compound in soup.find_all('compound')
                    )

                else:

                    self.result[sml].extend(
                        d['chemblId']
                        for d in data['compounds']
                    )

            prg.step()

        prg.terminate()




[docs]
    def connectivity_search(
            self,
            id_list: str | set,
            id_type: str | int,
            parameters: list[int] = [1, 0, 0, 0, 0, 1, 0]
        ):
        """
        Search for structurally and chemically similar compounds based on
        cheminformatics similarity metrics. Read more at
        https://www.ebi.ac.uk/unichem/info/widesearchInfo.

        Args
            id_list:
                One or more identifiers to query.
            id_type:
                Type of the identifiers, either as a string
                label or a number as used by UniChem. SMILES is not
                available in this type of query.
            parameters:
                A list of parameters A-H as described in
                https://www.ebi.ac.uk/unichem/info/widesearchInfo.

        Returns
            Returns None, the results are stored in the `result` attribute
            of this object.
        """

        id_list = common.to_set(id_list)

        parameters.append(1)  # H parameter must be 1 to process the result
        parameters = [str(i) for i in parameters]
        self.result = {}

        if id_type == 'inchikey':

            id_type = ''
            method = 'key_search'

        elif id_type == 'smiles':

            return None

        else:

            id_type = (
                str(id_type)
                    if type(id_type) is int else
                self.name_dict[id_type]
            )
            id_type = '%s/' % id_type
            method = 'cpd_search'

        prg = progress.Progress(
            total=len(id_list),
            name='Connectivity search',
            interval=1
        )

        for i in id_list:

            prg.step()
            url = self.cpd_search.format(
                method,
                i,
                id_type,
                '/'.join(parameters)
            )
            c = curl.Curl(url, large = False)
            result = c.result
            self.result[i] = []

            if result is not None:

                data = json.loads(result)

                for k, v in iteritems(data):

                    for j in xrange(1, len(v)):

                        self.result[i].append(v[j][0])

            self.result[i] = list(set(self.result[i]))

        prg.terminate()