#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#from__future__importannotationsfromfuture.utilsimportiteritemsfrompast.builtinsimportxrange,rangeimportjsonimportosimportsysimporttextwrapimportbs4importpypath.share.progressasprogressimportpypath.share.curlascurlimportpypath.share.commonascommonimportpypath.inputs.unichemasunichem_input
class Unichem(object):
    """
    Client for the UniChem drug compound identifier translation service
    (https://www.ebi.ac.uk/unichem/).

    Every query method stores its outcome in the `result` attribute, a dict
    with the queried identifiers as keys and lists of the identifiers found
    as values. Each call overwrites the result of the previous one.
    """

    # Default values of the connectivity search parameters A-G. Kept in a
    # tuple, so the defaults can not be altered by accident between calls.
    _DEFAULT_SEARCH_PARAMETERS = (1, 0, 0, 0, 0, 1, 0)


    def __init__(self):
        """
        Print a hint about `usage`, load the identifier types and set up
        the URL templates of the web services.
        """

        sys.stdout.write(
            '\n\tType `Unichem_instance.usage()` to get help.\n\n'
        )
        sys.stdout.flush()

        # from unichem id to db name
        self.uc_dict = unichem_input.unichem_sources()
        # from db name to unichem id
        self.name_dict = common.swap_dict(self.uc_dict)

        self.url_stem = 'https://www.ebi.ac.uk/unichem/rest/src_compound_id'
        self.inchi_stem = 'https://www.ebi.ac.uk/unichem/rest/inchikey/%s'
        self.chembl_url = (
            'http://www.ebi.ac.uk/chemblws/compounds/smiles/{0}.json'
        )
        self.cpd_search = 'http://www.ebi.ac.uk/unichem/rest/{0}/{1}/{2}{3}'
        self.result = {}


    def usage(self):
        """
        Prints usage information and examples to the standard output.

        First the identifier types from `uc_dict` are listed in two
        columns, ordered by their numeric ID, then the help text follows.
        """

        msg = '''
        List of identifier types can be read above.
        To query UniChem, give either names or numbers of the
        ID types you wish to translate from and to.
        E.g.

        >>> u = unichem.Unichem()
        >>> u.translate('pubchem', 'chembl', list_of_pubchems)

        For example, the PubChem CID of Aspirin is 2244. Translate it to
        ChEMBL:

        >>> u.translate('pubchem', 'chembl', '2244')
        >>> u.result
        {'2244': ['CHEMBL25']}

        We can translate multiple identifiers the same way:

        >>>
        >>> u.translate('pubchem', 'chembl', ['2244', '4091'])
        >>> u.result
        {'2244': ['CHEMBL25'], '4091': ['CHEMBL1431']}

        Additional ways of translation are from SMILEs to ChEMBL IDs, and
        from InChiKeys to any ID. These are translated not by the UniChem,
        but by the ChEMBL webservice.

        >>> u.translate('smiles', 'chembl', list_of_smiles)
        >>> u.translate('inchikey', 'chembl', list_of_inchikeys)

        Other option to search is connectivity search from UniChem.
        A-G parameters can be defined optionally. See description at
        https://www.ebi.ac.uk/unichem/info/widesearchInfo

        >>> u.connectivity_search(list_of_zincs, 'zinc',
                parameters=[1,0,0,0,0,1,0])

        InChiKeys can be used in connectivity search too:

        >>> u.connectivity_search(list_of_inchikeys, 'inchikey',
                parameters=[1,0,0,0,0,1,0])

        You can also call directly functions accessing ChEMBL webservice,
        with the same result as you would call `translate()` or
        `connectivity_search()`:

        >>> u.smiles2chembl(list_of_smiles)
        >>> u.inchikey2anything('chembl', list_of_inchikeys)

        Find the dict in `u.result`. Untranslated items have an empty
        list as value.
        Every call overwrites previous result!

        For an up to date list of identifier types see
        https://www.ebi.ac.uk/unichem/ucquery/listSources or call
        `Unichem.info(<source>)`:

        >>> Unichem.info('chembl')
        '''

        sys.stdout.write(os.linesep)

        id_types = sorted(self.uc_dict.items(), key=lambda x: int(x[0]))

        # pad to an even number of items, so both columns are equally long
        if len(id_types) % 2:
            id_types.append(('',) * 2)

        nrows = len(id_types) // 2

        # first half of the list goes to the left column, second half to
        # the right one
        for left, right in zip(id_types[:nrows], id_types[nrows:]):
            sys.stdout.write(''.join((
                ' ' * 8,
                left[0].rjust(2),
                ' ' * 3,
                left[1].ljust(20),
                right[0].rjust(2),
                ' ' * 3,
                right[1].ljust(20),
                os.linesep,
            )))

        sys.stdout.write(msg + os.linesep)
        sys.stdout.flush()


    @staticmethod
    def info(source):
        """
        Print information about one source.

        Args
            source (int,str): The numeric or string ID of one source.
        """

        unichem_input.info(source)


    def _source_id(self, id_type):
        """
        Resolve an ID type to the string form of its UniChem numeric ID.

        Args
            id_type (str,int): The ID type, either as a number or as a
                string label listed in `name_dict`.

        Returns
            (str) The numeric ID of the ID type as a string.

        Raises
            KeyError: If a label is not among the keys of `name_dict`.
        """

        if isinstance(id_type, int):
            return str(id_type)

        return self.name_dict[id_type]


    @classmethod
    def _search_parameters(cls, parameters=None):
        """
        Build the list of parameters for a connectivity search URL.

        The values provided are copied, never modified in place: neither
        the defaults nor a list owned by the caller changes.

        Args
            parameters (list,None): Values of the parameters A-G; if None,
                the defaults are used.

        Returns
            (list) The parameters as strings, with "1" appended as
            parameter H.
        """

        if parameters is None:
            parameters = cls._DEFAULT_SEARCH_PARAMETERS

        # H parameter must be 1 to process the result
        return [str(param) for param in parameters] + ['1']


    def translate(self, source, target, lst):
        """
        Translate one drug compound identifier to another identifier type
        using the UniChem web service. For an up to date list of identifier
        types see https://www.ebi.ac.uk/unichem/ucquery/listSources.

        Args
            source (str,int): The source ID type, either as a string label
                or as a number, as used in UniChem. The labels "inchikey"
                and "smiles" are passed over to `inchikey2anything` and
                `smiles2chembl`, respectively.
            target (str,int): The target ID type, either as a string label
                or as a number, as used in UniChem.
            lst (str,set): One or more identifiers to translate.

        Returns
            Returns None, the results are stored in the `result` attribute
            of this object.

        Raises
            KeyError: If a string label is not a known ID type.
        """

        lst = common.to_set(lst)
        self.result = {}

        if source == 'inchikey':
            self.inchikey2anything(target, lst)
            return None

        if source == 'smiles':
            self.smiles2chembl(lst)
            return None

        source = self._source_id(source)
        target = self._source_id(target)

        prg = progress.Progress(
            total=len(lst),
            name='Translating compound identifiers',
            interval=1,
        )

        for comp in lst:
            url = '/'.join([self.url_stem, comp, source, target])
            result = curl.Curl(url, large=False).result
            # failed downloads leave an empty list for this identifier
            self.result[comp] = []

            if result is not None:
                self.result[comp].extend(
                    d['src_compound_id'] for d in json.loads(result)
                )

            prg.step()

        prg.terminate()


    def inchikey2anything(self, target, lst):
        """
        Translate InChi keys to another identifier type using the
        InChi key endpoint defined in `inchi_stem`.

        Args
            target (str,int): The target ID type, either as a string label
                or as a number, as used in UniChem.
            lst (str,set): One or more InChi keys.

        Returns
            Returns None, the results are stored in the `result` attribute
            of this object.

        Raises
            KeyError: If a string label is not a known ID type.
        """

        lst = common.to_set(lst)
        self.result = {}
        target = self._source_id(target)

        prg = progress.Progress(
            total=len(lst),
            name='Translating InChi-Keys',
            interval=1,
        )

        for inchik in lst:
            url = self.inchi_stem % inchik
            result = curl.Curl(url, large=False).result
            # like in the other methods, every queried identifier gets an
            # entry, even if the download failed
            self.result[inchik] = []

            if result is not None:
                # the response covers all ID types, we keep only the target
                self.result[inchik] = [
                    d['src_compound_id']
                    for d in json.loads(result)
                    if d['src_id'] == target
                ]

            prg.step()

        prg.terminate()


    def smiles2chembl(self, smiles):
        """
        Translate SMILES to ChEMBL ID using the ChEMBL web service.

        All compounds in the response are accepted, their SMILES are not
        compared to the one in the query.

        Args
            smiles (str,list): One or more SMILES.

        Returns
            Returns None, the results are stored in the `result` attribute
            of this object.
        """

        smiles = common.to_set(smiles)
        self.result = {}

        prg = progress.Progress(
            total=len(smiles),
            name='Translating SMILEs',
            interval=1,
        )

        for sml in smiles:
            url = self.chembl_url.format(sml)
            result = curl.Curl(url, large=False).result
            self.result[sml] = []

            if result is not None:
                try:
                    data = json.loads(result)
                except ValueError:
                    # the response is not JSON: fall back to processing
                    # it as markup; the parser is named explicitly, so the
                    # outcome does not depend on the parsers installed
                    soup = bs4.BeautifulSoup(result, 'html.parser')
                    self.result[sml].extend(
                        compound.find('chemblid').text
                        for compound in soup.find_all('compound')
                    )
                else:
                    self.result[sml].extend(
                        d['chemblId'] for d in data['compounds']
                    )

            prg.step()

        prg.terminate()


    def connectivity_search(
            self,
            id_list: str | set,
            id_type: str | int,
            parameters: list[int] | None = None,
        ):
        """
        Search for structurally and chemically similar compounds based on
        cheminformatics similarity metrics. Read more at
        https://www.ebi.ac.uk/unichem/info/widesearchInfo.

        Args
            id_list: One or more identifiers to query.
            id_type: Type of the identifiers, either as a string label or a
                number as used by UniChem. SMILES is not available in this
                type of query: for "smiles" the result is left empty.
            parameters: A list of parameters A-G as described in
                https://www.ebi.ac.uk/unichem/info/widesearchInfo.
                Parameter H is always appended with value 1. If None,
                the defaults `[1, 0, 0, 0, 0, 1, 0]` are used. The list
                is not modified.

        Returns
            Returns None, the results are stored in the `result` attribute
            of this object.

        Raises
            KeyError: If a string label is not a known ID type.
        """

        id_list = common.to_set(id_list)
        # the same parameters apply to all queries, build the path once
        param_path = '/'.join(self._search_parameters(parameters))
        self.result = {}

        if id_type == 'inchikey':
            id_type = ''
            method = 'key_search'
        elif id_type == 'smiles':
            return None
        else:
            id_type = '%s/' % self._source_id(id_type)
            method = 'cpd_search'

        prg = progress.Progress(
            total=len(id_list),
            name='Connectivity search',
            interval=1,
        )

        for i in id_list:
            prg.step()
            url = self.cpd_search.format(method, i, id_type, param_path)
            result = curl.Curl(url, large=False).result
            self.result[i] = []

            if result is not None:
                data = json.loads(result)
                # in each value the first element is skipped, from the
                # rest the first field is collected; the set removes
                # the duplicates
                hits = {
                    row[0]
                    for rows in data.values()
                    for row in rows[1:]
                }
                self.result[i] = list(hits)

        prg.terminate()