Source code for pypath.inputs.pubchem
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of the `pypath` python module
#
# Copyright 2014-2023
# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
# Authors: see the file `README.rst`
# Contact: Dénes Türei (turei.denes@gmail.com)
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# https://www.gnu.org/licenses/gpl-3.0.html
#
# Website: https://pypath.omnipathdb.org/
#
import os
import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.cache as cache
import pypath.share.lookup as lookup
[docs]
def pubchem_mapping(target, source = 'cid'):
    """
    Identifier translation data from PubChem.

    Builds the URL of a PubChem FTP mapping file from the source and target
    ID types, opens it through ``curl.Curl`` and feeds its gzip file object
    to a ``lookup.ManyToMany`` object backed by a file in the cache
    directory, named ``pubchem_<source>_<target>.sqlite``.

    Args:
        target (str): The target ID type, either as it is used in the file
            names in the PubChem FTP service or as simpler, all lowercase
            strings used in this module. Possible values are parent-cid,
            component-cid, inchi, iupac, preferred-cid, sid, smiles,
            synonym. The aliases pubchem-sid, cid and pubchem-cid are also
            recognized; any other value is passed on unchanged as the
            file name component.
        source (str): The source ID type. Either sid or cid (the aliases
            pubchem-sid and pubchem-cid, and the upper case forms SID and
            CID are accepted too).

    Returns:
        The original documentation promises a dict of sets with the source
        identifiers as keys and sets of target identifiers as values.
        NOTE(review): no ``return`` statement is visible in this part of
        the file, hence none is added here -- confirm what callers receive.

    Raises:
        ValueError: If `source` does not resolve to either CID or SID.
    """

    # lowercase aliases -> name components used in the PubChem FTP file names
    id_types = {
        'parent-cid': 'Parent',
        'component-cid': 'Component',
        # NOTE(review): the PubChem FTP file appears to be named
        # `CID-InChI-Key.gz`; verify that `InChi` resolves to a real file
        'inchi': 'InChi',
        'iupac': 'IUPAC',
        'preferred-cid': 'Preferred',
        'pubchem-sid': 'SID',
        'sid': 'SID',
        'smiles': 'SMILES',
        'synonym': 'Synonym-unfiltered',
        'cid': 'CID',
        'pubchem-cid': 'CID',
    }

    # unknown names fall through unchanged, this way the FTP style names
    # can be used directly
    _target = id_types.get(target, target)
    _source = id_types.get(source, source)

    if _source not in {'CID', 'SID'}:

        msg = (
            'The source identifier type must be either CID or SID, '
            'not `%s`.' % source
        )

        # `_log` is neither defined nor imported in the visible part of this
        # module: calling it unconditionally raises NameError and masks the
        # intended ValueError. Use it if the module provides it, otherwise
        # fall back to the standard library logger.
        log = globals().get('_log')

        if not callable(log):

            import logging
            log = logging.getLogger(__name__).error

        log(msg)

        raise ValueError(msg)

    # the FTP directory is determined by the source ID type
    ftp_dir = (
        {
            'SID': 'Substance',
            'CID': 'Compound',
        }[_source]
    )

    url = urls.urls['pubchem']['ftp'] % (ftp_dir, _source, _target)

    c = curl.Curl(url, large = True, silent = False)

    db_path = os.path.join(
        cache.get_cachedir(),
        'pubchem_%s_%s.sqlite' % (_source, _target)
    )

    with lookup.ManyToMany(db_path) as result:

        result.populate(fileobj = c._gzfile_mode_r)