Source code for pypath.inputs.chembl

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from typing import Literal

import json
import collections

import pypath.share.curl as curl
import pypath.resources.urls as urls


[docs] def chembl_targets() -> list[tuple]: """ Retrieves targets data from ChEMBL. Returns List of drug target records as named tuples. """ fields_target = ( 'accession', 'target_chembl_id', ) ChemblTarget = collections.namedtuple( 'ChemblTarget', fields_target, defaults = (None,) * len(fields_target), ) tgt_lst = [] page_dct = {} while True: if not page_dct: url = ( f"{urls.urls['chembl']['url']}" f"{urls.urls['chembl']['target']}" ) elif page_dct['page_meta']['next']: url = ( f"{urls.urls['chembl']['url']}" f"{page_dct['page_meta']['next']}" ) else: break c = curl.Curl(url, large=True, silent=False) fileobj = open(c.fileobj.name, encoding='utf-8') page_dct = json.loads(fileobj.read()) tgt_lst.extend( ChemblTarget( accession = ( tgt['target_components'][0]['accession'] if tgt['target_components'] else None ), target_chembl_id = tgt['target_chembl_id'], ) for tgt in page_dct['targets'] ) return tgt_lst
[docs] def chembl_assays() -> list[tuple] : """ Retrieves assays data from ChEMBL. Returns List of assay records as named tuples. """ fields_assay = ( 'assay_chembl_id', 'assay_organism', 'assay_type', 'confidence_score', 'target_chembl_id', ) ChemblAssay = collections.namedtuple( 'ChemblAssay', fields_assay, defaults = (None,) * len(fields_assay), ) assay_lst = [] page_dct = {} while True: if not page_dct: url = ( f"{urls.urls['chembl']['url']}" f"{urls.urls['chembl']['assay']}" ) elif page_dct['page_meta']['next']: url = ( f"{urls.urls['chembl']['url']}" f"{page_dct['page_meta']['next']}" ) else: break c = curl.Curl(url, large=True, silent=False) fileobj = open(c.fileobj.name, encoding='utf-8') page_dct = json.loads(fileobj.read()) assay_lst.extend( ChemblAssay( assay_chembl_id = assy_attr['assay_chembl_id'], assay_organism = assy_attr['assay_organism'], assay_type = assy_attr['assay_type'], confidence_score = assy_attr['confidence_score'], target_chembl_id = assy_attr['target_chembl_id'], ) for assy_attr in page_dct['assays'] ) return assay_lst
[docs] def chembl_molecules() -> list[tuple]: """ Retrieves molecules data from ChEMBL. Returns Molecule records as named tuples. """ def _get(mol, key0, key1): molecule_properties = mol.get(f'molecule_{key0}', {}) if molecule_properties: return molecule_properties.get(key1, None) else: return None fields_molecule = ( 'name', 'alogp', 'canonical_smiles', 'chirality', 'full_mwt', 'heavy_atoms', 'species', 'qed_weighted', 'type', 'structure_type', 'chembl', 'parent_chembl', 'prodrug', 'std_inchi_key', 'std_inchi', 'xrefs', ) ChemblMolecule = collections.namedtuple( 'ChemblMolecule', fields_molecule, defaults = (None,) * len(fields_molecule), ) mol_lst = [] page_dct = {} while True: if not page_dct: url = urls.urls['chembl']['url'] + urls.urls['chembl']['molecule'] c = curl.Curl(url, large=True, silent=False) elif page_dct['page_meta']['next']: url = ( f"{urls.urls['chembl']['url']}" f"{page_dct['page_meta']['next']}" ) else: break c = curl.Curl(url, large=True, silent=False) fileobj = open(c.fileobj.name, encoding='utf-8') page_dct = json.loads(fileobj.read()) mol_lst.extend( ChemblMolecule( name = mol['pref_name'], chirality = mol['chirality'], type = mol['molecule_type'], prodrug = mol['prodrug'], structure_type = mol['structure_type'], chembl = _get(mol, 'hierarchy', 'molecule_chembl_id'), parent_chembl = _get(mol, 'hierarchy', 'parent_chembl_id'), alogp = _get(mol, 'properties', 'alogp'), full_mwt = _get(mol, 'properties', 'full_mwt'), heavy_atoms = _get(mol, 'properties', 'heavy_atoms'), species = _get(mol, 'properties', 'molecular_species'), qed_weighted = _get(mol, 'properties', 'qed_weighted'), canonical_smiles = _get(mol, 'structures', 'canonical_smiles'), std_inchi_key = _get(mol, 'structures', 'standard_inchi_key'), std_inchi = _get(mol, 'structures', 'standard_inchi'), xrefs = ( [ { 'xref_id': rec['xref_id'], 'xref_src': rec['xref_src'], } for rec in mol['cross_references'] ] if mol['cross_references'] else None ) ) for mol in page_dct['molecules'] ) return mol_lst
[docs] def chembl_activities( #TODO: are these below all the allowed values? standard_relation: Literal['=', '>', '<', '>=', '<='], pchembl_value_none: bool = False, ) -> list[tuple] : """ Retrieves activities data from ChEMBL. Args pchembl_value_none: # TODO: it is allowed to be None or must be None? Whether the pchembl value should be none or not. standard_relation: Which standard relation in needed. Returns List of activity records as named tuples. `standard_units` attribute is not included in the returned records. # TODO: then why the data_validity_comment is part of the records? Only records without `data_validity_comment` are returned. """ fields_activity = ( 'assay_chembl', 'data_validity_comment', 'chembl', 'pchembl', 'standard_flag', 'standard_relation', 'standard_value', 'standard_type', 'target_chembl', 'document' ) ChemblActivity = collections.namedtuple( 'ChemblActivity', fields_activity, defaults = (None,) * len(fields_activity), ) activity_lst = [] page_dct = {} while True: if not page_dct: url = ( f"{urls.urls['chembl']['url']}" f"{urls.urls['chembl']['activity']}" f"&pchembl_value__isnull={str(pchembl_value_none).lower()}" f"&standard_relation__exact={standard_relation}" ) elif page_dct['page_meta']['next']: url = ( f"{urls.urls['chembl']['url']}" f"{page_dct['page_meta']['next']}" ) else: break c = curl.Curl(url, large=True, silent=False) fileobj = open(c.fileobj.name, encoding='utf-8') page_dct = json.loads(fileobj.read()) activity_lst.extend( ChemblActivity( assay_chembl = act['assay_chembl_id'], data_validity_comment = act['data_validity_comment'], chembl = act['molecule_chembl_id'], pchembl = act['pchembl_value'], standard_flag = True if act['standard_flag'] == 1 else False, standard_relation = act['standard_relation'], standard_value = act['standard_value'], standard_type = act['standard_type'], target_chembl = act['target_chembl_id'], document = act['document_chembl_id'], ) for act in page_dct['activities'] if act['data_validity_comment'] is None ) return activity_lst
[docs] def chembl_documents() -> dict[str, str] : """ Retrieves ChEMBL document ID to PubMed ID conversion. Returns Dictionary of ChEMBL document IDs as keys and PubMed IDs as values. """ page_dct = {} document_dict = {} while True: if not page_dct: url = ( f"{urls.urls['chembl']['url']}" f"{urls.urls['chembl']['document']}" ) elif page_dct['page_meta']['next']: url = ( f"{urls.urls['chembl']['url']}" f"{page_dct['page_meta']['next']}" ) else: break c = curl.Curl(url, large=True, silent=False) fileobj = open(c.fileobj.name, encoding='utf-8') page_dct = json.loads(fileobj.read()) for doc in page_dct['documents']: if doc['pubmed_id']: document_dict[doc['document_chembl_id']]= doc['pubmed_id'] return document_dict
[docs] def chembl_drug_indications( max_phase_threshold: int = 0, ) -> list[tuple]: """ Retrieves drug indications data from ChEMBL. Args max_phase_threshold: The threshold for maximum phase of the drug for which the indication is valid. Returns List of drug indications as namedtuples. """ fields_indication = ( 'efo_id', 'efo_term', 'max_phase', 'mesh_heading', 'mesh_id', 'molecule_chembl', ) ChemblIndication = collections.namedtuple( 'ChemblIndication', fields_indication, defaults = (None,) * len(fields_indication), ) indication_lst = [] page_dct = {} while True: if not page_dct: url = ( f"{urls.urls['chembl']['url']}" f"{urls.urls['chembl']['drug_indication']}" ) elif page_dct['page_meta']['next']: url = ( f"{urls.urls['chembl']['url']}" f"{page_dct['page_meta']['next']}" ) else: break c = curl.Curl(url, large=True, silent=False) fileobj = open(c.fileobj.name, encoding='utf-8') page_dct = json.loads(fileobj.read()) indication_lst.extend( ChemblIndication( efo_id = ind['efo_id'], efo_term = ind['efo_term'], max_phase = float(ind['max_phase_for_ind']), mesh_heading = ind['mesh_heading'], mesh_id = ind['mesh_id'], molecule_chembl = ind['molecule_chembl_id'], ) for ind in page_dct['drug_indications'] if float(ind['max_phase_for_ind']) > max_phase_threshold and max_phase_threshold != 0 \ or max_phase_threshold == 0 ) return indication_lst
[docs] def chembl_mechanisms() -> list[tuple]: """ Retrieves mechanism data from ChEMBL. Returns List of mechanisms as namedtuples. """ fields_mechanism = ( 'action_type', 'direct_interaction', 'disease_efficacy', 'mechanism_of_action', 'chembl', 'target_chembl', ) ChemblMechanism= collections.namedtuple( 'ChemblMechanism', fields_mechanism, defaults = (None,) * len(fields_mechanism), ) mechanism_lst = [] page_dct = {} while True: if not page_dct: url = ( f"{urls.urls['chembl']['url']}" f"{urls.urls['chembl']['mechanism']}" ) elif page_dct['page_meta']['next']: url = ( f"{urls.urls['chembl']['url']}" f"{page_dct['page_meta']['next']}" ) else: break c = curl.Curl(url, large=True, silent=False) fileobj = open(c.fileobj.name, encoding='utf-8') page_dct = json.loads(fileobj.read()) mechanism_lst.extend( ChemblMechanism( action_type = mech['action_type'], direct_interaction = True if mech['direct_interaction'] == 1 else False, disease_efficacy = True if mech['disease_efficacy'] == 1 else False, mechanism_of_action = mech['mechanism_of_action'], chembl = mech['molecule_chembl_id'], target_chembl = mech['target_chembl_id'], ) for mech in page_dct['mechanisms'] ) return mechanism_lst