# Source code for pypath.inputs.drugbank

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

from typing import Optional

import os
import re
import csv
import collections
import base64
from lxml import etree
from zipfile import ZipFile

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.common as common
import pypath.share.session as session
import pypath.share.settings as settings
import pypath.inputs.credentials as credentials

_logger = session.Logger(name = 'drugbank_input')
_log = _logger._log


def _drugbank_credentials(
        user: Optional[str] = None,
        passwd: Optional[str] = None,
        credentials_fname: Optional[str] = None,
    ) -> dict[str, str]:
    """
    Looks up the login credentials for the DrugBank website.

    Thin wrapper around ``credentials.credentials`` with the resource
    name fixed to ``'DrugBank'``.

    Args
        user: E-mail address with registered DrugBank account.
        passwd: Password for the DrugBank account.
        credentials_fname: File name or path to a file with DrugBank
            login credentials.

    Returns
        Whatever ``credentials.credentials`` returns; the caller in this
        module reads its ``'user'`` and ``'passwd'`` keys, hence it is
        annotated as a dict here.
    """

    return credentials.credentials(
        user = user,
        passwd = passwd,
        resource = 'DrugBank',
        from_file = credentials_fname,
    )


def _drugbank_download(
        *args,
        user: Optional[str] = None,
        passwd: Optional[str] = None,
        credentials_fname: Optional[str] = None,
        **kwargs
    ) -> Optional[curl.Curl]:
    """
    Creates a ``curl.Curl`` download with DrugBank basic authentication.

    Args
        args: Positional arguments passed on to ``curl.Curl``.
        user: E-mail address with registered DrugBank account.
        passwd: Password for the DrugBank account.
        credentials_fname: File name or path to a file with DrugBank
            login credentials.
        kwargs: Keyword arguments passed on to ``curl.Curl``; they
            override the defaults ``large = True``, ``silent = False``
            and ``compr = 'zip'``. The ``req_headers`` key is always
            replaced by the authorization headers built here.

    Returns
        The ``curl.Curl`` object, or None if the credentials lookup
        raised ``RuntimeError``.
    """

    try:

        login = _drugbank_credentials(
            user = user,
            passwd = passwd,
            credentials_fname = credentials_fname,
        )

    except RuntimeError:

        _log('No credentials available for the DrugBank website.')

        return None

    # HTTP basic auth: base64 of "user:password"
    token = base64.b64encode(
        f"{login['user']}:{login['passwd']}".encode()
    ).decode()

    curl_args = {
        'large': True,
        'silent': False,
        'compr': 'zip',
        **kwargs,
        'req_headers': [
            f'Authorization: Basic {token}',
            settings.get('user_agent'),
        ],
    }

    return curl.Curl(*args, **curl_args)


def drugbank_raw_interactions(
        user: Optional[str] = None,
        passwd: Optional[str] = None,
        credentials_fname: Optional[str] = None,
        pharma_active: bool = False,
    ) -> list[tuple]:
    """
    Retrieves protein identifiers from Drugbank.

    Args
        user: E-mail address with registered DrugBank account.
        passwd: Password for the DrugBank account.
        credentials_fname: File name or path to a file with DrugBank
            login credentials.
        pharma_active: Only pharmacologically active relations.

    Returns
        List of drug-protein relations. Relation types for which the
        download is not available are skipped.
    """

    csv_name = 'pharmacologically_active.csv' if pharma_active else 'all.csv'

    fields = (
        'drugbank_id',
        'uniprot_id',
        'relation',
    )

    DrugbankRawInteraction = collections.namedtuple(
        'DrugbankRawInteraction',
        fields,
        defaults = (None,) * len(fields),
    )

    result = []

    for rel in ('carrier', 'enzyme', 'target', 'transporter'):

        c = _drugbank_download(
            url = urls.urls['drugbank'][f'drug_{rel}_identifiers'],
            user = user,
            passwd = passwd,
            credentials_fname = credentials_fname,
            files_needed = (csv_name,),
        )

        if not c:

            continue

        # A proper CSV parser is needed here: splitting the line on commas
        # shifts the column positions whenever a quoted field (e.g. a
        # protein name) itself contains a comma.
        reader = csv.reader(c.result[csv_name])

        # discard the header row; tolerate an empty file
        next(reader, None)

        for row in reader:

            # blank or truncated lines have no UniProt column
            if len(row) < 6:

                continue

            uniprot = row[5]

            # the last column is a semicolon separated list of drug IDs
            result.extend(
                DrugbankRawInteraction(
                    drugbank_id = drug.strip(),
                    uniprot_id = uniprot,
                    relation = rel,
                )
                for drug in row[-1].strip().split(';')
            )

    return result
def drugbank_interactions(
        user: Optional[str] = None,
        passwd: Optional[str] = None,
        credentials_fname: Optional[str] = None,
        pharma_active: bool = False,
    ) -> list[tuple]:
    """
    Drug-protein and protein-drug interactions from Drugbank.

    Drugs are represented by their PubChem compound IDs; relations of
    drugs without such an ID are omitted. For the "target" relation the
    drug is the source and the protein is the target, for all other
    relations the protein is the source and the drug is the target.

    Args
        user: E-mail address with registered DrugBank account.
        passwd: Password for the DrugBank account.
        credentials_fname: File name or path to a file with DrugBank
            login credentials.
        pharma_active: Only pharmacologically active interactions.

    Returns
        List of drug-protein and protein-drug interactions.
    """

    raw = drugbank_raw_interactions(
        user = user,
        passwd = passwd,
        pharma_active = pharma_active,
        credentials_fname = credentials_fname,
    )

    # The same credentials must be used for both downloads, hence
    # `credentials_fname` is forwarded here too.
    drugs = {
        d.drugbank: d
        for d in drugbank_drugs(
            user = user,
            passwd = passwd,
            credentials_fname = credentials_fname,
        )
    }

    DrugbankInteraction = collections.namedtuple(
        'DrugbankInteraction',
        (
            'source',
            'target',
            'source_entity_type',
            'target_entity_type',
            'interaction_type',
        )
    )

    result = []

    for r in raw:

        drug = drugs.get(r.drugbank_id)

        # TODO: later engage the mapping module here
        if not (drug and drug.pubchem_cid):

            continue

        partners = (r.uniprot_id, drug.pubchem_cid)
        entity_types = ('protein', 'drug')

        if r.relation == 'target':

            # the drug acts on its target: drug -> protein
            partners = partners[::-1]
            entity_types = entity_types[::-1]

        result.append(
            DrugbankInteraction(
                *partners,
                *entity_types,
                interaction_type = r.relation,
            )
        )

    return result
def drugbank_drugs(
        user: Optional[str] = None,
        passwd: Optional[str] = None,
        credentials_fname: Optional[str] = None,
    ) -> list[tuple]:
    """
    Retrieves drug identifiers from Drugbank.

    Each drug is annotated by its various database cross-references.
    The records are built from the "structure links.csv" table and
    complemented by the "drug links.csv" table.

    Args
        user: E-mail address with registered DrugBank account.
        passwd: Password for the DrugBank account.
        credentials_fname: File name or path to a file with DrugBank
            login credentials.

    Returns
        List of named tuples, each field corresponding to various
        identifiers. Empty list if the structure table is not available.
    """

    fields = (
        'drugbank',
        'name',
        'type',
        'groups',
        'cas',
        'inchikey',
        'inchi',
        'smiles',
        'formula',
        'kegg_compound',
        'kegg_drug',
        'pubchem_cid',
        'pubchem_sid',
        'chebi',
        'chembl',
        'pharmgkb',
        'het',
    )

    raw = {}

    for table in ('drug', 'structure'):

        csv_ = f'{table} links.csv'

        c = _drugbank_download(
            url = urls.urls['drugbank'][f'all_{table}s'],
            user = user,
            passwd = passwd,
            credentials_fname = credentials_fname,
            files_needed = (csv_,),
        )

        if not c:

            continue

        raw[table] = {
            rec['DrugBank ID']: rec
            for rec in csv.DictReader(c.result[csv_], delimiter = ',')
        }

    # Any of the tables might be missing if its download was skipped
    # above; without these fallbacks the lookups below raise KeyError.
    structures = raw.get('structure', {})
    drug_links = raw.get('drug', {})

    DrugbankDrug = collections.namedtuple(
        'DrugbankDrug',
        fields,
        defaults = (None,) * len(fields),
    )

    result = []

    for dbid, struct in structures.items():

        drug = drug_links.get(dbid, {})

        result.append(
            DrugbankDrug(
                drugbank = dbid,
                name = struct['Name'],
                type = drug.get('Drug Type', None),
                groups = struct['Drug Groups'],
                cas = struct['CAS Number'],
                inchikey = struct['InChIKey'],
                inchi = struct['InChI'],
                smiles = struct['SMILES'],
                formula = struct['Formula'],
                kegg_compound = struct['KEGG Compound ID'],
                kegg_drug = struct['KEGG Drug ID'],
                pubchem_cid = struct['PubChem Compound ID'],
                pubchem_sid = struct['PubChem Substance ID'],
                chebi = struct['ChEBI ID'],
                chembl = struct['ChEMBL ID'],
                pharmgkb = drug.get('PharmGKB ID', None),
                het = drug.get('HET ID', None),
            )
        )

    return result
def drugbank_annotations(
        user: Optional[str] = None,
        passwd: Optional[str] = None,
        credentials_fname: Optional[str] = None,
    ) -> dict[str, set[tuple]]:
    """
    Drug annotations from Drugbank.

    The annotations are restricted to the drug molecule type and drug status.

    Args
        user: E-mail address with registered DrugBank account.
        passwd: Password for the DrugBank account.
        credentials_fname: File name or path to a file with DrugBank
            login credentials.

    Returns
        Dict of sets of annotation tuples; the keys are the
        ``pubchem_cid`` values of the drug records. Drugs without
        ``pubchem_cid`` are omitted.
    """

    drugs = drugbank_drugs(
        user = user,
        passwd = passwd,
        credentials_fname = credentials_fname,
    )

    DrugbankAnnotation = collections.namedtuple(
        'DrugbankAnnotation',
        (
            'type',
            'status',
        )
    )

    # raw string: "\s" is an invalid escape sequence in a plain literal
    recomma = re.compile(r',\s*')

    result = collections.defaultdict(set)

    for d in drugs:

        if not d.pubchem_cid:

            continue

        result[d.pubchem_cid].add(
            DrugbankAnnotation(
                type = d.type,
                # the groups field might be missing (None) in a record
                status = recomma.sub(';', d.groups) if d.groups else d.groups,
            )
        )

    return dict(result)
def drugbank_mapping(
        id_type: str,
        target_id_type: str,
        user: Optional[str] = None,
        passwd: Optional[str] = None,
        credentials_fname: Optional[str] = None,
    ) -> dict[str, set[str]]:
    """
    Identifier translation table from DrugBank.

    Available ID types: drugbank, name, type, groups, cas, inchikey,
    inchi, smiles, formula, kegg_compound, kegg_drug, pubchem_compound,
    pubchem_substance, chebi, chembl, pharmgkb, het.

    The ID type labels are lowercased, a trailing "id" together with the
    character before it is removed (unless that character is "c" or "s",
    as in "pubchem_cid"), and spaces are replaced by underscores.

    Args
        id_type: The identifier type to be used as keys.
        target_id_type: The identifier type that will be collected into the
            values.
        user: E-mail address with registered DrugBank account.
        passwd: Password for the DrugBank account.
        credentials_fname: File name or path to a file with DrugBank
            login credentials.

    Returns
        An identifier translation table.
    """

    aliases = {
        'pubchem_compound': 'pubchem_cid',
        'pubchem_substance': 'pubchem_sid',
    }

    def normalize(label):
        # from a user provided label to a field name of the drug records
        label = re.sub('[^cs]id$', '', label.lower())
        label = label.replace(' ', '_')

        return aliases.get(label, label)

    records = drugbank_drugs(
        user = user,
        passwd = passwd,
        credentials_fname = credentials_fname,
    )

    key_field = normalize(id_type)
    value_field = normalize(target_id_type)

    table = collections.defaultdict(set)

    for rec in records:

        key = getattr(rec, key_field)
        value = getattr(rec, value_field)

        # records lacking either of the identifiers are not included
        if key and value:

            table[key].add(value)

    return dict(table)
class DrugbankFull:
    """
    This is a wrapper around the Drugbank full database XML file.
    Provides access to the full Drugbank database.

    The class provides four methods: `drugbank_drugs_full` returns a list
    of namedtuples, each of which represents a drug;
    `drugbank_targets_full` returns a list of namedtuples, each of which
    represents a target of a drug; `drugbank_external_ids_full` and
    `drugbank_properties_full` return dicts of external identifiers and
    calculated properties, respectively, keyed by DrugBank ID.

    Args
        user: E-mail address with registered DrugBank account.
        passwd: Password for the DrugBank account.
        credentials_fname: File name or path to a file with DrugBank
            login credentials.
    """

    def __init__(
            self,
            user: Optional[str] = None,
            passwd: Optional[str] = None,
            credentials_fname: Optional[str] = None,
        ):
        """
        Downloads the full database archive, unpacks it into the directory
        of the downloaded file and parses "full database.xml".

        Raises
            RuntimeError: If the download is not available (no DrugBank
                credentials).
        """

        c = _drugbank_download(
            url = urls.urls['drugbank']['full_database'],
            user = user,
            passwd = passwd,
            credentials_fname = credentials_fname,
        )

        # The download helper returns None if no credentials are
        # available; fail with a clear message instead of an attribute
        # error on None.
        if c is None:

            raise RuntimeError(
                'The DrugBank full database is not available: '
                'no credentials for the DrugBank website.'
            )

        path = c.fileobj.name
        dirname = os.path.dirname(path)

        with ZipFile(path, 'r') as zip_ref:

            zip_ref.extractall(dirname)

        self.tree = etree.ElementTree(
            file = os.path.join(dirname, 'full database.xml')
        )

        # XPath does not accept the default (None) namespace prefix,
        # hence we register it under the "db" prefix.
        ns = dict(self.tree.getroot().nsmap)
        ns['db'] = ns.pop(None)
        self.ns = ns

        self.drugs = self.tree.xpath('db:drug', namespaces = self.ns)


    def _primary_id(self, drug) -> str:
        """
        Text of the `drugbank-id` child element of a drug which carries
        the attribute `primary="true"`.

        Raises
            IndexError: If the drug has no primary DrugBank ID.
        """

        primary = [
            i
            for i in drug.xpath('db:drugbank-id', namespaces = self.ns)
            if i.attrib.get('primary') == 'true'
        ]

        return primary[0].text


    @staticmethod
    def _collapse(value):
        """
        Tidies up one field value of a drug record: from collections the
        empty items are removed and Windows line breaks are replaced by
        spaces; single element collections are replaced by their only
        element; anything empty becomes None.
        """

        if value and not isinstance(value, str):

            value = [elem.replace('\r\n', ' ') for elem in value if elem]

            if len(value) == 1:

                value = value[0]

        return value or None


    def drugbank_drugs_full(
            self,
            fields: str | list[str] | None = None,
        ) -> list[tuple]:
        """
        Returns a list of namedtuples containing detailed information about
        drugs.

        Args
            fields: The fields to return. If None, all fields known to this
                method are returned. The `drugbank_id` field is always
                included. Default: None

        Returns
            A list of namedtuples containing information about drugs.
        """

        basic_fields = [
            'drugbank_id',
            'type',
            'name',
            'description',
            'cas_number',
            'unii',
            'average_mass',
            'monoisotopic_mass',
            'state',
            'synthesis_reference',
            'indication',
            'pharmacodynamics',
            'mechanism_of_action',
            'toxicity',
            'metabolism',
            'absorption',
            'half_life',
            'protein_binding',
            'route_of_elimination',
            'volume_of_distribution',
            'clearance',
            'fda_label',
            'msds',
        ]

        # fields with nested elements: the path below the element of the
        # field, and optionally the attribute to read instead of the text
        fields_w_subfields = {
            'groups': {'path': '/db:group'},
            'general_references': {'path': '/db:articles/db:article/db:pubmed-id'},
            'classification': {'path': '/db:class'},
            'synonyms': {'path': '/db:synonym'},
            'products': {'path': '/db:product/db:name'},
            'international_brands': {'path': '/db:international-brand/db:name'},
            'mixtures': {'path': '/db:mixture/db:name'},
            'packagers': {'path': '/db:packager/db:name'},
            'manufacturers': {'path': '/db:manufacturer/db:name'},
            'categories': {'path': '/db:category/db:mesh-id'},
            'affected_organisms': {'path': '/db:affected-organism'},
            'atc_codes': {'path': '/db:atc-code', 'key': 'code'},
            'ahfs_codes': {'path': '/db:ahfs-code', 'key': 'code'},
            'pdb_entries': {'path': '/db:pdb-entry'},
            'patents': {'path': '/db:patent/db:number'},
            'food_interactions': {'path': '/db:food-interaction'},
            'drug_interactions': {'path': '/db:drug-interaction/db:drugbank-id'},
            'pathways': {'path': '/db:pathway/db:smpdb-id'},
        }

        # TODO: later process and engage fields below
        # future_fields: 'salts', 'prices', 'dosages', 'sequences',
        #     'experimental_properties', 'external_links',
        #     'reactions', 'snp_effects', 'snp_adverse_drug_reactions'

        fields = fields or basic_fields + list(fields_w_subfields)
        # work on a copy, so the list of the caller is never modified
        fields = list(common.to_list(fields))

        if 'drugbank_id' not in fields:

            fields.insert(0, 'drugbank_id')

        record = collections.namedtuple('DrugbankDrug', fields)
        result = []

        for drug in self.drugs:

            field_dict = {}

            for field in fields:

                if field == 'drugbank_id':

                    value = self._primary_id(drug)

                elif field == 'type':

                    value = drug.get('type')

                else:

                    sub = fields_w_subfields.get(field, {})
                    path = f"db:{field.replace('_', '-')}{sub.get('path', '')}"
                    nodes = drug.xpath(path, namespaces = self.ns)

                    if 'key' in sub:

                        value = {node.get(sub['key']) for node in nodes}

                    else:

                        value = {node.text for node in nodes}

                field_dict[field] = self._collapse(value)

            result.append(record(**field_dict))

        return result


    def drugbank_targets_full(
            self,
            fields: str | list[str] | None = None,
        ) -> list[tuple]:
        """
        Returns a list of namedtuples containing detailed information about
        drug-target interactions.

        Args
            fields: The fields to return. The `drugbank_id` field is always
                included; the value of unknown fields is None.
                Default: None

        Returns
            A list of namedtuples containing information about the target
            of drugs.
        """

        all_fields = [
            'drugbank_id',
            'id',
            'name',
            'organism',
            'actions',
            'references',
            'known_action',
            'polypeptide',
        ]

        # work on a copy, so the list of the caller is never modified
        fields = list(common.to_list(fields or all_fields))

        if 'drugbank_id' not in fields:

            fields.insert(0, 'drugbank_id')

        record = collections.namedtuple('DrugbankTarget', fields)

        # fields read from the text of a direct child element
        direct_fields = {'id', 'name', 'organism', 'known_action'}
        # fields read from the text of nested elements
        nested_paths = {
            'actions': 'db:actions/db:action',
            'references': 'db:references/db:articles/db:article/db:pubmed-id',
        }

        result = []

        for drug in self.drugs:

            db_id = self._primary_id(drug)

            for target in drug.xpath(
                'db:targets/db:target',
                namespaces = self.ns,
            ):

                # every requested field has a value, even the unknown ones
                target_dict = dict.fromkeys(fields)
                target_dict['drugbank_id'] = db_id

                for field in fields:

                    if field in direct_fields:

                        values = [
                            f.text
                            for f in target.xpath(
                                f"db:{field.replace('_', '-')}",
                                namespaces = self.ns,
                            )
                        ]

                    elif field in nested_paths:

                        values = [
                            f.text
                            for f in target.xpath(
                                nested_paths[field],
                                namespaces = self.ns,
                            )
                        ]

                    elif field == 'polypeptide':

                        values = [
                            (f.get('id'), f.get('source'))
                            for f in target.xpath(
                                'db:polypeptide',
                                namespaces = self.ns,
                            )
                        ]

                    else:

                        continue

                    target_dict[field] = values

                for k, v in target_dict.items():

                    if not v:

                        target_dict[k] = None

                    elif len(v) == 1:

                        # single values are not wrapped into a list
                        target_dict[k] = v[0]

                result.append(record(**target_dict))

        return result


    def drugbank_external_ids_full(
            self,
        ) -> dict[str, dict]:
        """
        Returns a dictionary containing all external identifiers of drugs.

        Returns
            Dict of dicts: primary DrugBank IDs are the top level keys,
            resource names are the second level keys and the identifiers
            are the values. Drugs without external identifiers are not
            included.
        """

        result = {}

        for drug in self.drugs:

            db_id = self._primary_id(drug)

            for ext_id in drug.xpath(
                'db:external-identifiers/db:external-identifier',
                namespaces = self.ns,
            ):

                source = ext_id.xpath('db:resource', namespaces = self.ns)[0].text
                identifier = ext_id.xpath('db:identifier', namespaces = self.ns)[0].text

                result.setdefault(db_id, {})[source] = identifier

        return result


    def drugbank_properties_full(
            self,
        ) -> dict[str, dict]:
        """
        Returns a dictionary containing calculated properties of drugs.

        Returns
            Dict of dicts: primary DrugBank IDs are the top level keys,
            the kinds of the properties are the second level keys and the
            values of the properties are the values. Drugs without
            calculated properties are not included.
        """

        result = {}

        for drug in self.drugs:

            db_id = self._primary_id(drug)

            for prop in drug.xpath(
                'db:calculated-properties/db:property',
                namespaces = self.ns,
            ):

                kind = prop.xpath('db:kind', namespaces = self.ns)[0].text
                value = prop.xpath('db:value', namespaces = self.ns)[0].text

                result.setdefault(db_id, {})[kind] = value

        return result