Source code for pypath.inputs.dorothea

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import re
import csv
import collections
import itertools
import functools
import pyreadr

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.share.session as session
import pypath.utils.taxonomy as taxonomy


# Module level logger, created with the name `dorothea_input`; the
# functions below report errors through its `_log` method.
_logger = session.Logger(name = 'dorothea_input')


# Record type of one TF-target interaction, as yielded by the DoRothEA
# input functions of this module. The order of the fields matters: the
# functions below fill them by position.
DorotheaInteraction = collections.namedtuple(
    'DorotheaInteraction',
    (
        # the TF, its target, the effect and the confidence level
        'tf target effect level '
        # one flag for each type of evidence
        'curated chipseq predicted coexp '
        # the databases and datasets behind each type of evidence
        'curated_sources chipseq_sources '
        'predicted_sources coexp_sources '
        # all data sources together, PubMed IDs and KEGG pathways
        'all_sources pubmed kegg_pathways'
    ),
)


# Resource names which `_process_resources` replaces by their upper case
# form. The replacement is a plain substring replacement and it is applied
# in the order of this tuple.
_resources_upper = (
    'jaspar',
    'trred',
    'kegg',
    'trrust',
    'tred',
    'trrd',
    'hocomoco',
    'fantom4',
    'pazar',
)

# Resource names with a label that is not simply the upper case form.
# `_process_resources` applies these after the upper casing above, in the
# order of this dict, again as plain substring replacements; this is why
# some of the keys (e.g. `HOCOMOCO_v11`) are already in upper case.
_resources_special_case = {
    'tfact': 'TFactS',
    'tf_act': 'TFactS',
    'htri_db': 'HTRIdb',
    'int_act': 'IntAct',
    'fantom_4': 'FANTOM4',
    'oreganno': 'ORegAnno',
    'reviews': 'DoRothEA-reviews',
    'HOCOMOCO_v11': 'HOCOMOCO-v11',
    'hocomoco_v11': 'HOCOMOCO-v11',
    'JASPAR_v2018': 'JASPAR-v2018',
    'remap': 'ReMap',
    'gtex': 'ARACNe-GTEx',
    'nfi_regulome_db': 'NFIRegulomeDB',
    'tf_e': 'TFe',
    'reg_network': 'RegNetwork',
}


def _process_resources(sources):
    """
    Normalizes the resource names in a DoRothEA sources string.

    :param str sources:
        Resource names as provided by DoRothEA, separated by comma or
        by `_via_`.

    :return:
        The resource names with their labels replaced according to
        `_resources_upper` and `_resources_special_case`, joined by
        comma. An empty string if the input is the string `none`.
    """

    if sources == 'none':

        return ''

    # plain upper casing comes first, hence the special cases below are
    # matched against the already upper cased text
    for name in _resources_upper:

        sources = sources.replace(name, name.upper())

    for raw, label in iteritems(_resources_special_case):

        sources = sources.replace(raw, label)

    # both the comma and `_via_` are separators, the output uses only comma
    return ','.join(re.split(r',|_via_', sources))



def get_dorothea_old(
        levels = {'A', 'B'},
        only_curated = False
    ):
    """
    Retrieves TF-target interactions from DoRothEA.

    :param set levels:
        Confidence levels to be used.
    :param bool only_curated:
        Retrieve only literature curated interactions.

    :return:
        A generator of lists, one for each interaction: the first four
        fields of the line as they are, four boolean flags created from
        the next four fields, the last four fields of the line, and
        finally one string of data sources: the non empty ones of the
        last four fields joined by comma, or only the field at index 8
        if ``only_curated`` is `True`.

    Details
    -------
    DoRothEA is a comprehensive resource of TF-target interactions
    combining multiple lines of evidences: literature curated databases,
    ChIP-Seq data, PWM based prediction using HOCOMOCO and JASPAR matrices
    and prediction from GTEx expression data by ARACNe.

    For details see https://github.com/saezlab/DoRothEA.
    """

    def process_line(ll):

        all_sources = (
            # wrapped in a list, otherwise `itertools.chain` would
            # iterate the string and emit its characters one by one
            [ll[8]]
                if only_curated else
            [','.join(s for s in ll[-4:] if s)]
        )

        return list(
            itertools.chain(
                ll[:4],
                (s == 'TRUE' for s in ll[4:8]),
                ll[-4:],
                all_sources,
            )
        )

    # the file to download is the smallest one which covers all
    # the requested confidence levels
    url = urls.urls['dorothea']['url'] % (
        'all' if 'E' in levels else
        'ABCD' if 'D' in levels else
        'ABC' if 'C' in levels else
        'AB' if 'B' in levels else
        'A'
    )

    c = curl.Curl(url, silent = False, large = True)

    # the first line is discarded (presumably the header -- TODO confirm)
    _ = next(c.result)

    lines = (
        l.strip('\n\r').split('\t')
        for l in c.result
    )

    # NOTE(review): the condition is kept as it was, only the implicit
    # operator precedence is made explicit; a curated record passes
    # even if its level is not among the requested ones.
    return (
        process_line(ll)
        for ll in lines
        if (
            (ll[3] in levels and not only_curated) or
            ll[4] == 'TRUE'
        )
    )
def dorothea_old_csv(
        levels = {'A', 'B'},
        only_curated = False
    ):
    """
    Retrieves TF-target interactions from DoRothEA.

    :param set levels:
        Confidence levels to be used.
    :param bool only_curated:
        Retrieve only literature curated interactions.

    :return:
        A generator of `DorotheaInteraction` records.

    Details
    -------
    Note: this method processes DoRothEA from an old CSV file generated
    in 2018. For an up to date version of DoRothEA please use the
    ``dorothea_interactions`` method.

    DoRothEA is a comprehensive resource of TF-target interactions
    combining multiple lines of evidences: literature curated databases,
    ChIP-Seq data, PWM based prediction using HOCOMOCO and JASPAR matrices
    and prediction from GTEx expression data by ARACNe.

    For details see https://github.com/saezlab/DoRothEA.
    """

    # The order here must follow the order of the corresponding fields
    # of `DorotheaInteraction` (curated, chipseq, predicted, coexp),
    # because the values below are assigned to the fields by position.
    evidence_types = (
        'curateddatabase',
        'chipSeq',
        'TFbindingMotif',
        'coexpression',
    )

    # these values in the table mean no data
    empty_values = {'-', 'none'}

    url = urls.urls['dorothea_git']['url']

    c = curl.Curl(
        url,
        silent = False,
        large = True,
        files_needed = ['database.csv'],
    )

    reader = csv.DictReader(c.result['database.csv'])

    for rec in reader:

        # process only the ones of the requested levels or if curated
        # NOTE(review): with `only_curated` the records of the requested
        # levels still pass here even if they are not curated; kept as
        # it was, to be confirmed if this is intended.
        if (
            rec['score'] not in levels and
            not (
                only_curated and
                rec['is_evidence_curateddatabase'] == 'TRUE'
            )
        ):

            continue

        rec = dict(
            (k, v if v not in empty_values else '')
            for k, v in iteritems(rec)
        )

        # boolean values for curated, chipseq, motif pred. and coexp
        is_evidence = [
            rec['is_evidence_%s' % evt] == 'TRUE'
            for evt in evidence_types
        ]

        # databases & datasets
        which = [
            rec['which_%s' % evt]
            for evt in evidence_types
        ]

        # all data sources (or only the curated ones)
        all_sources = _process_resources(
            rec['which_curateddatabase']
                if only_curated else
            ','.join(w for w in which if w)
        )

        yield DorotheaInteraction(
            *itertools.chain(
                # TF, target, effect, score
                (
                    rec['TF'],
                    rec['target'],
                    rec['effect'],
                    rec['score'],
                ),
                is_evidence,
                which,
                (
                    all_sources,
                    # PubMed and KEGG pw
                    rec['pubmedID_from_curated_resources'],
                    rec['kegg_pathway'],
                ),
            )
        )
def dorothea_rda_raw(organism = 9606):
    """
    Downloads one of the Rdata files of DoRothEA and reads it by
    ``pyreadr``.

    :param int,str organism:
        Name or NCBI Taxonomy ID of the organism. Human and mouse are
        supported. If `None`, the human interactions will be returned
        with additional details included.

    :return:
        The object ``pyreadr`` provides for the downloaded dataset
        (presumably a data frame -- TODO confirm); `None` if the file
        could not be parsed, in this case a message is sent to the log.

    :raises ValueError:
        If the organism is neither human, nor mouse, nor `None`.
    """

    ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(organism)

    if ncbi_tax_id not in (9606, 10090, None):

        msg = (
            'DoRothEA: invalid organism: `%s`. Only human and mouse '
            'are supported.' % str(organism)
        )
        _logger._log(msg)

        raise ValueError(msg)

    # the name of the dataset, it is part of the URL and also the key
    # of the data in the output of `pyreadr`
    if ncbi_tax_id is None:

        fname = 'entire_database'

    elif ncbi_tax_id == 9606:

        fname = 'dorothea_hs'

    else:

        fname = 'dorothea_mm'

    url = urls.urls['dorothea_git']['rda'] % fname

    c = curl.Curl(url, silent = False, large = True)

    # `pyreadr` reads from a path, hence only the name of the downloaded
    # file is kept and the file object is closed
    rdata_path = c.fileobj.name
    c.fileobj.close()

    try:

        return pyreadr.read_r(rdata_path)[fname]

    except pyreadr.custom_errors.LibrdataError as e:

        _logger._log(
            'Could not parse DoRothEA data from Rdata file: '
            '`%s`. '
            'Make sure your `pyreadr` installation supports the xz '
            'compression.' % e.args[0]
        )

    return None
def dorothea_full_raw(organism = 9606):
    """
    DoRothEA data as it is provided in the R package.

    Args
        organism (int,str): Name or NCBI Taxonomy ID of the organism.
            The complete DoRothEA database (with all the details about
            the original sources) is available only for human.

    Returns
        (pandas.DataFrame): A data frame of TF-target interactions from
        DoRothEA, as returned by `dorothea_rda_raw` called with
        `organism = None`.

    Raises
        ValueError: If an organism other than human is requested.
    """

    ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(organism)

    # only organisms other than human are rejected, a missing
    # organism (e.g. `None`) is accepted
    if ncbi_tax_id != 9606 and organism:

        template = (
            'DoRothEA: invalid organism: `%s`. The full database is '
            'available only for human. To have them for other organisms, '
            'you can load DoRothEA in a Network object and use the homology '
            'translation function.'
        )
        msg = template % str(organism)
        _logger._log(msg)

        raise ValueError(msg)

    # the complete database is the one downloaded when no organism is given
    return dorothea_rda_raw(organism = None)
def dorothea_interactions(
        organism = 9606,
        levels = {'A', 'B', 'C', 'D'},
        only_curated = False,
        confidence_pairwise = True,
    ):
    """
    Retrieves TF-target interactions from DoRothEA (TF regulons).

    :param int,str organism:
        Name or NCBI Taxonomy ID of the organism. Only human is
        available.
    :param set levels:
        Confidence levels to be used.
    :param bool only_curated:
        Retrieve only literature curated interactions.
    :param bool confidence_pairwise:
        Not used in the body of this function.

    :return:
        A generator of `DorotheaInteraction` records.

    Details
    -------
    DoRothEA is a comprehensive resource of TF-target interactions
    combining multiple lines of evidences: literature curated databases,
    ChIP-Seq data, PWM based prediction using HOCOMOCO and JASPAR matrices
    and prediction from GTEx expression data by ARACNe.

    As KEGG is no longer part of the public version of DoRothEA, the
    `kegg_pathways` field is always empty.

    For details see https://github.com/saezlab/DoRothEA.
    """

    # suffixes of the `is_evidence_*` and `which_*` columns, in the order
    # of the curated, chipseq, predicted and coexp fields of the record
    evidence_types = (
        'curated',
        'chip_seq',
        'tfbs', # inferred
        'inferred', # coexp
    )

    df = dorothea_full_raw(organism = organism)
    df = df[df.confidence.isin(levels)]

    if only_curated:

        df = df[df.is_evidence_curated]

    for rec in df.itertuples():

        # boolean values for curated, chipseq, motif pred. and coexp
        is_evidence = [
            getattr(rec, 'is_evidence_%s' % evt)
            for evt in evidence_types
        ]

        # databases & datasets
        which = [
            getattr(rec, 'which_%s' % evt)
            for evt in evidence_types
        ]

        # all data sources (or only the curated ones)
        if only_curated:

            all_sources = rec.which_curated

        else:

            all_sources = ','.join(w for w in which if w != 'none')

        # anything else than a string of digits is replaced by empty string
        pubmed = rec.pubmed_id if rec.pubmed_id.isdigit() else ''

        yield DorotheaInteraction(
            *itertools.chain(
                # TF, target, effect, score
                (
                    rec.tf,
                    rec.target,
                    int(rec.mor),
                    rec.confidence,
                ),
                is_evidence,
                (_process_resources(w) for w in which),
                (
                    _process_resources(all_sources),
                    # PubMed and KEGG pw
                    pubmed,
                    '',
                ),
            )
        )
# synonyms
# Each alias is an assignment in its own line: if these follow the comment
# in the same line, they become part of the comment and the names are
# never defined.
dorothea_interactions_old = dorothea_old_csv
tfregulons_interactions_old = dorothea_old_csv
get_tfregulons = dorothea_rda_raw
tfregulons_interactions = dorothea_interactions