Source code for pypath.inputs.msigdb

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import os
import collections

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.share.settings as settings
import pypath.share.session as session
import pypath.utils.mapping as mapping
import pypath.utils.taxonomy as taxonomy

# Module-level logger; `_log` is the shorthand used for all messages
# emitted by this module.
_logger = session.Logger(name = 'msigdb_input')
_log = _logger._log


# MSigDB collections known to this module.
#
# Keys are descriptive collection names; values are pairs of MSigDB
# collection labels in the order `(human, mouse)` -- the position is selected
# by organism in `msigdb_download_collections` (9606 -> 0, 10090 -> 1).
# `None` means the collection has no label for that organism and it is
# skipped for that organism.
#
# NOTE(review): the key `tf_targets_gtrf` looks like a typo of `gtrd` (cf. its
# labels); it is left unchanged here because the key is used at runtime as
# the collection name -- confirm before renaming.
ALL_COLLECTIONS = {
    'hallmark': ('h.all', 'mh.all'),
    'positional': ('c1.all', 'm1.all'),
    'chemical_and_genetic_perturbations': ('c2.cgp', 'm2.cgp'),
    'biocarta_pathways': ('c2.cp.biocarta', 'm2.cp.biocarta'),
    'kegg_pathways': ('c2.cp.kegg', None),
    'pid_pathways': ('c2.cp.pid', None),
    'reactome_pathways': ('c2.cp.reactome', 'm2.cp.reactome'),
    'wikipathways': ('c2.cp.wikipathways', 'm2.cp.wikipathways'),
    'mirna_targets_mirdb': ('c3.mir.mirdb', 'm3.mirdb'),
    'mirna_targets_legacy': ('c3.mir.mir_legacy', None),
    'tf_targets_gtrf': ('c3.tft.gtrd', 'm3.gtrd'),
    'tf_targets_legacy': ('c3.tft.tft_legacy', None),
    'cancer_gene_neighborhoods': ('c4.cgn', None),
    'cancer_modules': ('c4.cm', None),
    'go_biological_process': ('c5.go.bp', 'm5.go.bp'),
    'go_molecular_function': ('c5.go.mf', 'm5.go.mf'),
    'go_cellular_component': ('c5.go.cc', 'm5.go.cc'),
    'human_phenotype_ontology': ('c5.hpo', None),
    'mouse_phenotype_ontology': (None, 'm5.mpt'),
    'oncogenic_signatures': ('c6.all', None),
    'immunesigdb': ('c7.immunesigdb', None),
    'vaccine_response': ('c7.vax', None),
    'cell_type_signatures': ('c8.all', 'm8.all'),
}



[docs]
def msigdb_download(
        registered_email = None,
        collection = 'msigdb',
        id_type = 'symbols',
        force_download = False,
        organism = 'human',
        version = None,
    ):
    """
    Downloads and preprocesses a collection of gmt format gene sets from
    MSigDB. Returns dict of sets with gene set names as keys and molecular
    identifiers as values.

    :arg str,NoneType registered_email:
        An email address registered at MSigDB. If `None` the `msigdb_email`
        from ``pypath.settings`` will be used.
    :arg str collection:
        The name of the gene set collection. For available collections (e.g.
        `h.all` or `c2.cgp`) refer to the MSigDB website:
        http://software.broadinstitute.org/gsea/downloads.jsp#msigdb
        The default value `msigdb` contains all the genesets however you
        won't be able to distinguish which geneset comes from which
        collection. For this you need to download the collections one by one.
    :arg str id_type:
        MSigDB provides Gene Symbols (`symbols`) and Entrez Gene IDs
        (`entrez`).
    :arg bool force_download:
        Download even if cache content is available.
    :arg str,int organism:
        Organism name or NCBI Taxonomy ID, resolved by
        ``taxonomy.ensure_ncbi_tax_id``. Only human (9606) and mouse (10090)
        are mapped to an MSigDB release here.
    :arg str,NoneType version:
        MSigDB release version. If `None` the `msigdb_version` from
        ``pypath.settings`` will be used.

    :return:
        A dict of sets: gene set names as keys, sets of identifiers as
        values. An empty dict if no email address is available, the
        organism is not recognized or not supported, no login cookie could
        be obtained or the download yielded no content.
    """

    registered_email = registered_email or settings.get('msigdb_email')

    if not registered_email:

        _log(
            'To download MSigDB you must provide an email address '
            'you have previously registered at '
            '`http://software.broadinstitute.org/gsea/register.jsp`. '
            'Could not proceed, returning empty dict.'
        )

        return {}

    organisms = {9606: 'Hs', 10090: 'Mm'}
    ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(organism)
    msigdb_org = organisms.get(ncbi_tax_id, None)

    if not ncbi_tax_id:

        _log(f'Could not recognize organism: `{organism}`.')

        return {}

    if not msigdb_org:

        # a valid organism without an MSigDB release: without this guard
        # the literal string `None` would be inserted into the URL
        _log(
            f'MSigDB: organism `{organism}` (NCBI Taxonomy ID: '
            f'{ncbi_tax_id}) is not supported, only human and mouse '
            'are available. Returning empty dict.'
        )

        return {}

    version = version or settings.get('msigdb_version')

    #http://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp?filePath=/msigdb/release/2022.1.Mm/mh.all.v2022.1.Mm.symbols.gmt
    #http://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp?filePath=/msigdb/release/2022.1.Hs/h.all.v2022.1.Hs.symbols.gmt

    url = urls.urls['msigdb']['url'] % (
        version,
        msigdb_org,
        collection,
        version,
        msigdb_org,
        id_type,
    )

    req_headers = []

    # we shouldn't need this cookie game any more as all files are available
    # without any login or cookie from data.broadinstitute.org
    # this instance is created only to find out the cache file path,
    # it does not perform any request (`call = False`)
    c_nocall = curl.Curl(
        url,
        call = False,
        process = False,
        bypass_url_encoding = True,
    )

    # the login is necessary only if we are about to download
    if (
        not os.path.exists(c_nocall.cache_file_name) or
        os.path.getsize(c_nocall.cache_file_name) == 0 or
        force_download
    ):

        c_login_1 = curl.Curl(
            urls.urls['msigdb']['login1'],
            cache = False,
            write_cache = False,
            process = False,
            large = True,
            silent = True,
            post = {
                'username': registered_email,
                'password': 'password',
            },
            empty_attempt_again = False,
            follow = False,
        )

        cookies = {}

        if hasattr(c_login_1, 'resp_headers'):

            for hdr in c_login_1.resp_headers or ():

                if hdr.lower().startswith(b'set-cookie'):

                    # `Set-Cookie: name=value; attr; ...` -> `name=value`
                    cookie = hdr.decode('ascii')
                    cookie = cookie.partition(':')[2]
                    cookie = cookie.partition(';')[0].strip()
                    name, sep, value = cookie.partition('=')

                    if not sep:

                        # malformed cookie without `=`, nothing to store
                        continue

                    _log('msigdb cookie: `%s=%s`.' % (name, value))
                    cookies[name] = value

        if not cookies:

            _log('msigdb: could not get cookie, returning empty dict.')

            return {}

        req_headers = [
            'Cookie: %s' % ';'.join(
                '%s=%s' % cookie
                for cookie in cookies.items()
            )
        ]

        c_login_2 = curl.Curl(
            urls.urls['msigdb']['login2'],
            cache = False,
            write_cache = False,
            large = False,
            silent = True,
            req_headers = req_headers,
            post = {
                'j_username': registered_email,
                'j_password': 'password',
            },
            process = False,
            empty_attempt_again = False,
        )

        # the cookie from the second step is only logged, the download
        # request below uses the cookies from the first step
        jsessionid_1 = ''

        if hasattr(c_login_2, 'resp_headers'):

            for hdr in c_login_2.resp_headers or ():

                if hdr.lower().startswith(b'set-cookie'):

                    jsessionid_1 = (
                        hdr.partition(b':')[2].partition(b';')[0].strip()
                    )
                    jsessionid_1 = jsessionid_1.decode('ascii')

            _log(
                'msigdb: logged in with email `%s`, '
                'new cookie obtained: `%s`.' % (
                    registered_email,
                    jsessionid_1
                )
            )

        _log('msigdb cookies for upcoming request: %s' % req_headers[0])

    c = curl.Curl(
        url,
        req_headers = req_headers,
        silent = False,
        large = True,
        bypass_url_encoding = True,
        cache = not force_download,
    )

    result = {}

    # `c.result` may be empty (e.g. `None`) if the download failed
    for line in c.result or ():

        # tab separated fields: the first is the gene set name, the
        # second is skipped, the rest are the members of the gene set
        fields = line.strip().split('\t')

        if not fields[0]:

            # blank line
            continue

        result[fields[0]] = {member for member in fields[2:] if member}

    return result




[docs]
def msigdb_download_collections(
        registered_email = None,
        only_collections = None,
        exclude = ('c5', 'm5'),
        id_type = 'symbols',
        organism = 'human',
        version = None,
    ):
    """
    Downloads all or some MSigDB gene set collections.
    Returns a dict of dicts where upper level keys are collections while
    lower level keys are geneset names and values are molecular identifiers.

    :arg str,NoneType registered_email:
        An email address registered at MSigDB. If `None` the `msigdb_email`
        from ``pypath.settings`` will be used.
    :arg set,NoneType only_collections:
        Limit the annotations only to these collections. For available
        collections e.g. ``{'h.all', 'c2.cgp'}`` refer to the MSigDB webpage:
        http://software.broadinstitute.org/gsea/downloads.jsp#msigdb
    :arg tuple,NoneType exclude:
        Exclude the collections having their name starting with any of the
        strings in this tuple. By default `c5` and `m5` (Gene Ontology and
        Human/Mouse Phenotype Ontology) is excluded. `None` or an empty
        tuple excludes nothing.
    :arg str id_type:
        Identifier type, passed to `msigdb_download`.
    :arg str,int organism:
        Organism name or NCBI Taxonomy ID, resolved by
        ``taxonomy.ensure_ncbi_tax_id``. Only human (9606) and mouse (10090)
        are supported.
    :arg str,NoneType version:
        MSigDB release version, passed to `msigdb_download`.

    :return:
        A dict with tuples of `(collection name, collection label)` as keys
        and the dicts returned by `msigdb_download` as values. An empty
        dict if the organism is not supported.
    """

    collection_data = {}

    # position of the organism specific label in the `ALL_COLLECTIONS` tuples
    organisms = {9606: 0, 10090: 1}
    ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(organism)
    idx = organisms.get(ncbi_tax_id, None)

    if idx is None:

        # without this guard `labels[None]` below raises TypeError
        _log(
            f'MSigDB: organism `{organism}` is not supported, '
            'only human and mouse are available. Returning empty dict.'
        )

        return collection_data

    # `str.startswith` accepts a tuple of prefixes;
    # with an empty tuple it matches nothing
    exclude = tuple(exclude or ())

    for collection, labels in ALL_COLLECTIONS.items():

        label = labels[idx]

        if not label:

            # this collection is not available for this organism
            continue

        if only_collections and label not in only_collections:

            continue

        if label.startswith(exclude):

            continue

        _log(
            'MSigDB: downloading collection `%s` (%s).' % (collection, label)
        )

        collection_data[(collection, label)] = (
            msigdb_download(
                registered_email = registered_email,
                collection = label,
                id_type = id_type,
                organism = organism,
                version = version,
            )
        )

    return collection_data




[docs]
def msigdb_annotations(
        registered_email = None,
        only_collections = None,
        exclude = ('c5', 'm5'),
        organism = 'human',
        version = None,
    ):
    """
    Builds an annotation type dictionary from MSigDB gene set collections.

    The collections are retrieved by `msigdb_download_collections`, then
    the members of each gene set are translated from gene symbols to
    UniProt IDs by ``mapping.map_names``.

    :arg str,NoneType registered_email:
        An email address registered at MSigDB. If `None` the `msigdb_email`
        from ``pypath.settings`` will be used.
    :arg set,NoneType only_collections:
        Limit the annotations only to these collections. For available
        collections e.g. ``{'h.all', 'c2.cgp'}`` refer to the MSigDB webpage:
        http://software.broadinstitute.org/gsea/downloads.jsp#msigdb
    :arg tuple exclude:
        Exclude the collections having their name starting with any of the
        strings in this tuple. By default `c5` and `m5` are excluded.
    :arg str,int organism:
        Organism, passed to `msigdb_download_collections` and resolved to
        an NCBI Taxonomy ID for the identifier translation.
    :arg str,NoneType version:
        MSigDB release version, passed to `msigdb_download_collections`.

    :return:
        A dict with UniProt IDs as keys and sets of `MsigdbAnnotation`
        named tuples (fields: `collection`, `geneset`) as values.
    """

    record = collections.namedtuple(
        'MsigdbAnnotation',
        ['collection', 'geneset'],
    )

    by_uniprot = collections.defaultdict(set)

    downloaded = msigdb_download_collections(
        registered_email = registered_email,
        only_collections = only_collections,
        exclude = exclude,
        organism = organism,
        version = version,
    )

    taxon = taxonomy.ensure_ncbi_tax_id(organism)

    # keys are `(collection name, collection label)` pairs,
    # only the name goes into the annotation record
    for (collection, _label), genesets in downloaded.items():

        for geneset, genesymbols in genesets.items():

            annot = record(collection, geneset)

            uniprots = mapping.map_names(
                genesymbols,
                'genesymbol',
                'uniprot',
                ncbi_tax_id = taxon,
            )

            for uniprot in uniprots:

                by_uniprot[uniprot].add(annot)

    return dict(by_uniprot)