#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#fromfuture.utilsimportiteritemsimportosimportcollectionsimportpypath.share.curlascurlimportpypath.resources.urlsasurlsimportpypath.share.settingsassettingsimportpypath.share.sessionassessionimportpypath.utils.mappingasmappingimportpypath.utils.taxonomyastaxonomy_logger=session.Logger(name='msigdb_input')_log=_logger._logALL_COLLECTIONS={'hallmark':('h.all','mh.all'),'positional':('c1.all','m1.all'),'chemical_and_genetic_perturbations':('c2.cgp','m2.cgp'),'biocarta_pathways':('c2.cp.biocarta','m2.cp.biocarta'),'kegg_pathways':('c2.cp.kegg',None),'pid_pathways':('c2.cp.pid',None),'reactome_pathways':('c2.cp.reactome','m2.cp.reactome'),'wikipathways':('c2.cp.wikipathways','m2.cp.wikipathways'),'mirna_targets_mirdb':('c3.mir.mirdb','m3.mirdb'),'mirna_targets_legacy':('c3.mir.mir_legacy',None),'tf_targets_gtrf':('c3.tft.gtrd','m3.gtrd'),'tf_targets_legacy':('c3.tft.tft_legacy',None),'cancer_gene_neighborhoods':('c4.cgn',None),'cancer_modules':('c4.cm',None),'go_biological_process':('c5.go.bp','m5.go.bp'),'go_molecular_function':('c5.go.mf','m5.go.mf'),'go_cellular_component':('c5.go.cc','m5.go.cc'),'human_phenotype_ontology':('c5.hpo',None),'mouse_phenotype_ontology':(None,'m5.mpt'),'oncogenic_signatures':('c6.all',None),'immunesigdb':('c7.immunesigdb',None),'vaccine_response':('c7.vax',None),'cell_type_signatures':('c8.all','m8.all'),}
def msigdb_download(
        registered_email=None,
        collection='msigdb',
        id_type='symbols',
        force_download=False,
        organism='human',
        version=None,
    ):
    """
    Downloads and preprocesses a collection of gmt format gene sets from
    MSigDB. Returns dict of sets with gene set names as keys and molecular
    identifiers as values. An empty dict is returned if no email address
    is available, the organism is not supported, the login cookie could
    not be obtained or the download yielded nothing.

    :arg str,NoneType registered_email:
        An email address registered at MSigDB. If `None` the `msigdb_email`
        from the pypath settings will be used.
    :arg str collection:
        The name of the gene set collection. For available collections
        (e.g. `h.all` or `c2.cgp`) refer to the MSigDB website:
        http://software.broadinstitute.org/gsea/downloads.jsp#msigdb
        The default value `msigdb` contains all the genesets however you
        won't be able to distinguish which geneset comes from which
        collection. For this you need to download the collections one by
        one.
    :arg str id_type:
        MSigDB provides Gene Symbols (`symbols`) and Entrez Gene IDs
        (`entrez`).
    :arg bool force_download:
        Download even if cache content is available.
    :arg str,int organism:
        The organism, in any form `taxonomy.ensure_ncbi_tax_id` is able to
        resolve. Only human (9606) and mouse (10090) are supported.
    :arg str,NoneType version:
        The MSigDB release, it is inserted into the download URL. If `None`
        the `msigdb_version` from the pypath settings will be used.
    """

    registered_email = registered_email or settings.get('msigdb_email')

    if not registered_email:

        _log(
            'To download MSigDB you must provide an email address '
            'you have previously registered at '
            '`http://software.broadinstitute.org/gsea/register.jsp`. '
            'Could not proceed, returning empty dict.'
        )

        return {}

    organisms = {9606: 'Hs', 10090: 'Mm'}
    ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(organism)
    msigdb_org = organisms.get(ncbi_tax_id, None)

    if not ncbi_tax_id:

        _log(f'Could not recognize organism: `{organism}`.')

        return {}

    if not msigdb_org:

        # a valid organism without MSigDB data: without this guard `None`
        # would be formatted into the URL below
        _log(
            f'MSigDB: organism `{organism}` is not supported, '
            'only human and mouse are available.'
        )

        return {}

    version = version or settings.get('msigdb_version')

    # http://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp?filePath=/msigdb/release/2022.1.Mm/mh.all.v2022.1.Mm.symbols.gmt
    # http://www.gsea-msigdb.org/gsea/msigdb/download_file.jsp?filePath=/msigdb/release/2022.1.Hs/h.all.v2022.1.Hs.symbols.gmt
    url = urls.urls['msigdb']['url'] % (
        version,
        msigdb_org,
        collection,
        version,
        msigdb_org,
        id_type,
    )

    req_headers = []

    # we shouldn't need this cookie game any more as all files are available
    # without any login or cookie from data.broadinstitute.org
    # this instance is never called, it only tells the cache file name
    c_nocall = curl.Curl(
        url,
        call=False,
        process=False,
        bypass_url_encoding=True,
    )

    if (
        force_download or
        not os.path.exists(c_nocall.cache_file_name) or
        os.path.getsize(c_nocall.cache_file_name) == 0
    ):

        c_login_1 = curl.Curl(
            urls.urls['msigdb']['login1'],
            cache=False,
            write_cache=False,
            process=False,
            large=True,
            silent=True,
            post={
                'username': registered_email,
                'password': 'password',
            },
            empty_attempt_again=False,
            follow=False,
        )

        cookies = {}

        for hdr in getattr(c_login_1, 'resp_headers', None) or ():

            if not hdr.lower().startswith(b'set-cookie'):

                continue

            # `Set-Cookie: name=value; attributes...` -> `name=value`
            cookie = hdr.decode('ascii').partition(':')[2]
            cookie = cookie.partition(';')[0].strip()
            name, sep, value = cookie.partition('=')

            if not sep:

                # malformed header, nothing usable in it
                continue

            _log('msigdb cookie: `%s=%s`.' % (name, value))
            cookies[name] = value

        if not cookies:

            _log('msigdb: could not get cookie, returning empty dict.')

            return {}

        req_headers = [
            'Cookie: %s' % ';'.join(
                '%s=%s' % cookie
                for cookie in cookies.items()
            )
        ]

        c_login_2 = curl.Curl(
            urls.urls['msigdb']['login2'],
            cache=False,
            write_cache=False,
            large=False,
            silent=True,
            req_headers=req_headers,
            post={
                'j_username': registered_email,
                'j_password': 'password',
            },
            process=False,
            empty_attempt_again=False,
        )

        # the cookie of the second login step is only reported in the log,
        # the download below is sent with the cookies of the first step
        jsessionid_1 = ''

        for hdr in getattr(c_login_2, 'resp_headers', None) or ():

            if hdr.lower().startswith(b'set-cookie'):

                jsessionid_1 = hdr.partition(b':')[2].partition(b';')[0]
                jsessionid_1 = jsessionid_1.strip().decode('ascii')

        _log(
            'msigdb: logged in with email `%s`, '
            'new cookie obtained: `%s`.' % (
                registered_email,
                jsessionid_1,
            )
        )

        _log('msigdb cookies for upcoming request: %s' % req_headers[0])

    c = curl.Curl(
        url,
        req_headers=req_headers,
        silent=False,
        large=True,
        bypass_url_encoding=True,
        cache=not force_download,
    )

    # NOTE(review): assumes `Curl.result` is `None` when nothing could be
    # retrieved -- confirm against `pypath.share.curl`
    if c.result is None:

        _log('msigdb: failed to download `%s`, returning empty dict.' % url)

        return {}

    result = {}

    for line in c.result:

        # one gene set per line: name, one more field that is not used
        # here, then the members
        fields = line.strip().split('\t')

        if not fields[0]:

            # blank line, would end up as a gene set with empty name
            continue

        result[fields[0]] = set(fields[2:])

    return result

def msigdb_download_collections(
        registered_email=None,
        only_collections=None,
        exclude=('c5', 'm5'),
        id_type='symbols',
        organism='human',
        version=None,
    ):
    """
    Downloads all or some MSigDB gene set collections.
    Returns a dict of dicts where upper level keys are tuples of the
    collection name and its MSigDB label, while lower level keys are
    geneset names and values are sets of molecular identifiers. An empty
    dict is returned if the organism is not supported.

    :arg str,NoneType registered_email:
        An email address registered at MSigDB. If `None` the `msigdb_email`
        from the pypath settings will be used.
    :arg set,NoneType only_collections:
        Limit the annotations only to these collections. For available
        collections e.g. ``{'h.all', 'c2.cgp'}`` refer to the MSigDB
        webpage:
        http://software.broadinstitute.org/gsea/downloads.jsp#msigdb
    :arg tuple,str,NoneType exclude:
        Exclude the collections having their label starting with any of
        the strings in this tuple. A single string is treated as one
        prefix, `None` or empty means no exclusion. By default `c5` and
        `m5` (Gene Ontology and Human/Mouse Phenotype Ontology) are
        excluded.
    :arg str id_type:
        Passed to `msigdb_download`: `symbols` or `entrez`.
    :arg str,int organism:
        The organism, in any form `taxonomy.ensure_ncbi_tax_id` is able to
        resolve. Only human (9606) and mouse (10090) are supported.
    :arg str,NoneType version:
        The MSigDB release, passed to `msigdb_download`.
    """

    collection_data = {}

    # position of the organism specific label in the `ALL_COLLECTIONS` tuples
    organisms = {9606: 0, 10090: 1}
    ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(organism)
    idx = organisms.get(ncbi_tax_id, None)

    if idx is None:

        # without this guard `labels[None]` below raises TypeError
        _log(
            f'MSigDB: organism `{organism}` is not supported, '
            'only human and mouse are available.'
        )

        return collection_data

    # `str.startswith` accepts a tuple of prefixes; a bare string must be
    # wrapped, otherwise it would be matched character by character
    if isinstance(exclude, str):

        exclude = (exclude,)

    else:

        exclude = tuple(exclude or ())

    for collection, labels in ALL_COLLECTIONS.items():

        label = labels[idx]

        if (
            # collection not available for this organism
            not label or
            (only_collections and label not in only_collections) or
            label.startswith(exclude)
        ):

            continue

        _log(
            'MSigDB: downloading collection `%s` (%s).' % (
                collection,
                label,
            )
        )

        collection_data[(collection, label)] = msigdb_download(
            registered_email=registered_email,
            collection=label,
            id_type=id_type,
            organism=organism,
            version=version,
        )

    return collection_data

def msigdb_annotations(
        registered_email=None,
        only_collections=None,
        exclude=('c5', 'm5'),
        organism='human',
        version=None,
    ):
    """
    Builds an annotation type dictionary from MSigDB gene set collections.

    The collections are retrieved by `msigdb_download_collections` with its
    default identifier type, the members of each gene set are translated
    from gene symbols to UniProt IDs. Returns a dict with UniProt IDs as
    keys and sets of `MsigdbAnnotation(collection, geneset)` records as
    values.

    :arg str,NoneType registered_email:
        An email address registered at MSigDB, passed on to the download.
    :arg set,NoneType only_collections:
        Limit the annotations only to these collections, e.g.
        ``{'h.all', 'c2.cgp'}``. For available collections refer to the
        MSigDB webpage:
        http://software.broadinstitute.org/gsea/downloads.jsp#msigdb
    :arg tuple exclude:
        Exclude the collections having their label starting with any of
        the strings in this tuple. By default `c5` and `m5` are excluded.
    :arg str,int organism:
        The organism, used both for the download and for the identifier
        translation.
    :arg str,NoneType version:
        The MSigDB release, passed on to the download.
    """

    MsigdbAnnotation = collections.namedtuple(
        'MsigdbAnnotation',
        ['collection', 'geneset'],
    )

    by_uniprot = collections.defaultdict(set)

    downloaded = msigdb_download_collections(
        registered_email=registered_email,
        only_collections=only_collections,
        exclude=exclude,
        organism=organism,
        version=version,
    )

    taxid = taxonomy.ensure_ncbi_tax_id(organism)

    # keys are (collection name, label) pairs, only the name is recorded
    for (coll_name, _label), genesets in downloaded.items():

        for gs_name, symbols in genesets.items():

            record = MsigdbAnnotation(collection=coll_name, geneset=gs_name)

            uniprots = mapping.map_names(
                symbols,
                'genesymbol',
                'uniprot',
                ncbi_tax_id=taxid,
            )

            for uniprot in uniprots:

                by_uniprot[uniprot].add(record)

    # plain dict, so lookups of unknown IDs do not create empty entries
    return dict(by_uniprot)