Source code for pypath.inputs.cosmic

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import os
import csv
import collections
import base64
import json

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.session as session_mod
import pypath.share.settings as settings
import pypath.utils.mapping as mapping
import pypath.inputs.credentials as credentials

_logger = session_mod.Logger(name = 'cosmic_input')
_log = _logger._log


def cancer_gene_census_annotations(
        user = None,
        passwd = None,
        credentials_fname = 'cosmic_credentials',
    ):
    """
    Retrieves a list of cancer driver genes (Cancer Gene Census) from
    the Sanger COSMIC (Catalogue of Somatic Mutations in Cancer) database.

    Args:
        user: COSMIC user name; handed over to
            ``pypath.inputs.credentials.credentials``.
        passwd: COSMIC password; handed over the same way.
        credentials_fname: Passed as ``from_file`` to the credentials
            lookup.

    Returns:
        Dict with UniProt IDs as keys and sets of
        ``CancerGeneCensusAnnotation`` named tuples as values. An empty
        dict is returned (after logging the reason) if no credentials
        are available, if the access URL could not be obtained, or if
        the table download yielded no file object.
    """

    try:

        cosmic_cred = credentials.credentials(
            user = user,
            passwd = passwd,
            resource = 'COSMIC',
            from_file = credentials_fname,
        )

    except RuntimeError:

        _log(
            'No credentials available for the COSMIC website. '
            'Either set the `cosmic_credentials` key in the `settings` '
            'module (e.g. `{\'user\': \'myuser\', '
            '\'passwd\': \'mypassword\'}`), or pass them directly to the '
            '`pypath.inputs.cosmic.cancer_gene_census_annotations` '
            'method.'
        )

        return {}

    CancerGeneCensusAnnotation = collections.namedtuple(
        'CancerGeneCensusAnnotation',
        (
            'tier',
            'hallmark',
            'somatic',
            'germline',
            'tumour_types_somatic',
            'tumour_types_germline',
            'cancer_syndrome',
            'tissue_type',
            'genetics',
            'role',
            'mutation_type',
        ),
    )

    def multi_field(content):
        # Comma separated field -> sorted tuple of stripped items;
        # a blank field gives an empty tuple.

        return (
            tuple(sorted(i.strip() for i in content.split(',')))
                if content.strip() else
            ()
        )

    def is_yes(content):
        # The boolean columns are compared case-insensitively to "yes".

        return content.strip().lower() == 'yes'

    url = urls.urls['cgc']['url_new']

    # NOTE(review): the trailing newline is part of the encoded
    # credentials; presumably it mirrors an `echo "user:pass" | base64`
    # style recipe expected by the server -- confirm before removing it.
    auth_str = base64.b64encode(
        ('%s:%s\n' % (cosmic_cred['user'], cosmic_cred['passwd'])).encode()
    )

    req_hdrs = ['Authorization: Basic %s' % auth_str.decode()]

    c = curl.Curl(
        url,
        large = False,
        silent = False,
        req_headers = req_hdrs,
        cache = False,
    )

    # First request: the reply is expected to be a JSON object carrying
    # the actual download address under the key `url`.
    # `json.loads` raises TypeError for a non-string reply (e.g. `None`)
    # and ValueError for malformed JSON; both are treated the same way
    # as a reply without the `url` key.
    try:

        access_url = json.loads(c.result)

    except (TypeError, ValueError):

        access_url = None

    if not isinstance(access_url, dict) or 'url' not in access_url:

        _log(
            'Could not retrieve COSMIC access URL. '
            'Most likely the authentication failed. '
            'The reply was: `%s`' % c.result
        )

        # Empty dict, consistently with the missing credentials case
        # and with the documented return type.
        return {}

    # Second request: download the table itself.
    c = curl.Curl(
        access_url['url'],
        large = True,
        silent = False,
        bypass_url_encoding = True,
    )

    # NOTE(review): assumes an unsuccessful download leaves `fileobj`
    # as `None` -- confirm against `pypath.share.curl.Curl`.
    if c.fileobj is None:

        _log('Failed to download the Cancer Gene Census table from COSMIC.')

        return {}

    data = csv.DictReader(c.fileobj, delimiter = ',')
    result = collections.defaultdict(set)

    for rec in data:

        uniprots = mapping.map_name(
            rec['Gene Symbol'],
            'genesymbol',
            'uniprot',
        )

        if not uniprots:

            continue

        # The annotation does not depend on the UniProt ID, hence it is
        # built once per record and shared among all mapped IDs.
        annot = CancerGeneCensusAnnotation(
            tier = int(rec['Tier']),
            hallmark = is_yes(rec['Hallmark']),
            somatic = is_yes(rec['Somatic']),
            germline = is_yes(rec['Germline']),
            tumour_types_somatic = multi_field(
                rec['Tumour Types(Somatic)']
            ),
            tumour_types_germline = multi_field(
                rec['Tumour Types(Germline)']
            ),
            cancer_syndrome = multi_field(rec['Cancer Syndrome']),
            # spaces are removed from the tissue type field before
            # splitting it
            tissue_type = multi_field(rec['Tissue Type'].replace(' ', '')),
            genetics = rec['Molecular Genetics'].strip() or None,
            role = multi_field(rec['Role in Cancer']),
            mutation_type = multi_field(rec['Mutation Types']),
        )

        for uniprot in uniprots:

            result[uniprot].add(annot)

    return dict(result)