Source code for pypath.inputs.proteinatlas

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import csv
import collections

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.utils.mapping as mapping
import pypath.inputs.common as inputs_common
import pypath.inputs.science as science


def _hpa_fields(row):
    """Split one tab separated line into fields, dropping the line ending."""

    return row.strip('\n\r').split('\t')


def _hpa_number(value):
    """
    Convert a table cell to ``int`` when possible, otherwise to ``float``.

    Trying ``int`` first keeps whole numbers integral, while the ``float``
    fallback covers decimals as well as exponent notation written without
    a decimal point (e.g. ``1e-05``), which ``int`` rejects.

    Raises:
        ValueError: If the cell is not numeric at all.
    """

    try:

        return int(value)

    except ValueError:

        return float(value)


def get_proteinatlas(normal = True, pathology = True, cancer = True):
    """
    Download and parse the Protein Atlas ``normal`` and ``pathology`` tables.

    Gene symbols from the second column of both tables are translated to
    UniProt IDs by ``mapping.map_name``.

    Args:
        normal (bool): Process the ``normal`` table.
        pathology (bool): Process the ``pathology`` table.
        cancer (bool): Has the same effect as ``pathology``: the
            ``pathology`` table is processed if either of them is true.

    Returns:
        dict: Always has the keys ``'normal'`` and ``'pathology'``.
            Under ``'normal'``: ``{'<col 3>:<col 4>': {uniprot: (col 5,
            col 6)}}``. Under ``'pathology'``: ``{<col 3>: {uniprot:
            {column name: number}}}``, built from the 4th column onwards,
            leaving out the empty cells. A section which was not
            requested is an empty dict.
    """

    result = {
        'normal': collections.defaultdict(dict),
        'pathology': collections.defaultdict(dict),
    }

    if normal:

        c = curl.Curl(
            urls.urls['proteinatlas']['normal'],
            silent = False,
            large = True,
        )
        # the first file-like object among the downloaded contents
        fp = list(c.result.values())[0]
        # columns are addressed by position here, the header is only skipped
        fp.readline()

        for row in fp:

            fields = _hpa_fields(row)

            if len(fields) < 6:
                # blank or truncated line: nothing usable in it
                continue

            tissue = '%s:%s' % (fields[2], fields[3])
            expression = (fields[4], fields[5].strip())

            for uniprot in mapping.map_name(
                fields[1],
                'genesymbol',
                'uniprot',
            ):

                result['normal'][tissue][uniprot] = expression

    if cancer or pathology:

        c = curl.Curl(
            urls.urls['proteinatlas']['pathology'],
            silent = False,
            large = True,
        )
        fp = list(c.result.values())[0]
        # names of the value columns, i.e. everything from the 4th column
        value_names = _hpa_fields(fp.readline())[3:]

        for row in fp:

            fields = _hpa_fields(row)

            if len(fields) < 3:
                # blank or truncated line: nothing usable in it
                continue

            condition = fields[2]

            # `zip` stops at the shorter sequence, hence rows with fewer
            # cells than the header do not cause an index error
            values = {
                name: _hpa_number(cell)
                for name, cell in zip(value_names, fields[3:])
                if cell.strip()
            }

            for uniprot in mapping.map_name(
                fields[1],
                'genesymbol',
                'uniprot',
            ):

                result['pathology'][condition][uniprot] = values

    # convert the defaultdicts to plain dicts
    return {key: dict(value) for key, value in result.items()}
def proteinatlas_annotations(normal = True, pathology = True, cancer = True):
    """
    Build annotation records for proteins from the Protein Atlas tables.

    The data is obtained from ``get_proteinatlas`` and each UniProt ID
    in it is passed through ``mapping.map_name`` once more
    (``'uniprot'`` to ``'uniprot'``).

    Args:
        normal (bool): Create records from the ``normal`` table.
        pathology (bool): Create records from the ``pathology`` table.
        cancer (bool): Has the same effect as ``pathology``.

    Returns:
        dict: UniProt IDs as keys and sets of ``ProteinatlasAnnotation``
            named tuples as values. Records from the ``normal`` table
            carry ``organ``, ``tissue``, ``level`` and ``status``, the
            rest of the fields are left at their defaults. Records from
            the ``pathology`` table have ``pathology = True``, the
            condition both as ``organ`` and ``tissue``, the counts of
            the four levels, and ``level`` is the level with the
            highest count.
    """

    LEVELS = ('Not detected', 'Low', 'Medium', 'High')

    ProteinatlasAnnotation = collections.namedtuple(
        # the type name used to be misspelled as `ProtainatlasAnnotation`
        'ProteinatlasAnnotation',
        [
            'organ',
            'tissue',
            'level',
            'status',
            'n_not_detected',
            'n_low',
            'n_medium',
            'n_high',
            'prognostic',
            'favourable',
            'score',
            'pathology',
        ],
    )
    # defaults belong to the last eight fields, the first four are mandatory
    ProteinatlasAnnotation.__new__.__defaults__ = (
        (None,) * 4 + (False, False, None, False)
    )

    data = get_proteinatlas(
        normal = normal,
        pathology = pathology,
        cancer = cancer,
    )

    result = collections.defaultdict(set)

    if normal:

        for label, expression in data['normal'].items():

            organ = tissue = label

            if ':' in label:

                # split only at the first colon, so a colon in the second
                # part of the label does not break the unpacking
                organ, tissue = (
                    part.strip()
                    for part in label.split(':', 1)
                )

            for uniprot, ex in expression.items():

                record = ProteinatlasAnnotation(
                    organ = organ,
                    tissue = tissue,
                    level = ex[0],
                    status = ex[1],
                )

                for _uniprot in mapping.map_name(
                    uniprot,
                    'uniprot',
                    'uniprot',
                ):

                    result[_uniprot].add(record)

    if pathology or cancer:

        for condition, expression in data['pathology'].items():

            for uniprot, ex in expression.items():

                # the first value which is not the count of a level
                effect, score = next(
                    (item for item in ex.items() if item[0] not in LEVELS),
                    (None, None),
                )

                if effect is None:

                    prognostic = favourable = None

                else:

                    prognostic = not effect.startswith('unprognostic')
                    # NOTE(review): the upstream column names are believed
                    # to use the American spelling, which the original
                    # check never matched; confirm against the table header
                    favourable = not effect.endswith(
                        ('unfavourable', 'unfavorable')
                    )

                # the level with the highest count; in case of equal
                # counts the one which comes first in `ex` wins
                level = max(
                    (item for item in ex.items() if item[0] in LEVELS),
                    key = lambda item: item[1],
                    default = (None,),
                )[0]

                record = ProteinatlasAnnotation(
                    organ = condition,
                    tissue = condition,
                    level = level,
                    status = None,
                    n_not_detected = ex.get('Not detected'),
                    n_low = ex.get('Low'),
                    n_medium = ex.get('Medium'),
                    n_high = ex.get('High'),
                    prognostic = prognostic,
                    favourable = favourable,
                    score = score,
                    pathology = True,
                )

                for _uniprot in mapping.map_name(
                    uniprot,
                    'uniprot',
                    'uniprot',
                ):

                    result[_uniprot].add(record)

    return dict(result)
def proteinatlas_subcellular_annotations():
    """
    Subcellular location annotations from the Protein Atlas.

    Reads ``subcellular_location.tsv`` from the downloaded contents, and
    translates the gene symbols in its ``Gene name`` column to UniProt
    IDs by ``mapping.map_name``.

    Returns:
        dict: UniProt IDs as keys and sets of
            ``ProteinatlasSubcellularAnnotation`` named tuples as values,
            one for each of the semicolon separated locations in each
            status column.
    """

    # NOTE(review): only these columns are read; if the table has further
    # columns of this kind (e.g. `Approved`), those are ignored here --
    # confirm that this is intended
    STATUSES = ('Enhanced', 'Supported', 'Uncertain')

    ProteinatlasSubcellularAnnotation = collections.namedtuple(
        'ProteinatlasSubcellularAnnotation',
        ['location', 'status'],
    )

    def records(row):
        """Yield one record for each location in each non-empty status."""

        for status in STATUSES:

            if row[status]:

                for location in row[status].split(';'):

                    yield ProteinatlasSubcellularAnnotation(
                        location = location,
                        status = status,
                    )

    download = curl.Curl(
        urls.urls['proteinatlas']['subcell'],
        large = True,
        silent = False,
        default_mode = 'r',
    )
    table = csv.DictReader(
        download.files_multipart['subcellular_location.tsv'],
        delimiter = '\t',
    )

    annotations = collections.defaultdict(set)

    for row in table:

        for uniprot in mapping.map_name(
            row['Gene name'],
            'genesymbol',
            'uniprot',
        ):

            # the generator is consumed lazily, hence an ID gets an
            # entry only if it has at least one record
            for record in records(row):

                annotations[uniprot].add(record)

    return dict(annotations)
def proteinatlas_secretome_annotations():
    """
    Secretome annotations from the Protein Atlas.

    Downloads a spreadsheet by ``science.science_download`` and reads it
    by ``inputs_common.read_xls``. The third column of each row is taken
    as a comma separated list of UniProt IDs, the fourth as the main
    class of the protein.

    Returns:
        dict: UniProt IDs as keys and sets of
            ``ProteinatlasSecretomeAnnotation`` named tuples as values;
            ``secreted`` is true if the main class contains the word
            "secreted" (case insensitive).
    """

    ProteinatlasSecretomeAnnotation = collections.namedtuple(
        'ProteinatlasSecretomeAnnotation',
        ['mainclass', 'secreted'],
    )

    def record(mainclass):
        """Create the annotation record for one main class label."""

        return ProteinatlasSecretomeAnnotation(
            mainclass = mainclass,
            secreted = 'secreted' in mainclass.lower(),
        )

    xls_path = science.science_download(
        urls.urls['proteinatlas']['secretome']
    )
    # the first row is left out, presumably it is the header
    rows = inputs_common.read_xls(xls_path)[1:]

    annotations = collections.defaultdict(set)

    for row in rows:

        for original_id in row[2].split(','):

            for uniprot in mapping.map_name(
                original_id,
                'uniprot',
                'uniprot',
            ):

                annotations[uniprot].add(record(row[3]))

    return dict(annotations)