Source code for pypath.inputs.proteinatlas

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import csv
import collections

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.utils.mapping as mapping
import pypath.inputs.common as inputs_common
import pypath.inputs.science as science



[docs]
def get_proteinatlas(normal = True, pathology = True, cancer = True):
    """
    Downloads and parses the Protein Atlas normal tissue and pathology
    tables.

    Args:
        normal (bool): Process the table behind
            ``urls.urls['proteinatlas']['normal']``.
        pathology (bool): Process the table behind
            ``urls.urls['proteinatlas']['pathology']``.
        cancer (bool): Has the same effect as ``pathology``: the pathology
            table is processed if either of the two is true.

    Returns:
        dict: Always has the keys ``normal`` and ``pathology``; a disabled
        section is an empty dict.

        - ``normal``: ``{'<col 3>:<col 4>': {uniprot: (col 5, col 6)}}``;
          `proteinatlas_annotations` reads the tuple as (level, status).
        - ``pathology``: ``{<col 3>: {uniprot: {column name: number}}}``,
          where the inner dict holds the non-empty cells from the fourth
          column onwards, keyed by their header label.

    Raises:
        ValueError: If a non-empty pathology value is not a number.
        IndexError: If a non-blank row lacks the leading columns.
    """

    result = {
        'normal':    collections.defaultdict(dict),
        'pathology': collections.defaultdict(dict),
    }


    def fields(raw_line):
        # Only line terminators are removed, so that empty leading or
        # trailing cells keep their position.
        return raw_line.strip('\n\r').split('\t')


    def number(value):
        # Try `int` first and fall back to `float`: testing for a decimal
        # point misclassifies values such as `1e-05` or `nan`.
        try:
            return int(value)

        except ValueError:
            return float(value)


    if normal:

        c = curl.Curl(urls.urls['proteinatlas']['normal'],
                    silent = False, large = True)
        fp = list(c.result.values())[0]
        # the header is not needed here, columns are addressed by position
        fp.readline()

        for raw in fp:

            if not raw.strip():
                # tolerate blank lines (e.g. at the end of the file)
                continue

            rec = fields(raw)
            uniprots = mapping.map_name(rec[1], 'genesymbol', 'uniprot')
            tissue = '%s:%s' % (rec[2], rec[3])
            annot = (rec[4], rec[5].strip())

            for u in uniprots:
                result['normal'][tissue][u] = annot


    if cancer or pathology:

        c = curl.Curl(urls.urls['proteinatlas']['pathology'],
                    silent = False, large = True)
        fp = list(c.result.values())[0]
        # labels of the value columns (everything after the third column)
        value_names = fields(fp.readline())[3:]

        for raw in fp:

            if not raw.strip():
                continue

            rec = fields(raw)
            uniprots = mapping.map_name(rec[1], 'genesymbol', 'uniprot')
            tissue = rec[2]

            # `zip` stops at the shorter sequence, hence rows with missing
            # trailing cells are handled the same way as empty cells.
            values = {
                name: number(cell.strip())
                for name, cell in zip(value_names, rec[3:])
                if cell.strip()
            }

            for u in uniprots:

                # the same dict object is shared by all UniProt IDs of
                # the row
                result['pathology'][tissue][u] = values

    # convert the defaultdicts to plain dicts
    return {key: dict(val) for key, val in result.items()}




[docs]
def proteinatlas_annotations(normal = True, pathology = True, cancer = True):
    """
    Builds per-protein annotation records from the output of
    `get_proteinatlas`.

    Args:
        normal (bool): Include records from the normal tissue data.
        pathology (bool): Include records from the pathology data.
        cancer (bool): Same effect as ``pathology``; the pathology data is
            included if either of the two is true.

    Returns:
        dict: UniProt IDs as keys, sets of ``ProteinatlasAnnotation``
        named tuples as values. Records from the normal tissue data carry
        only ``organ``, ``tissue``, ``level`` and ``status``, the rest of
        the fields are left at their defaults. Records from the pathology
        data have ``pathology = True``, the condition name both in
        ``organ`` and ``tissue``, and ``status = None``.
    """

    LEVELS = ('Not detected', 'Low', 'Medium', 'High')


    ProteinatlasAnnotation = collections.namedtuple(
        'ProteinatlasAnnotation',
        [
            'organ',
            'tissue',
            'level',
            'status',
            'n_not_detected',
            'n_low',
            'n_medium',
            'n_high',
            'prognostic',
            'favourable',
            'score',
            'pathology',
        ],
    )
    # Defaults for the last eight fields; the first four are mandatory.
    ProteinatlasAnnotation.__new__.__defaults__ = (
        (None,) * 4 + (False, False, None, False)
    )


    data = get_proteinatlas(
        normal = normal,
        pathology = pathology,
        cancer = cancer,
    )

    result = collections.defaultdict(set)

    if normal:

        for tissue_key, gex in data['normal'].items():

            # Keys look like `organ:tissue`; split only at the first colon,
            # so that a colon within the second part does not break the
            # unpacking.
            organ, sep, tissue = tissue_key.partition(':')

            if not sep:
                # no colon: the whole key serves both as organ and tissue
                tissue = organ

            organ = organ.strip()
            tissue = tissue.strip()

            for uniprot, ex in gex.items():
                uniprots = mapping.map_name(uniprot, 'uniprot', 'uniprot')

                for _uniprot in uniprots:
                    result[_uniprot].add(
                        ProteinatlasAnnotation(
                            organ = organ,
                            tissue = tissue,
                            level = ex[0],
                            status = ex[1],
                        )
                    )

    if pathology or cancer:
        for condition, gex in data['pathology'].items():
            for uniprot, ex in gex.items():

                # The first value which is not a staining level count is
                # taken as the prognostic score, its label tells the kind
                # of the effect.
                effect, score = next(
                    (item for item in ex.items() if item[0] not in LEVELS),
                    (None, None),
                )

                if effect is None:
                    prognostic, favourable = None, None

                else:
                    prognostic = not effect.startswith('unprognostic')
                    # NOTE(review): the pathology table header is believed
                    # to use the American spelling ("unfavorable") --
                    # confirm against the file; both spellings are
                    # accepted here.
                    favourable = not effect.endswith(
                        ('unfavourable', 'unfavorable')
                    )

                # The level with the highest count; ties are resolved by
                # the order of the items in `ex`. Same for each UniProt
                # ID, hence computed once, outside of the loop below.
                level = max(
                    (item for item in ex.items() if item[0] in LEVELS),
                    key = lambda item: item[1],
                    default = (None,),
                )[0]

                uniprots = mapping.map_name(uniprot, 'uniprot', 'uniprot')

                for _uniprot in uniprots:
                    result[_uniprot].add(
                        ProteinatlasAnnotation(
                            organ = condition,
                            tissue = condition,
                            level = level,
                            status = None,
                            n_not_detected = ex.get('Not detected'),
                            n_low = ex.get('Low'),
                            n_medium = ex.get('Medium'),
                            n_high = ex.get('High'),
                            prognostic = prognostic,
                            favourable = favourable,
                            score = score,
                            pathology = True,
                        )
                    )

    return dict(result)




[docs]
def proteinatlas_subcellular_annotations():
    """
    Collects subcellular locations from the Protein Atlas
    ``subcellular_location.tsv`` table.

    The ``Gene name`` column is translated from gene symbol to UniProt;
    the ``Enhanced``, ``Supported`` and ``Uncertain`` columns are read as
    semicolon separated lists of locations.

    Returns:
        dict: UniProt IDs as keys, sets of
        ``ProteinatlasSubcellularAnnotation`` named tuples as values; the
        ``status`` field is the name of the column the location comes
        from.
    """

    STATUS_COLUMNS = ('Enhanced', 'Supported', 'Uncertain')

    record = collections.namedtuple(
        'ProteinatlasSubcellularAnnotation',
        ['location', 'status'],
    )

    download = curl.Curl(
        urls.urls['proteinatlas']['subcell'],
        large = True,
        silent = False,
        default_mode = 'r',
    )
    table = csv.DictReader(
        download.files_multipart['subcellular_location.tsv'],
        delimiter = '\t',
    )

    annotations = collections.defaultdict(set)

    for row in table:

        for uniprot in mapping.map_name(
            row['Gene name'],
            'genesymbol',
            'uniprot',
        ):

            for status in STATUS_COLUMNS:

                locations = row[status]

                # empty cell: no location with this status
                if locations:

                    for location in locations.split(';'):
                        annotations[uniprot].add(
                            record(location = location, status = status)
                        )

    return dict(annotations)




[docs]
def proteinatlas_secretome_annotations():
    """
    Collects secretome classification from the Protein Atlas secretome
    spreadsheet.

    The file is retrieved by ``science.science_download`` and read by
    ``inputs_common.read_xls``; the first row is skipped. The third column
    is read as a comma separated list of UniProt IDs, the fourth as the
    main class.

    Returns:
        dict: UniProt IDs as keys, sets of
        ``ProteinatlasSecretomeAnnotation`` named tuples as values;
        ``secreted`` is true if the main class contains the word
        "secreted" (case insensitive).
    """

    record = collections.namedtuple(
        'ProteinatlasSecretomeAnnotation',
        ['mainclass', 'secreted'],
    )

    xls_path = science.science_download(
        urls.urls['proteinatlas']['secretome']
    )
    # drop the first row of the sheet
    rows = inputs_common.read_xls(xls_path)[1:]

    annotations = collections.defaultdict(set)

    for row in rows:

        for raw_id in row[2].split(','):

            for uniprot in mapping.map_name(raw_id, 'uniprot', 'uniprot'):

                mainclass = row[3]
                is_secreted = 'secreted' in mainclass.lower()

                annotations[uniprot].add(
                    record(mainclass = mainclass, secreted = is_secreted)
                )

    return dict(annotations)