Source code for pypath.utils.uniprot

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

"""
Interface to UniProt protein datasheets.
"""

from future.utils import iteritems

import os
import sys
import re
import shutil
import importlib as imp
import collections
import itertools

import pypath.inputs.uniprot as uniprot_input
import pypath.inputs.genecards as genecards_input
import pypath.share.common as common
import pypath_common._constants as _const
import pypath.share.settings as settings
import pypath.core.entity as entity



[docs]
class UniprotProtein(object):
    """
    Accessor for the fields of one UniProt protein datasheet.

    The datasheet is obtained from
    ``pypath.inputs.uniprot.protein_datasheet`` and stored in ``self.raw``.
    The code in this class treats it as a sequence of ``(tag, line)``
    pairs; the line of the first pair is used to read the entry name, the
    review status and the sequence length. Each property parses the
    relevant lines on demand.
    """

    # patterns applied on the lines of the datasheet
    _relength = re.compile(r'([0-9]+) AA')
    _rename = re.compile(r'Name=([\w\(\)-]+)\W')
    _rerecname = re.compile(r'(?:Rec|Sub)Name: Full=([^;\{]+)(?: \{.*\})?;')
    _recc = re.compile(r'-!- ([A-Z ]+):\s?(.*)')
    _remw = re.compile(r'([0-9]+) MW')
    _redb = re.compile(r'([^;]+);\s?(.*)\s?\.\s?(?:\[(.*)\])?')
    _redbsep = re.compile(r'\s?;\s?')
    _retaxid = re.compile(r'=(\d+)[^\d]')
    # curly bracketed cross-references, with an optional leading
    # dot or comma and whitespace
    _rexref = re.compile(r'[\.,]?\s?\{[^\}]+\}')
    _reec = re.compile(r'EC=(\d+(?:\.[-\d]+)+)')


    def __init__(self, uniprot_id):
        """
        Stores the identifier (stripped of surrounding whitespace) and
        loads its datasheet.
        """

        self.uniprot_id = uniprot_id.strip()
        self.load()


    def reload(self):
        """
        Reloads the module of this class and rebinds the instance to the
        freshly loaded class.
        """

        modname = self.__class__.__module__
        mod = __import__(modname, fromlist = [modname.split('.')[0]])
        imp.reload(mod)
        new = getattr(mod, self.__class__.__name__)
        setattr(self, '__class__', new)


    def load(self):
        """
        Retrieves the datasheet of ``self.uniprot_id`` into ``self.raw``.
        """

        self.raw = uniprot_input.protein_datasheet(self.uniprot_id)


    @property
    def is_reviewed(self):
        """
        Tells if the first line of the datasheet contains "Reviewed".
        """

        return 'Reviewed' in self.raw[0][1]


    @property
    def id(self):
        """
        Returns the first word of the first line of the datasheet.
        """

        return self.raw[0][1].split()[0]


    @property
    def ac(self):
        """
        Returns the first accession from the first ``AC`` line.
        """

        return next(self.itertag('AC')).split(';')[0]


    @property
    def length(self):
        """
        Returns the length (number of residues) of the canonical sequence.
        """

        return int(self._relength.search(self.raw[0][1]).groups()[0])


    @property
    def organism(self):
        """
        Returns the numeric taxon identifier from the first ``OX`` line.
        """

        return int(self._retaxid.search(next(self.itertag('OX'))).groups()[0])


    @property
    def full_name(self):
        """
        Returns the full name found in the first ``DE`` line.
        """

        return self._rerecname.search(next(self.itertag('DE'))).groups()[0]


    @property
    def ec(self):
        """
        Returns the set of EC numbers found in the ``DE`` lines.
        """

        return set(self._reec.findall(''.join(self.itertag('DE'))))


    @property
    def info(self):
        """
        Returns the sections of the ``CC`` lines as a dict of titles and
        texts. Built at the first access, then kept in ``self._info``.
        """

        if not hasattr(self, '_info'):

            self.update_info()

        return self._info


    def update_info(self):
        """
        Parses the ``CC`` lines into ``self._info``.

        A line matching ``-!- TITLE: text`` opens a new section; lines
        not matching are appended to the current section. Processing stops
        at the first line starting with ``---``. Lines preceding the first
        title are collected under the key ``None``.
        """

        sections = collections.defaultdict(list)
        title = None

        for cc in self.itertag('CC'):

            if cc.startswith('---'):

                break

            m = self._recc.match(cc)

            if m:

                title, cc = m.groups()

            line = cc.strip()

            if line:

                sections[title].append(line)

        self._info = {
            title: ' '.join(lines)
            for title, lines in sections.items()
        }


    @property
    def function_genecards(self):
        """
        Returns the summaries from
        ``pypath.inputs.genecards.genecards_summaries`` for the gene symbol
        of this protein, concatenated into one string of
        "resource: summary" items.
        """

        summaries = genecards_input.genecards_summaries(self.genesymbol)

        return ' '.join(
            '%s: %s' % (resource, summary)
            for resource, summary in summaries.items()
        )


    @property
    def function_with_xrefs(self):

        return self.info_section('FUNCTION')


    @property
    def function(self):

        return self.remove_xrefs(self.function_with_xrefs)


    @property
    def function_with_genecards(self):
        """
        Returns the ``FUNCTION`` section followed by the GeneCards
        summaries, separated by a space. A missing or empty part is left
        out instead of being formatted as "None".
        """

        return ' '.join(
            part
            for part in (self.function, self.function_genecards)
            if part
        )


    @property
    def function_or_genecards(self):
        """
        Returns the ``FUNCTION`` section, or if that is empty, the
        GeneCards summaries.
        """

        return self.function or self.function_genecards


    # The properties below return one section of ``info`` each, with the
    # curly bracketed cross-references removed; their ``*_with_xrefs``
    # counterparts return the section unchanged.

    @property
    def subcellular_location(self):

        return self.remove_xrefs(self.subcellular_location_with_xrefs)


    @property
    def tissue_specificity(self):

        return self.remove_xrefs(self.tissue_specificity_with_xrefs)


    @property
    def subunit(self):

        return self.remove_xrefs(self.subunit_with_xrefs)


    @property
    def interaction(self):

        return self.remove_xrefs(self.interaction_with_xrefs)


    @property
    def sequence_caution(self):

        return self.remove_xrefs(self.sequence_caution_with_xrefs)


    @property
    def catalytic_activity(self):

        return self.remove_xrefs(self.catalytic_activity_with_xrefs)


    @property
    def activity_regulation(self):

        return self.remove_xrefs(self.activity_regulation_with_xrefs)


    @property
    def alternative_products(self):

        return self.remove_xrefs(self.alternative_products_with_xrefs)


    @property
    def ptm(self):

        return self.remove_xrefs(self.ptm_with_xrefs)


    @property
    def disease(self):

        return self.remove_xrefs(self.disease_with_xrefs)


    @property
    def similarity(self):

        return self.remove_xrefs(self.similarity_with_xrefs)


    @property
    def web_resource(self):

        return self.remove_xrefs(self.web_resource_with_xrefs)


    @property
    def subcellular_location_with_xrefs(self):

        return self.info_section('SUBCELLULAR LOCATION')


    @property
    def tissue_specificity_with_xrefs(self):

        return self.info_section('TISSUE SPECIFICITY')


    @property
    def subunit_with_xrefs(self):

        return self.info_section('SUBUNIT')


    @property
    def interaction_with_xrefs(self):

        return self.info_section('INTERACTION')


    @property
    def sequence_caution_with_xrefs(self):

        return self.info_section('SEQUENCE CAUTION')


    @property
    def catalytic_activity_with_xrefs(self):

        return self.info_section('CATALYTIC ACTIVITY')


    @property
    def activity_regulation_with_xrefs(self):

        return self.info_section('ACTIVITY REGULATION')


    @property
    def alternative_products_with_xrefs(self):

        return self.info_section('ALTERNATIVE PRODUCTS')


    @property
    def ptm_with_xrefs(self):

        return self.info_section('PTM')


    @property
    def disease_with_xrefs(self):

        return self.info_section('DISEASE')


    @property
    def similarity_with_xrefs(self):

        return self.info_section('SIMILARITY')


    @property
    def web_resource_with_xrefs(self):

        return self.info_section('WEB RESOURCE')


    @property
    def lengths(self):
        """
        Returns the length of all isoforms as a list.
        """

        return [
            int(self._relength.search(sq).groups()[0])
            for sq in self.itertag('SQ')
        ]


    @property
    def weight(self):
        """
        Returns the molecular weight of the canonical isoform in Daltons.

        Returns ``None`` if the datasheet has no ``SQ`` line or the weight
        can not be found in it.
        """

        sq = next(self.itertag('SQ'), None)
        m = self._remw.search(sq) if sq is not None else None

        return int(m.groups()[0]) if m else None


    @property
    def weights(self):
        """
        Returns the molecular weights of all isoforms as a list.
        """

        return [
            int(self._remw.search(sq).groups()[0])
            for sq in self.itertag('SQ')
        ]


    @property
    def databases(self):
        """
        Returns the database identifiers (cross-references) as a dict of
        database names and identifiers.
        """

        if not hasattr(self, '_databases'):

            self.update_databases()

        return self._databases


    def update_databases(self):
        """
        Parses the ``DR`` lines into ``self._databases``.

        The values are sets; a record with one identifier is stored as a
        string, a record with more fields as a tuple. Fields consisting of
        a single dash are dropped, and the content of the optional
        trailing square brackets is appended as the last field.
        """

        result = collections.defaultdict(set)

        for db in self.itertag('DR'):

            m = self._redb.match(db)

            if not m:

                continue

            dbname, ids, subtype = m.groups()
            ids = tuple(
                _id
                for _id in self._redbsep.split(ids)
                if _id != '-'
            )

            if subtype:

                ids += (subtype,)

            result[dbname].add(ids[0] if len(ids) == 1 else ids)

        self._databases = dict(result)


    def info_section(self, title):
        """
        Retrieves a section from the description. If the section is not
        available, returns ``None``.
        """

        return self.info.get(title)


    @property
    def genesymbol(self):
        """
        Returns the gene name from the first ``GN`` line. Falls back to
        the accession if there is no ``GN`` line or no name in it.
        """

        try:

            m = self._rename.search(next(self.itertag('GN')))

            return m.groups()[0] if m else self.ac

        except StopIteration:

            return self.ac


    @property
    def keywords_with_xrefs(self):
        """
        Returns the keywords as a list with keeping the cross-references.
        """

        return [
            kw for kw in
            itertools.chain(
                *(
                    self._redbsep.split(kw.strip('.'))
                    for kw in self.itertag('KW')
                )
            )
            if kw
        ]


    @property
    def keywords(self):
        """
        Returns the keywords as a list.

        The cross-references are removed from each keyword separately,
        hence a datasheet without keywords results an empty list.
        """

        cleaned = (
            self.remove_xrefs(kw)
            for kw in self.keywords_with_xrefs
        )

        return [kw for kw in cleaned if kw]


    @classmethod
    def remove_xrefs(cls, value):
        """
        Removes the curly bracketed cross-references from a string.
        Empty values and ``None`` are returned unchanged.
        """

        return cls._rexref.sub('', value) if value else value


    @property
    def sequence(self):
        """
        Returns the canonical sequence (the first one) as a string of
        standard capital letter residue symbols.
        """

        result = []
        collect = False

        for tag, line in self:

            if not collect and tag == 'SQ':

                collect = True

            elif collect:

                # the sequence lines carry a tag of two spaces;
                # any other tag ends the first sequence
                if tag == '  ':

                    result.append(line)

                else:

                    break

        return ''.join(x.replace(' ', '') for x in result)


    def __iter__(self):
        """
        Iterates the ``(tag, line)`` pairs of the datasheet.
        """

        return self.raw.__iter__()


    def itertag(self, tag):
        """
        Yields the lines having the tag ``tag``.
        """

        for _tag, line in self:

            if _tag == tag:

                yield line


    def has_tag(self, tag):
        """
        Tells if any line of the datasheet has the tag ``tag``.
        """

        return any(line[0] == tag for line in self)


    def __repr__(self):

        return '<UniProt datasheet %s (%s)>' % (self.ac, self.genesymbol)



def _update_methods():
    """
    Registers a module level function for each attribute of
    ``UniprotProtein`` whose name does not start with an underscore.

    Each function takes a UniProt ID as its first argument, creates a
    ``UniprotProtein`` instance for it, and returns the value of the
    property or the result of the method call on that instance. The
    functions are attached to this module by ``common.add_method``, with
    the docstring of the class attribute.
    """

    def make_wrapper(attr_name):

        def method(uniprot_id, *args, **kwargs):

            attr = getattr(UniprotProtein(uniprot_id), attr_name)
            is_property = isinstance(
                getattr(UniprotProtein, attr_name),
                property,
            )

            # for a property `attr` is already the value;
            # methods are called with the remaining arguments
            return attr if is_property else attr(*args, **kwargs)

        return method

    this_module = sys.modules[__name__]
    public_names = [
        name
        for name in vars(UniprotProtein)
        if not name.startswith('_')
    ]

    for name in public_names:

        common.add_method(
            cls = this_module,
            method_name = name,
            method = make_wrapper(name),
            doc = getattr(UniprotProtein, name).__doc__,
        )


# executed at import: adds the module level counterparts of the
# public attributes of `UniprotProtein`
_update_methods()



[docs]
def query(*uniprot_ids):
    """
    Queries the datasheet of one or more UniProt IDs.

    The IDs can be provided either as separate arguments or as one list
    like object. They are passed through ``Entity.only_proteins``, and the
    records with empty datasheet are discarded.

    Returns a single ``UniprotProtein`` object if exactly one ID remained
    after ``only_proteins``, otherwise a list of those objects.
    """

    # one list like argument is considered to be the list of IDs
    if uniprot_ids and isinstance(uniprot_ids[0], _const.LIST_LIKE):

        uniprot_ids = uniprot_ids[0]

    protein_ids = entity.Entity.only_proteins(common.to_list(uniprot_ids))
    only_one = len(protein_ids) == 1

    datasheets = [UniprotProtein(protein_id) for protein_id in protein_ids]
    loaded = [datasheet for datasheet in datasheets if datasheet.raw]

    if only_one:

        return common.first(loaded)

    return loaded




[docs]
def collect(uniprot_ids, *features):
    """
    Collects data about one or more UniProt IDs.

    :param str,list uniprot_ids:
        One or more UniProt IDs.
    :param *str,list features:
        Features to query: these must be method (property) names of the
        ``UniprotProtein`` class. E.g. ``['ac', 'genesymbol', 'function']``.
        If none provided, ``default_features`` are used; ``ac`` is added
        as the first feature if not among the requested ones.

    :return:
        A ``collections.OrderedDict`` object with feature names as keys and
        list of values for each UniProt ID as values.
    """

    protein_ids = entity.Entity.only_proteins(uniprot_ids)

    # mainly for the removal of obsolete records, where the response
    # of the server is empty; most of the times nothing is removed
    datasheets = [
        datasheet
        for datasheet in [
            UniprotProtein(protein_id)
            for protein_id in protein_ids
        ]
        if datasheet.raw
    ]

    selected = features or default_features

    if 'ac' not in selected:

        selected = ['ac'] + list(selected)

    return collections.OrderedDict(
        (
            name,
            [getattr(datasheet, name) for datasheet in datasheets],
        )
        for name in selected
    )




[docs]
def features_table(
        uniprot_ids,
        *features,
        width = 40,
        maxlen = None,
        tablefmt = 'fancy_grid',
        **kwargs
    ):
    """
    Returns a table with the requested features of a list of UniProt IDs.
    The underlying table formatting module is ``tabulate``, a versatile
    module to export various ascii tables as well as HTML or LaTeX --
    check the docs for formatting options:
    https://github.com/astanin/python-tabulate

    Args
        uniprot_ids:
            UniProt IDs, passed to ``collect``.
        features:
            Feature names, passed to ``collect``; ``default_features``
            if none provided.
        width:
            Passed to ``common.table_format``.
        maxlen:
            Passed to ``common.table_format``; if not provided, the value
            of the ``uniprot_info_maxlen`` setting is used.
        tablefmt:
            Passed to ``common.table_format``.
        kwargs:
            Passed to ``tabulate.tabulate``.

    Returns
        The table as a string.
    """

    if not maxlen:

        maxlen = settings.get('uniprot_info_maxlen')

    if not features:

        features = default_features

    data = collect(uniprot_ids, *features)
    format_args = dict(
        kwargs,
        width = width,
        maxlen = maxlen,
        tablefmt = tablefmt,
    ) if not {'width', 'maxlen', 'tablefmt'} & set(kwargs) else None

    if format_args is not None:

        return common.table_format(data, **format_args)

    # unreachable in practice: `width`, `maxlen` and `tablefmt` are named
    # parameters, so these can never end up in `kwargs`
    return common.table_format(
        data,
        width = width,
        maxlen = maxlen,
        tablefmt = tablefmt,
        **kwargs
    )




[docs]
def print_features(
        uniprot_ids,
        *features,
        fileobj = None,
        width = None,
        maxlen = None,
        tablefmt = 'fancy_grid',
        **kwargs
    ):
    """
    Prints a table with the requested features of a list of UniProt IDs.
    The underlying table formatting module is ``tabulate``, a versatile
    module to export various ascii tables as well as HTML or LaTeX --
    check the docs for formatting options:
    https://github.com/astanin/python-tabulate

    Args
        uniprot_ids:
            UniProt IDs, passed to ``features_table``.
        features:
            Feature names; ``default_features`` if none provided.
        fileobj:
            A writable file like object; ``sys.stdout`` by default.
        width:
            Column width. If provided, always used as it is; otherwise
            derived from the width of the terminal and the number of
            features, falling back to 40 if that gives no positive value.
        maxlen:
            If not provided, the value of the ``uniprot_info_maxlen``
            setting is used.
        tablefmt:
            Passed to ``features_table``.
        kwargs:
            Passed to ``tabulate.tabulate``.
    """

    maxlen = maxlen or settings.get('uniprot_info_maxlen')
    features = features or default_features
    fileobj = fileobj or sys.stdout

    if not width:

        # heuristic total width budget derived from the terminal width
        term_width = (shutil.get_terminal_size().columns - 60) * 2 + 40
        width = int(term_width / len(features))

        # on narrow terminals the budget is zero or negative
        if width < 1:

            width = 40

    fileobj.write(
        features_table(
            uniprot_ids,
            *features,
            width = width,
            maxlen = maxlen,
            tablefmt = tablefmt,
            **kwargs
        )
    )
    fileobj.write(os.linesep)
    fileobj.flush()




[docs]
def info(
        *uniprot_ids,
        features = None,
        fileobj = None,
        header = None,
        **kwargs
    ):
    """
    Prints a table with the most important (or the requested) features of a
    list of UniProt IDs.

    Args
        uniprot_ids:
            UniProt IDs, either as separate arguments or as one list
            like object.
        features:
            Feature names; ``default_features`` if not provided.
        fileobj:
            A writable file like object; ``sys.stdout`` by default.
        header:
            Text written before the table. By default a line with the
            number of the IDs of protein entity type.
        kwargs:
            Passed to ``print_features``.
    """

    # one list like argument is considered to be the list of IDs;
    # the test is on the argument itself, as in `query`
    if (
        len(uniprot_ids) == 1 and
        isinstance(uniprot_ids[0], _const.LIST_LIKE)
    ):

        uniprot_ids = uniprot_ids[0]

    uniprot_ids = common.to_list(uniprot_ids)
    features = features or default_features
    fileobj = fileobj or sys.stdout

    if not header:

        n_proteins = len(
            list(
                entity.Entity.filter_entity_type(
                    uniprot_ids,
                    entity_type = 'protein',
                )
            )
        )
        header = '=====> [%u proteins] <=====\n' % n_proteins

    fileobj.write(header)

    print_features(
        uniprot_ids,
        *features,
        fileobj = fileobj,
        **kwargs
    )




[docs]
def browse(groups, start = 0, fileobj = None, **kwargs):
    """
    Browses through a series of protein groups, printing an information table
    for each group. ``kwargs`` passed to ``info`` and then to
    ``print_features``. Parameters for ``common.table_format`` can be
    provided.

    The groups are visited in the sorted order of their labels. After each
    table a line is read from the standard input: ``q`` quits; a number
    prints the same group again with that number as ``maxlen``; anything
    else proceeds to the next group.

    Args
        groups:
            A dict like object of labels and groups; a group is either
            a collection of UniProt IDs or an object having such
            collection in its ``members`` attribute.
        start:
            The number (counted from 1) of the first group to show.
        fileobj:
            A writable file like object; ``sys.stdout`` by default.
    """

    out = fileobj or sys.stdout
    default_maxlen = kwargs.get('maxlen', 500)
    ordered = sorted(groups.keys())
    total = len(ordered)
    quit_requested = False

    for position, label in enumerate(ordered, start = 1):

        if position < start:

            continue

        # a `maxlen` typed in for the previous group is not carried over
        kwargs['maxlen'] = default_maxlen

        members = groups[label]
        members = getattr(members, 'members', members)

        header = '[%u/%u] =====> %s <===== [%u proteins]\n' % (
            position,
            total,
            label,
            len(members),
        )

        while True:

            info(members, fileobj = out, header = header, **kwargs)

            answer = input()

            if answer == 'q':

                quit_requested = True
                break

            if answer.isdigit():

                kwargs['maxlen'] = int(answer)
                continue

            out.write(os.linesep * 2)
            break

        if quit_requested:

            break

    # the closing line break goes to the standard output
    # even if `fileobj` is provided
    sys.stdout.write(os.linesep)
    sys.stdout.flush()



# The features (attribute names of `UniprotProtein`) used by `collect`,
# `features_table`, `print_features` and `info` if none are requested.
default_features = (
    'ac',
    'genesymbol',
    'length',
    'weight',
    'full_name',
    'function_or_genecards',
    'keywords',
    'subcellular_location',
)