Source code for pypath.inputs.uniprot

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

from future.utils import iteritems

from typing import Iterable

import re
import json
import collections
import itertools
import functools
import urllib.parse

import pandas as pd

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.settings as settings
import pypath.share.session as session_mod
import pypath.share.common as common
import pypath_common._constants as _const
import pypath.utils.taxonomy as taxonomy
from pypath.inputs.uniprot_idmapping import idtypes as idmapping_idtypes

_logger = session_mod.Logger(name = 'uniprot_input')

_redatasheet = re.compile(r'([A-Z\s]{2})\s*([^\n\r]+)[\n\r]+')

# regex for matching UniProt AC format
# from https://www.uniprot.org/help/accession_numbers
reac = re.compile(
    r'[OPQ][0-9][A-Z0-9]{3}[0-9]|'
    r'[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}'
)
_rename = re.compile(r'Name=([\w\(\)-]+)\W')
_retaxid = re.compile(r'=(\d+)[^\d]')


def _all_uniprots(organism = 9606, swissprot = None):

    swissprot = _swissprot_param(swissprot)
    rev = '' if swissprot is None else ' AND reviewed: %s' % swissprot
    url = urls.urls['uniprot_basic']['url']
    get = {
        'query': 'organism_id:%s%s' % (str(organism), rev),
        'format': 'tsv',
        'fields': 'accession',
    }

    if organism == '*':
        get['query'] = rev.strip(' AND ')

    c = curl.Curl(url, get = get, silent = False, slow = True)
    data = c.result

    return {
        l.strip() for l in data.split('\n')[1:] if l.strip()
    }


def _swissprot_param(swissprot):

    return (
        'true'
            if swissprot in {'true', 'True', 'yes', 'YES', True} else
        'false'
            if swissprot in {'false', 'False', 'no', 'NO', False} else
        None
    )


def valid_uniprot(name):
    """
    Checks if ``name`` fits the format requirements for UniProt accession
    numbers.
    """

    return bool(reac.match(name))
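
# Illustrative usage (not part of the original module); the accession below
# is the reviewed human EGFR entry, shown only as an example:
#
#   >>> valid_uniprot('P00533')
#   True
#   >>> valid_uniprot('not-an-accession')
#   False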

def protein_datasheet(identifier):

    url = urls.urls['uniprot_basic']['datasheet'] % identifier.strip()
    datasheet = _protein_datasheet(url)

    if not datasheet:

        _logger._log(
            'UniProt ID `%s` returns empty response, it might be an old '
            'ID which has been deleted from the database. Attempting to '
            'find its history and retrieve either an archived version or '
            'the new ID which replaced this one.' % identifier
        )

        return uniprot_history_recent_datasheet(identifier)

    else:

        return datasheet

def deleted_uniprot_genesymbol(identifier):
    """
    Retrieves the archived datasheet for a deleted UniProt ID and returns
    the Gene Symbol and the NCBI Taxonomy ID from the datasheet.
    """

    datasheet = uniprot_history_recent_datasheet(identifier)
    genesymbol = None
    ncbi_tax_id = None

    for tag, line in datasheet:

        if tag == 'GN':

            m = _rename.search(line.strip())

            if m:

                genesymbol = m.groups()[0]

        if tag == 'OX':

            ncbi_tax_id = int(_retaxid.search(line).groups()[0])
            break

    return genesymbol, ncbi_tax_id

def _protein_datasheet(url):

    cache = True

    for a in range(3):

        c = curl.Curl(
            url,
            silent = True,
            large = False,
            cache = cache,
            connect_timeout = (
                settings.get('uniprot_datasheet_connect_timeout')
            ),
            timeout = settings.get('uniprot_datasheet_timeout'),
        )

        if not c.result or c.result.startswith('<!DOCTYPE'):

            cache = False

        else:

            break

    if not c.result:

        _logger._log(
            'Could not retrieve UniProt datasheet by URL `%s`.' % url
        )

    return _redatasheet.findall(c.result) if c.result else []

def uniprot_history_recent_datasheet(identifier):

    recent_version = uniprot_recent_version(identifier)

    if recent_version:

        if recent_version.replaced_by:

            new = recent_version.replaced_by.split(';')[0]
            url = urls.urls['uniprot_basic']['datasheet'] % new

            _logger._log(
                'UniProt ID `%s` is obsolete, has been replaced by '
                '`%s`: `%s`.' % (
                    identifier,
                    new,
                    url,
                )
            )

            return protein_datasheet(new)

        else:

            version = int(recent_version.entry_version)
            url = '%s?version=%u' % (
                urls.urls['uniprot_basic']['datasheet'] % identifier,
                version,
            )

            _logger._log(
                'UniProt ID `%s` is obsolete, downloading archived '
                'version %u: `%s`.' % (
                    identifier,
                    version,
                    url,
                )
            )

            c = curl.Curl(url, silent = True, large = False)

            return _protein_datasheet(url)

    return []

UniprotRecordHistory = collections.namedtuple(
    'UniprotRecordHistory',
    [
        'entry_version',
        'sequence_version',
        'entry_name',
        'database',
        'number',
        'date',
        'replaces',
        'replaced_by',
    ],
)

def uniprot_history(identifier):
    """
    Retrieves the history of a record.
    Returns a generator iterating over the history from most recent to
    the oldest.
    """

    if valid_uniprot(identifier):

        url_history = urls.urls['uniprot_basic']['history'] % identifier
        c_history = curl.Curl(
            url_history,
            silent = True,
            large = True,
        )

        if c_history.result:

            line0 = next(c_history.result)

            if not line0.startswith('<!DOCTYPE'):

                for line in c_history.result:

                    if line:

                        yield UniprotRecordHistory(
                            *(
                                field.strip()
                                for field in line.split('\t')
                            )
                        )
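
# Illustrative usage (not part of the original module): walking the history
# of an identifier from the most recent version backwards; the accession is
# only a placeholder, any UniProt ID with a valid format works here.
#
#   for version in uniprot_history('P00533'):
#       print(version.entry_version, version.date, version.replaced_by)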

def uniprot_recent_version(identifier):

    for version in uniprot_history(identifier):

        if (
            (
                version.entry_version != '0' and
                version.entry_name != 'null'
            ) or
            version.replaced_by
        ):

            return version

def uniprot_deleted(confirm = True):

    return swissprot_deleted() | trembl_deleted(confirm = confirm)

def _uniprot_deleted(swissprot = True, confirm = True):

    if not swissprot and confirm:

        resp = input(
            'Loading the list of deleted TrEMBL IDs requires '
            '>5GB memory. Do you want to proceed [y/n] '
        )

        if not resp or resp[0].lower() != 'y':

            return set()

    key = 'deleted_%s' % ('sp' if swissprot else 'tr')
    url = urls.urls['uniprot_basic'][key]

    c = curl.Curl(url, silent = False, large = True)

    result = set()

    for line in c.result:

        m = reac.match(line.strip())

        if m:

            result.add(m.groups()[0])

    return result

def swissprot_deleted():

    return _uniprot_deleted(swissprot = True)

def trembl_deleted(confirm = True):

    return _uniprot_deleted(swissprot = False, confirm = confirm)

def get_uniprot_sec(organism = 9606):
    """
    Downloads and processes the mapping between secondary and primary
    UniProt IDs.

    Yields pairs of secondary and primary UniProt IDs.

    :param int organism:
        NCBI Taxonomy ID of the organism.
    """

    _organism = organism not in (None, _const.NOT_ORGANISM_SPECIFIC)

    if _organism:

        from pypath.inputs import uniprot_db

        proteome = uniprot_db.all_uniprots(organism = organism)
        proteome = set(proteome)

    sec_pri = []
    url = urls.urls['uniprot_sec']['url']
    c = curl.Curl(url, silent = False, large = True, timeout = 2400)

    for i, line in enumerate(c.result):

        if i < 30:

            continue

        line = line.split()

        if len(line) == 2 and (not _organism or line[1] in proteome):

            yield line
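
# Illustrative usage (not part of the original module): collecting the
# secondary-to-primary mapping into a dict of sets for the default
# organism (human).
#
#   sec_to_pri = collections.defaultdict(set)
#
#   for sec, pri in get_uniprot_sec(organism = 9606):
#       sec_to_pri[sec].add(pri)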

class UniprotQuery:

    _PROCESS = {
        'dict': '_process_dict',
        'list': '_process_list',
    }
    _OP = ('_AND', '_NOT', '_OR')
    _OPSTART = re.compile(r'^(OR|AND)')
    _OPEND = re.compile(r'(OR|AND)$')
    _FIELDSEP = re.compile(r'[\s;]')
    _FIELDEND = re.compile(r'$;')
    _SYNONYMS = {
        'organism': 'organism_id',
        'ncbi_tax_id': 'organism_id',
    }
    _FIELD_SYNONYMS = {
        'function': 'cc_function',
        'activity_regulation': 'cc_activity_regulation',
        'tissue_specificity': 'cc_tissue_specificity',
        'developmental_stage': 'cc_developmental_stage',
        'induction': 'cc_induction',
        'intramembrane': 'ft_intramem',
        'signal_peptide': 'ft_signal',
        'subcellular_location': 'cc_subcellular_location',
        'transmembrane': 'ft_transmem',
        'comment': 'cc_miscellaneous',
        'topological_domain': 'ft_topo_dom',
        'family': 'protein_families',
        'interactor': 'cc_interaction',
        'keywords': 'keyword',
    }

    def __init__(
            self,
            *query,
            fields: str | Iterable[str] | None = None,
            **kwargs
        ):
        """
        Constructs a query for the UniProt REST API.

        Args:
            query:
                Query elements: can be a ready query or its components,
                bypassing the processing in this function or performing
                only simple concatenation. Alternatively, it can be a
                nested structure of lists and dicts describing more
                complex queries. See the examples below.
            kwargs:
                Same as passing a dict to ``query``.

        Details:
            The query can be built in several ways:
            - Simple string or concatenation of strings:
              query_builder('kinase AND organism_id:9606')
              query_builder('kinase', 'organism_id:9606')
              query_builder('kinase', organism_id = 9606)
              The above 3 examples all return the same query:
              `kinase AND organism_id:9606`
            - The default operator within lists is `OR` and within dicts
              is `AND`:
              query_builder(organism = [9606, 10090, 10116])
              `organism_id:9606 OR organism_id:10090 OR organism_id:10116`
              query_builder({'organism_id': 9606, 'reviewed': True})
              `organism_id:9606 AND reviewed:true`
            - These default operators can be changed by including the
              `op` key in dicts or including the operator with an
              underscore in lists:
              query_builder({'length': (500,), 'mass': (50000,), 'op': 'OR'})
              `length:[500 TO *] OR mass:[50000 TO *]`
              query_builder(lit_author = ['Huang', 'Kovac', '_AND'])
              `lit_author:Huang AND lit_author:Kovac`
            - The nested structures translate into nested parentheses in
              the query:
              query_builder({'organism_id': [9606, 10090], 'reviewed': True})
              `(organism_id:9606 OR organism_id:10090) AND reviewed:true`
            - Values are converted to strings, intervals can be provided
              as tuples:
              query_builder({'length': (100, None), 'organism_id': 9606})
              `length:[100 TO *] AND organism_id:9606`

            For a complete reference of the available parameters, see
            https://www.uniprot.org/help/query-fields and
            https://www.uniprot.org/help/text-search for additional
            syntax elements. For the available fields refer to the
            ``_FIELD_SYNONYMS`` attribute of this class or the UniProt
            website: https://www.uniprot.org/help/return_fields

        Methods:
            __iter__:
                Perform the query and iterate over the lines in the
                results, skipping the header and the empty lines,
                stripping the linebreaks and splitting by tab.
                Yields: a list of fields for each line.

        Attributes:
            fail_on_empty:
                If set to True, an error will be raised if the UniProt
                API returns an empty response. By default no error is
                raised.
            name_process:
                If set to True, a different processing will be applied
                on the results. This is appropriate especially for
                identifier type fields.
        """

        self.fields = common.to_list(fields)
        self._args = query, kwargs
        self._process_main()
        # tolerate empty result: Curl returns None in case of an empty
        # file, but in case of UniProt, especially for under-researched
        # taxa, it can happen that there is no result for certain queries
        self.fail_on_empty = False
        self.name_process = False

    @classmethod
    def _value(
            cls,
            val: str | int | bool | tuple,
            field: str | None = None,
        ) -> str:

        field = cls._SYNONYMS.get(field, field)

        if field == 'organism_id':

            result = str(taxonomy.ensure_ncbi_tax_id(val) or val)

        elif isinstance(val, tuple):

            val = (tuple(map(cls._value, val)) + ('*',))[:2]
            result = '[%s TO %s]' % val

        elif val is None:

            if field == 'reviewed':

                result = ''
                field = None

            else:

                result = '*'

        elif isinstance(val, bool):

            result = str(val).lower()

        else:

            result = str(val)

        if field:

            result = f'{field}:{result}'

        return result


    def _process_main(self):

        query, kwargs = self._args
        op = kwargs.pop('_op', 'AND')
        query = list(query)
        query.append(kwargs)
        result = []

        for q in query:

            q = self._process(q).strip()

            if (
                result and
                q and
                not self._OPEND.match(result[-1]) and
                not self._OPSTART.match(q)
            ):

                result.append(op)

            if q:

                result.append(q)

        self.query = ' '.join(result)


    @classmethod
    def _process(
            cls,
            query: str | list | dict,
            field: str | None = None,
        ) -> str:

        method = cls._PROCESS.get(type(query).__name__, '_value')

        return getattr(cls, method)(query, field)


    @classmethod
    def _process_list(cls, query: list, field: str | None = None) -> str:

        op = '_OR'

        for _op in cls._OP:

            if _op in query:

                op = query.pop(query.index(_op))

        op = f' {op[1:]} '
        query = [cls._process(i, field) for i in query]

        return cls._par(op.join(query))


    @classmethod
    def _process_dict(cls, query: dict, field: str | None = None) -> str:

        query = query.copy()
        op = ' %s ' % query.pop('op', ' AND ').strip()

        result = op.join(
            it
            for k, v in query.items()
            if (it := cls._process(v, k))
        )

        return cls._par(result) if len(query) > 1 else result


    @staticmethod
    def _par(value: str) -> str:

        return f'({value})' if value else ''


    @property
    def _get(self) -> dict[str, str]:

        field_qs = ','.join(
            ['accession'] +
            [self._FIELD_SYNONYMS.get(f, f) for f in self.fields]
        )

        return {
            'query': self.query,
            'format': 'tsv',
            'fields': field_qs,
            'compressed': 'true',
        }


    @property
    def _baseurl(self) -> str:

        return urls.urls['uniprot_basic']['url']


    @property
    def url(self) -> str:
        """
        UniProt REST API URL (urlencoded).

        Returns:
            A valid query suitable for the UniProt REST API.
        """

        return f'{self._baseurl}?{urllib.parse.urlencode(self._get)}'


    @property
    def url_plain(self) -> str:
        """
        UniProt REST API URL (plain).
        """

        return urllib.parse.unquote_plus(self.url)

    def __iter__(self):

        c = curl.Curl(
            self._baseurl,
            get = self._get,
            silent = False,
            large = True,
            compr = 'gz',
        )

        # if the response is empty and `fail_on_empty` is disabled,
        # substitute a one element iterator so that only the header
        # consumption below happens and the loop yields nothing
        result = (
            c.result
                if c.result or self.fail_on_empty else
            iter([0])
        )

        # skip the header line
        _ = next(result)

        _proc0 = functools.partial(self._FIELDEND.sub, '')
        _proc1 = (
            self._FIELDSEP.split
                if self.name_process else
            common.identity
        )

        for line in result:

            line = line.strip('\n\r')

            if line.strip():

                yield [_proc1(_proc0(f)) for f in line.split('\t')]

    def perform(self) -> list[str] | dict[str, str] | dict[str, dict[str, str]]:
        """
        Perform the query and preprocess the result.

        Returns:
            - A list of UniProt IDs if no fields were provided.
            - A dict of UniProt IDs and corresponding field values if
              exactly one field was provided.
            - A dict with field names as top level keys and dicts of the
              kind described in the previous point as values.
        """

        _id, *variables = zip(*self)
        _id = list(map(common.sfirst, _id))

        if variables:

            result = {
                f: {i: v for i, v in zip(_id, vs) if i}
                for f, vs in zip(self.fields, variables)
            }
            result = (
                common.first(result.values())
                    if len(result) == 1 else
                result
            )

        else:

            result = list(_id)

        return result
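
# Illustrative usage (not part of the original module): a minimal sketch of
# building and running a query with two return fields; the query terms
# mirror the examples in the docstrings above.
#
#   q = UniprotQuery(
#       organism_id = 9606,
#       reviewed = True,
#       fields = ['length', 'mass'],
#   )
#   q.url_plain          # the plain REST API URL used by __iter__
#   data = q.perform()   # {'length': {...}, 'mass': {...}}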

def query_builder(*query, **kwargs) -> str:
    """
    Build a query for the UniProt web site and REST API.

    Args:
        query:
            Query elements: can be a ready query or its components,
            bypassing the processing in this function or performing only
            simple concatenation. Alternatively, it can be a nested
            structure of lists and dicts describing more complex queries.
            See the examples below.
        kwargs:
            Same as passing a dict to ``query``.

    Details:
        The query can be built in several ways:
        - Simple string or concatenation of strings:
          query_builder('kinase AND organism_id:9606')
          query_builder('kinase', 'organism_id:9606')
          query_builder('kinase', organism_id = 9606)
          The above 3 examples all return the same query:
          `kinase AND organism_id:9606`
        - The default operator within lists is `OR` and within dicts is
          `AND`:
          query_builder(organism = [9606, 10090, 10116])
          `organism_id:9606 OR organism_id:10090 OR organism_id:10116`
          query_builder({'organism_id': 9606, 'reviewed': True})
          `organism_id:9606 AND reviewed:true`
        - These default operators can be changed by including the `op`
          key in dicts or including the operator with an underscore in
          lists:
          query_builder({'length': (500,), 'mass': (50000,), 'op': 'OR'})
          `length:[500 TO *] OR mass:[50000 TO *]`
          query_builder(lit_author = ['Huang', 'Kovac', '_AND'])
          `lit_author:Huang AND lit_author:Kovac`
        - The nested structures translate into nested parentheses in the
          query:
          query_builder({'organism_id': [9606, 10090], 'reviewed': True})
          `(organism_id:9606 OR organism_id:10090) AND reviewed:true`
        - Values are converted to strings, intervals can be provided as
          tuples:
          query_builder({'length': (100, None), 'organism_id': 9606})
          `length:[100 TO *] AND organism_id:9606`

        For a complete reference of the available parameters, see
        https://www.uniprot.org/help/query-fields and
        https://www.uniprot.org/help/text-search for additional syntax
        elements.

    Returns:
        A query that can be inserted into the UniProt search field.
    """

    return UniprotQuery(*query, **kwargs).query

def uniprot_data(
        *query,
        fields: str | Iterable[str] | None = None,
        organism: str | int | None = 9606,
        reviewed: bool | None = True,
        **kwargs
    ) -> dict[str, str] | dict[str, dict[str, str]]:
    """
    Basic client for the UniProt REST API.

    Retrieves one or more fields from UniProt, by default for all reviewed
    (SwissProt) proteins of one organism.

    Args:
        query:
            Query elements: can be a ready query or its components,
            bypassing the processing in this function or performing only
            simple concatenation. Alternatively, it can be a nested
            structure of lists and dicts describing more complex queries.
            See the examples below.
        fields:
            One or more UniProt field names. See details.
        organism:
            Organism name or identifier, e.g. "human", or "Homo sapiens",
            or 9606.
        reviewed:
            Restrict the query to SwissProt (True), to TrEMBL (False), or
            cover both (None).
        kwargs:
            Same as passing a dict to ``query``.

    Details:
        The query can be built in several ways:
        - Simple string or concatenation of strings:
          query_builder('kinase AND organism_id:9606')
          query_builder('kinase', 'organism_id:9606')
          query_builder('kinase', organism_id = 9606)
          The above 3 examples all return the same query:
          `kinase AND organism_id:9606`
        - The default operator within lists is `OR` and within dicts is
          `AND`:
          query_builder(organism = [9606, 10090, 10116])
          `organism_id:9606 OR organism_id:10090 OR organism_id:10116`
          query_builder({'organism_id': 9606, 'reviewed': True})
          `organism_id:9606 AND reviewed:true`
        - These default operators can be changed by including the `op`
          key in dicts or including the operator with an underscore in
          lists:
          query_builder({'length': (500,), 'mass': (50000,), 'op': 'OR'})
          `length:[500 TO *] OR mass:[50000 TO *]`
          query_builder(lit_author = ['Huang', 'Kovac', '_AND'])
          `lit_author:Huang AND lit_author:Kovac`
        - The nested structures translate into nested parentheses in the
          query:
          query_builder({'organism_id': [9606, 10090], 'reviewed': True})
          `(organism_id:9606 OR organism_id:10090) AND reviewed:true`
        - Values are converted to strings, intervals can be provided as
          tuples:
          query_builder({'length': (100, None), 'organism_id': 9606})
          `length:[100 TO *] AND organism_id:9606`

        For a complete reference of the available parameters, see
        https://www.uniprot.org/help/query-fields and
        https://www.uniprot.org/help/text-search for additional syntax
        elements. For the available fields refer to the
        ``_FIELD_SYNONYMS`` attribute of the UniprotQuery class or the
        UniProt website: https://www.uniprot.org/help/return_fields

    Returns:
        - A list of UniProt IDs if no fields were provided.
        - A dict of UniProt IDs and corresponding field values if exactly
          one field was provided.
        - A dict with field names as top level keys and dicts of the kind
          described in the previous point as values.
    """

    for arg in ('organism', 'reviewed'):

        if locals()[arg] is not None:

            kwargs[arg] = locals()[arg]

    return uniprot_query(*query, fields = fields, **kwargs)

def uniprot_query(
        *query,
        fields: str | Iterable[str] | None = None,
        **kwargs
    ) -> dict[str, str] | dict[str, dict[str, str]]:
    """
    Basic client for the UniProt REST API.

    Args:
        query:
            Query elements: can be a ready query or its components,
            bypassing the processing in this function or performing only
            simple concatenation. Alternatively, it can be a nested
            structure of lists and dicts describing more complex queries.
            See the examples below.
        fields:
            One or more UniProt field names. See details.
        kwargs:
            Same as passing a dict to ``query``.

    Details:
        The query can be built in several ways:
        - Simple string or concatenation of strings:
          query_builder('kinase AND organism_id:9606')
          query_builder('kinase', 'organism_id:9606')
          query_builder('kinase', organism_id = 9606)
          The above 3 examples all return the same query:
          `kinase AND organism_id:9606`
        - The default operator within lists is `OR` and within dicts is
          `AND`:
          query_builder(organism = [9606, 10090, 10116])
          `organism_id:9606 OR organism_id:10090 OR organism_id:10116`
          query_builder({'organism_id': 9606, 'reviewed': True})
          `organism_id:9606 AND reviewed:true`
        - These default operators can be changed by including the `op`
          key in dicts or including the operator with an underscore in
          lists:
          query_builder({'length': (500,), 'mass': (50000,), 'op': 'OR'})
          `length:[500 TO *] OR mass:[50000 TO *]`
          query_builder(lit_author = ['Huang', 'Kovac', '_AND'])
          `lit_author:Huang AND lit_author:Kovac`
        - The nested structures translate into nested parentheses in the
          query:
          query_builder({'organism_id': [9606, 10090], 'reviewed': True})
          `(organism_id:9606 OR organism_id:10090) AND reviewed:true`
        - Values are converted to strings, intervals can be provided as
          tuples:
          query_builder({'length': (100, None), 'organism_id': 9606})
          `length:[100 TO *] AND organism_id:9606`

        For a complete reference of the available parameters, see
        https://www.uniprot.org/help/query-fields and
        https://www.uniprot.org/help/text-search for additional syntax
        elements. For the available fields refer to the
        ``_FIELD_SYNONYMS`` attribute of the UniprotQuery class or the
        UniProt website: https://www.uniprot.org/help/return_fields

    Returns:
        - A list of UniProt IDs if no fields were provided.
        - A dict of UniProt IDs and corresponding field values if exactly
          one field was provided.
        - A dict with field names as top level keys and dicts of the kind
          described in the previous point as values.
    """

    return UniprotQuery(*query, fields = fields, **kwargs).perform()
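
# Illustrative usage (not part of the original module): retrieving one
# field for all reviewed human proteins, resulting in a dict of UniProt IDs
# to the raw field values (the same call `uniprot_preprocess` relies on
# below).
#
#   locations = uniprot_data(
#       fields = 'subcellular_location',
#       organism = 9606,
#       reviewed = True,
#   )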

def uniprot_preprocess(field, organism = 9606, reviewed = True):

    relabel = re.compile(r'[A-Z\s]+:\s')
    reisoform = re.compile(r'\[[-\w\s]+\]:?\s?')
    retermsep = re.compile(r'\s?[\.,]\s?')
    reref = re.compile(r'\{[-\w :\|,\.]*\}')

    result = collections.defaultdict(set)

    data = uniprot_data(
        fields = field,
        organism = organism,
        reviewed = reviewed,
    )

    for uniprot, raw in iteritems(data):

        raw = raw.split('Note=')[0]
        raw = relabel.sub('', raw)
        raw = reref.sub('', raw)
        raw = reisoform.sub('', raw)
        raw = retermsep.split(raw)

        for item in raw:

            if item.startswith('Note'):

                continue

            item = item.split('{')[0]
            elements = tuple(
                it0
                for it0 in (
                    common.upper0(it.strip(' .;,'))
                    for it in item.split(';')
                )
                if it0
            )

            if elements:

                result[uniprot].add(elements)

    return result

def uniprot_locations(organism = 9606, reviewed = True):

    UniprotLocation = collections.namedtuple(
        'UniprotLocation',
        [
            'location',
            'features',
        ],
    )

    result = collections.defaultdict(set)

    data = uniprot_preprocess(
        field = 'subcellular_location',
        organism = organism,
        reviewed = reviewed,
    )

    for uniprot, locations in iteritems(data):

        for location in locations:

            result[uniprot].add(
                UniprotLocation(
                    location = location[0],
                    features = location[1:] or None,
                )
            )

    return dict(result)

def uniprot_keywords(organism = 9606, reviewed = True):

    UniprotKeyword = collections.namedtuple(
        'UniprotKeyword',
        [
            'keyword',
        ],
    )

    result = collections.defaultdict(set)

    data = uniprot_data(
        fields = 'keywords',
        organism = organism,
        reviewed = reviewed,
    )

    for uniprot, keywords in iteritems(data):

        for keyword in keywords.split(';'):

            result[uniprot].add(
                UniprotKeyword(
                    keyword = keyword.strip(),
                )
            )

    return dict(result)

def uniprot_families(organism = 9606, reviewed = True):

    refamily = re.compile(r'(.+) (?:super)?family(?:, (.*) subfamily)?')

    UniprotFamily = collections.namedtuple(
        'UniprotFamily',
        [
            'family',
            'subfamily',
        ],
    )

    result = collections.defaultdict(set)

    data = uniprot_data(
        fields = 'family',
        organism = organism,
        reviewed = reviewed,
    )

    for uniprot, family in iteritems(data):

        if not family:

            continue

        family, subfamily = refamily.search(family).groups()

        result[uniprot].add(
            UniprotFamily(
                family = family,
                subfamily = subfamily,
            )
        )

    return dict(result)

def uniprot_topology(organism = 9606, reviewed = True):

    retopo = re.compile(r'TOPO_DOM (\d+)\.\.(\d+);\s+/note="(\w+)"')
    retm = re.compile(r'(TRANSMEM|INTRAMEM) (\d+)\.\.(\d+);')

    UniprotTopology = collections.namedtuple(
        'UniprotTopology',
        [
            'topology',
            'start',
            'end',
        ],
    )

    result = collections.defaultdict(set)

    transmem = uniprot_data(
        fields = 'transmembrane',
        organism = organism,
        reviewed = reviewed,
    )
    intramem = uniprot_data(
        fields = 'intramembrane',
        organism = organism,
        reviewed = reviewed,
    )
    signal = uniprot_data(
        fields = 'signal_peptide',
        organism = organism,
        reviewed = reviewed,
    )
    data = uniprot_data(
        fields = 'topological_domain',
        organism = organism,
        reviewed = reviewed,
    )

    for uniprot, topo in iteritems(data):

        for topo_dom in retopo.findall(topo):

            start, end, topology = topo_dom
            start = int(start)
            end = int(end)

            result[uniprot].add(
                UniprotTopology(
                    topology = topology,
                    start = start,
                    end = end,
                )
            )

    for uniprot, tm in itertools.chain(
        iteritems(transmem),
        iteritems(intramem),
        iteritems(signal),
    ):

        for mem, start, end in retm.findall(tm):

            topology = (
                '%s%s' % (
                    mem.capitalize(),
                    'brane' if mem.endswith('MEM') else ''
                )
            )
            start = int(start)
            end = int(end)

            result[uniprot].add(
                UniprotTopology(
                    topology = topology,
                    start = start,
                    end = end,
                )
            )

    return dict(result)

def uniprot_tissues(organism = 9606, reviewed = True):

    reref = re.compile(r'\s?\{.*\}\s?')
    resep = re.compile(
        r',?(?:'
        r' in almost all |'
        r' but also in |'
        r' but also at |'
        r' within the |'
        r', in |'
        r' in |'
        r' but |'
        r', and |'
        r' and |'
        r' such as |'
        r' \(both |'
        r' as well as |'
        r' as |'
        r' or |'
        r' at the |'
        r' at |'
        r' including |'
        r' during |'
        r' especially |'
        r' to |'
        r' into |'
        r' = |'
        r' > |'
        r'; |'
        r', '
        r')(?=[^\d])'
    )
    relabel = re.compile(r'^TISSUE SPECIFICITY: ')
    repubmed = re.compile(r'\(?PubMed:?\d+\)?')
    respeci = re.compile(r'(\w+)[-\s]specific')
    rethe = re.compile(
        r'\s?(?:'
        r'[Tt]he |'
        r'[Ii]n |'
        r'[Ss]ome|'
        r'[Ii]n the|'
        r'[Ww]ithin the|'
        r'[Ww]ithin|'
        r'[Ii]nto|'
        r'[Ww]ith only|'
        r'[Ww]ith the|'
        r'[Ww]ith an|'
        r'[Ww]ith |'
        r'[Ii]s |'
        r'[Mm]any |'
        r'[Aa] variety of '
        r'[Aa] |'
        r'[Ii]t |'
        r'[Tt]o |'
        r'[Oo]n |'
        r'[Oo]f |'
        r'[Tt]hose |'
        r'[Ff]rom |'
        r'[Aa]lso|'
        r'[Bb]y |'
        r'[Pp]articularly|'
        r'[Pp]articular|'
        r'[Pp]atients|'
        r'[Aa]n |'
        r'\'|'
        r':|'
        r'/'
        r')?(.*)'
    )
    reand = re.compile(r'(?: and| of| from| or| than)$')
    replevel = re.compile(r'\(at \w+ levels?\)')
    reiso = re.compile(r'[Ii]soform \w+')
    reindef = re.compile(
        r'\w'
        r'(?:'
        r'ifferent parts of |'
        r'ariety of tissues |'
        r' variety of tissues |'
        r' number of |'
        r'everal regions of '
        r')'
    )

    level_kw = (
        ('low', 'low'), ('weak', 'low'), ('lesser extent', 'low'),
        ('minimal level', 'low'), ('decrease', 'low'), ('moderate', 'low'),
        ('barely', 'low'), ('minor level', 'low'), ('reduced', 'low'),
        ('lesser', 'low'), ('down-regulated', 'low'),
        ('high', 'high'), ('elevated', 'high'), ('strong', 'high'),
        ('prominent', 'high'), ('greatest level', 'high'),
        ('concentrated', 'high'), ('predominant', 'high'),
        ('increase', 'high'), ('enrich', 'high'), ('abundant', 'high'),
        ('primarily', 'high'), ('induced', 'high'),
        ('up-regulated', 'high'), ('up regulated', 'high'),
        ('expression is restricted', 'high'), ('amplified', 'high'),
        ('basal l', 'basal'),
        ('not detected', 'none'), ('absent', 'none'),
        ('expressed', 'undefined'), ('detect', 'undefined'),
        ('found', 'undefined'), ('present', 'undefined'),
        ('expression', 'undefined'), ('localized', 'undefined'),
        ('produced', 'undefined'), ('confined', 'undefined'),
        ('transcribed', 'undefined'), ('xpressed', 'undefined'),
        ('synthesized', 'undefined'), ('secreted', 'undefined'),
        ('seen', 'undefined'), ('prevalent', 'undefined'),
        ('released', 'undefined'), ('appears', 'undefined'),
        ('varying levels', 'undefined'), ('various levels', 'undefined'),
        ('identified', 'undefined'), ('observed', 'undefined'),
        ('occurs', 'undefined'),
    )
    wide_kw = (
        ('widely', 'wide'),
        ('wide tissue distribution', 'wide'),
        ('wide range of tissues', 'wide'),
        ('wide range of adult tissues', 'wide'),
        ('wide range of cells', 'wide'),
        ('wide variety of normal adult tissues', 'wide'),
        ('widespread', 'wide'),
        ('ubiquitous', 'ubiquitous'),
        ('variety of tissues', 'wide'),
        ('many tissues', 'wide'),
        ('many organs', 'wide'),
        ('various organs', 'wide'),
        ('various tissues', 'wide'),
    )
    tissue_exclude = {
        'Adult', 'All', 'Apparently not', 'Areas', 'Are likely', 'Both',
        'By contrast', 'Normal cells', 'Not only', 'A', '[]: Localized',
        'Early', 'Change from a quiescent', 'Central', 'Beta',
        'This layer', 'With little', 'Preferential occurrence',
        'Stage III', 'Take up', 'Hardly', 'Only seen', 'Prevalent',
        'Inner segment', 'Memory', 'Many fetal', 'Tissues', '0 kb',
        '9 kb', 'A 2', 'A 3', 'A 5', 'A 6', '1-7', '1b-1',
        '2 is widely', '8 and 4', 'Often amplified', 'Other', 'Others',
        'Those', 'Tissues examined', 'Tissues with', 'Tissues (e)',
        'Probably shed', 'Reports that', 'Primitive', 'Prolactin',
        'Overlap', 'A smaller 0', 'A smaller form', 'A smaltissues',
        'Different levels', 'Different amounts', 'Disappears',
        'Digestion', 'Very similar', 'Vivo', 'Contrary', 'Contrast',
        'Not', 'Not all', 'Has it', 'Has little', 'All stages', 'Soon',
        'Specific', 'Stage', 'Stage I', 'Stage II', 'Stages II', 'Ends',
        'A minor degree', 'A much smaller extent', 'Lost', 'Varies',
        'Various', 'Mostly restricted', 'Mostly', 'Most probably',
        'Much more stable', 'Naive', 'Neither', 'Nor', 'None',
    }
    exclude_startswith = (
        'Were', 'Where', 'Which', 'While', 'When', 'There', 'Their',
        'Then', 'These', 'Level', 'This', 'Almost', 'If', 'Control',
        'Be ', 'Although', 'Than', 'Addition',
    )
    exclude_in = (
        'kb transcript',
        'compared',
        'soform',
        'concentration of',
    )

    UniprotTissue = collections.namedtuple(
        'UniprotTissue',
        [
            'tissue',
            'level',
        ],
    )

    data = uniprot_data(
        fields = 'tissue_specificity',
        organism = organism,
        reviewed = reviewed,
    )

    result = collections.defaultdict(set)

    for uniprot, raw in iteritems(data):

        raw = relabel.sub('', raw)
        raw = reref.sub('', raw)
        raw = replevel.sub('', raw)
        raw = reiso.sub('', raw)
        raw = repubmed.sub('', raw)
        raw = reindef.sub('', raw)
        raw = raw.replace('adult and fetal', '')
        raw = raw.split('.')

        for phrase in raw:

            tokens = tuple(resep.split(phrase))
            level = None

            for token in tokens:

                level_token = False
                wide_token = False
                tissue = None
                token_lower = token.lower()

                for kw, lev in level_kw:

                    if kw in token_lower:

                        level = lev
                        level_token = True
                        break

                if level_token:

                    for kw, wide in wide_kw:

                        if kw in token_lower:

                            tissue = wide
                            wide_token = True
                            break

                if not level_token or wide_token:

                    if not wide_token:

                        specific = respeci.search(token)

                        tissue = (
                            specific.groups()[0].lower()
                                if specific else
                            token
                        )

                        if specific and not level:

                            level = 'high'

                    if tissue.strip():

                        if any(e in tissue for e in exclude_in):

                            continue

                        tissue = rethe.match(tissue).groups()[0]
                        tissue = rethe.match(tissue).groups()[0]
                        tissue = rethe.match(tissue).groups()[0]

                        if tissue.endswith('+'):

                            tissue = '%s cells' % tissue

                        tissue = tissue.strip(')(.,;- ')

                        if '(' in tissue and ')' not in tissue:

                            tissue = '%s)' % tissue

                        tissue = reand.sub('', tissue)
                        tissue = common.upper0(tissue)
                        tissue = tissue.replace('  ', ' ')

                        if any(
                            tissue.startswith(e)
                            for e in exclude_startswith
                        ):

                            continue

                        if tissue in tissue_exclude or len(tissue) < 3:

                            continue

                        result[uniprot].add(
                            UniprotTissue(
                                tissue = tissue,
                                level = level or 'undefined',
                            )
                        )

    return dict(result)

def uniprot_taxonomy(
        ncbi_tax_ids: bool = False,
    ) -> dict[str, set[str]] | dict[str, int]:
    """
    From UniProt IDs to organisms.

    Args:
        ncbi_tax_ids:
            Translate the names to NCBI Taxonomy numeric identifiers.

    Returns:
        A dictionary with SwissProt IDs as keys and sets of various taxon
        names as values.
    """

    rename = re.compile(r'\(?(\w[\w\s\',/\.-]+\w)\)?')
    reac = re.compile(r'\s*\w+\s+\(([A-Z\d]+)\)\s*,')

    url = urls.urls['uniprot_basic']['speindex']
    c = curl.Curl(url, large = True, silent = False)

    result = collections.defaultdict(set)

    for line in c.result:

        if line[0] != ' ':

            names = set(rename.findall(line))

        else:

            for ac in reac.findall(line):

                result[ac].update(names)

    if ncbi_tax_ids:

        new_result = {}

        for ac, names in result.items():

            for name in names:

                nti = taxonomy.ensure_ncbi_tax_id(name)

                if nti:

                    new_result[ac] = nti
                    break

        result = new_result

    return dict(result)
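
# Illustrative usage (not part of the original module): translating
# SwissProt IDs directly to NCBI Taxonomy IDs; for a human protein such as
# P00533 the expected value is 9606.
#
#   taxa = uniprot_taxonomy(ncbi_tax_ids = True)
#   taxa.get('P00533')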

Taxon = collections.namedtuple(
    'Taxon',
    [
        'ncbi_id',
        'latin',
        'english',
        'latin_synonym',
    ],
)
Taxon.__new__.__defaults__ = (None, None)

def uniprot_ncbi_taxids():

    url = urls.urls['uniprot_basic']['taxids']

    with settings.context(curl_timeout = 10000):

        c = curl.Curl(
            url,
            large = True,
            silent = False,
            compr = 'gz',
        )

    _ = next(c.result)

    result = {}

    for line in c.result:

        line = line.split('\t')

        if line[0].isdigit() and len(line) > 2:

            taxid = int(line[0])

            result[taxid] = Taxon(
                ncbi_id = taxid,
                latin = line[2],
                english = line[1] or None,
            )

    return result

def uniprot_ncbi_taxids_2():

    reline = re.compile(
        r'(?:([A-Z\d]+)\s+)?'           # code
        r'(?:([A-Z]))?\s+'              # kingdom
        r'(?:(\d+): )?'                 # NCBI Taxonomy ID
        r'([A-Z])='                     # name type
        r'([ \w\(\),/\.\'-]+)[\n\r\s]*' # the name
    )

    url = urls.urls['uniprot_basic']['speclist']
    c = curl.Curl(url, large = True, silent = False)

    result = {}
    entry = {}

    for line in c.result:

        m = reline.match(line)

        if m:

            _code, _kingdom, _taxid, _name_type, _name = m.groups()

            if _taxid:

                if entry and 'ncbi_id' in entry:

                    result[entry['ncbi_id']] = Taxon(**entry)

                entry = {}
                entry['ncbi_id'] = int(_taxid)

            if _name_type == 'N':

                entry['latin'] = _name

            elif _name_type == 'C':

                entry['english'] = _name

            elif _name_type == 'S':

                entry['latin_synonym'] = _name

    if entry and 'ncbi_id' in entry:

        result[entry['ncbi_id']] = Taxon(**entry)

    return result