Source code for pypath.inputs.humsavar

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import re
import collections

import pypath.resources.urls as urls
import pypath.share.curl as curl

redesc = re.compile(
    r'([^\(^\[]+[^\s^\(^\[])\s?'
    r'(?:\((\w+)\))?\s?'
    r'(?:\[(MIM:\d+)\])?'
)
reempty = re.compile(r'^-?\s?$')


def _parse_desc(desc):

        return (
            (None,) * 3
                if reempty.match(desc) else
            redesc.match(desc).groups()
        )



[docs]
def uniprot_variants() -> dict[str, set[tuple]]:
    """
    Retrieves all human missense variants annotated in UniProtKB/Swiss-Prot.

    Returns:
        Drug attributes in below as a list of named tuples.
    """

    UniprotVariant = collections.namedtuple(
            'UniprotVariant',
            (
                'genesymbol',
                'ftid',
                'aa_change',
                'variant_category',
                'dbsnp',
                'disease',
                'disease_symbol',
                'disease_omim'
            ),
        )

    url = urls.urls['humsavar']['url']
    c = curl.Curl(url, large=True, silent=False)

    result = collections.defaultdict(set)
    respace = re.compile(r'\s+')

    # skipping data description information in txt file
    for r in c.result:

        if r.startswith('_'):

            break

    for line in c.result:

        line = respace.split(line.strip())

        if len(line) == 1:

            break

        disease, symbol, omim = _parse_desc(' '.join(line[6:]))

        variant = UniprotVariant(
                line[0],
                *line[2:6],
                disease = disease,
                disease_symbol = symbol,
                disease_omim = omim,
        )

        result[line[1]].add(variant)

    return dict(result)