# Source code for pypath.inputs.uniprot_idmapping

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2024
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

import json

import pandas as pd

import pypath.resources.urls as urls
import pypath.share.curl as curl



# [docs]
def idtypes(
        pairs: bool = True,
        raw: bool = False,
    ) -> dict[str, pd.DataFrame] | set[tuple[str, str]] | dict:
    """
    Identifier types in the UniProt ID mapping service.

    Retrieves the JSON document behind the `fields` URL of the
    `uniprot_idmapping` resource and, depending on the arguments,
    returns it as is, as data frames, or as pairs of ID types.

    Args:
        pairs:
            Process the data into pairs of identifiers. Has no effect
            if `raw` is `True`.
        raw:
            Return the raw data as extracted from JSON.

    Returns:
        The JSON contents as a dict if `raw` is `True`.
        Otherwise, if `pairs` is `True`, a set of tuples of two ID type
        names. If both are `False`, a dict of two data frames:
        `groups`, with one row per ID type (the group name next to the
        fields of the item, the `from` field renamed to `from_`), and
        `rules`, built from the `rules` section of the JSON.
    """

    url = urls.urls['uniprot_idmapping']['fields']
    c = curl.Curl(url, large = False, silent = False)
    data = json.loads(c.result)

    if raw:

        return data

    # unnest the `items` list of each group: one row per ID type
    exploded = (
        pd.DataFrame(data['groups']).
        explode('items').
        reset_index(drop = True)
    )
    # `from` is a Python keyword, hence the column is renamed
    groups = (
        pd.concat(
            [
                exploded['groupName'],
                pd.DataFrame(exploded['items'].tolist()),
            ],
            axis = 1,
        ).
        rename(columns = {'from': 'from_'})
    )

    rules = pd.DataFrame(data['rules'])

    if not pairs:

        return {'groups': groups, 'rules': rules}

    targets_by_rule = {int(r.ruleId): r.tos for r in rules.itertuples()}

    # ID types without a rule get the sentinel -1, which matches no rule;
    # only `ruleId` is filled, the other columns of `groups` stay intact
    rule_ids = groups['ruleId'].fillna(-1).astype(int)

    result = set()

    for name, is_source, is_target, rule_id in zip(
        groups['name'],
        groups['from_'],
        groups['to'],
        rule_ids,
    ):

        from_to = {(name, t) for t in targets_by_rule.get(rule_id, [])}

        if is_source:

            result.update(from_to)

        if is_target:

            # NOTE(review): the reversed pairs assume that mapping is
            # possible in the opposite direction too -- confirm against
            # the rules of the service
            result.update({pair[::-1] for pair in from_to})

    return result