Source code for pypath.inputs.homologene

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import collections
import itertools

import pypath.utils.mapping as mapping
import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.inputs.uniprot_db as uniprot_db
import pypath.share.session as session

# Logger for this module, registered under the name `homologene_input`.
_logger = session.Logger(name = 'homologene_input')
# Module-level shortcut to the logger's `_log` method; the functions below
# call it to report problems (e.g. an unknown ID type).
_log = _logger._log



[docs]
def get_homologene():
    """
    Fetches the NCBI HomoloGene data file.

    The address is read from the `url_rescued` entry of the `homologene`
    record in `pypath.resources.urls`.

    Returns
        The `result` attribute of the `Curl` download object; callers in
        this module iterate over it line by line.
    """

    # The download is requested in `large` mode with a generous timeout
    # (1800, presumably seconds) and without checking the content length.
    download_options = {
        'url': urls.urls['homologene']['url_rescued'],
        'silent': False,
        'large': True,
        'timeout': 1800,
        'ignore_content_length': True,
    }

    download = curl.Curl(**download_options)

    return download.result




[docs]
def homologene_dict(source, target, id_type):
    """
    Returns orthology translation table as dict, obtained
    from NCBI HomoloGene data.

    Within each homology group every identifier of the source organism
    is paired with every identifier of the target organism, so groups
    that list several genes for one organism (paralogs) are fully
    represented.

    Args
        source (int): NCBI Taxonomy ID of the source species (keys).
        target (int): NCBI Taxonomy ID of the target species (values).
        id_type (str): ID type to be used in the dict. Possible values
            (case insensitive): 'RefSeq', 'RefSeqP', 'Entrez', 'GI',
            'GeneSymbol'.

    Returns
        Dict of sets: keys are IDs of the source organism, values are sets
        of IDs of the target organism.

    Raises
        KeyError: if `id_type` is not one of the supported ID types.
        ValueError: if the taxonomy ID field of a row is not an integer.
    """

    # Zero based column indices of the identifiers in the tab separated
    # HomoloGene table; column 0 is the homology group ID and column 1
    # is the taxonomy ID. `refseq` and `refseqp` are synonyms here, both
    # refer to the protein accession column.
    ids = {
        'refseq': 5,
        'refseqp': 5,
        'genesymbol': 3,
        'gi': 4,
        'entrez': 2
    }

    try:
        id_col = ids[id_type.lower()]
    except KeyError:
        _log(
            'Unknown ID type: `%s`. Please use RefSeq, '
            'Entrez, GI or GeneSymbol.' % id_type
        )
        raise

    result = collections.defaultdict(set)

    # Blank or truncated lines (fewer than two fields) carry no usable
    # record, so they are skipped instead of raising IndexError.
    rows = (
        fields
        for fields in (
            line.strip().split('\t')
            for line in get_homologene()
        )
        if len(fields) > 1
    )

    # Rows of one homology group are assumed to be contiguous in the
    # file, hence grouping consecutive rows by the group ID is enough.
    for _, members in itertools.groupby(
        rows,
        key = lambda fields: fields[0].strip()
    ):

        source_ids = set()
        target_ids = set()

        for fields in members:

            taxon = int(fields[1].strip())

            # The source is checked first: if source and target are the
            # same organism nothing is ever collected as target.
            if taxon == source:

                bucket = source_ids

            elif taxon == target:

                bucket = target_ids

            else:

                continue

            # Missing or empty identifiers can not be used for translation.
            if len(fields) > id_col and fields[id_col]:

                bucket.add(fields[id_col])

        if target_ids:

            for source_id in source_ids:

                result[source_id].update(target_ids)

    return dict(result)




[docs]
def homologene_uniprot_dict(source, target, only_swissprot = True):
    """
    Returns orthology translation table as dict from UniProt to Uniprot,
    obtained from NCBI HomoloGene data. Uses RefSeq and Entrez IDs for
    translation.

    Args
        source (int): NCBI Taxonomy ID of the source species (keys).
        target (int): NCBI Taxonomy ID of the target species (values).
        only_swissprot (bool): Use only SwissProt IDs of the source
            organism; if False, the TrEMBL IDs are included too.

    Returns
        Dict of lists: keys are UniProt IDs of the source organism, values
        are sorted lists of unique UniProt IDs of the target organism.
        Every source UniProt ID considered is present as a key, with an
        empty list if no ortholog has been found.
    """

    result = {}

    # HomoloGene translation tables keyed by Entrez Gene ID and by
    # RefSeq protein accession, respectively.
    hge = homologene_dict(source, target, 'entrez')
    hgr = homologene_dict(source, target, 'refseq')

    all_source = set(uniprot_db.all_uniprots(
        organism = source,
        swissprot = 'YES',
    ))

    if not only_swissprot:

        all_source.update(
            uniprot_db.all_uniprots(
                organism = source,
                swissprot = 'NO',
            )
        )

    for u in all_source:

        # step 1: source UniProt -> source Entrez and RefSeq protein IDs
        source_e = mapping.map_name(u, 'uniprot', 'entrez', source)
        source_r = mapping.map_name(u, 'uniprot', 'refseqp', source)

        # step 2: source IDs -> target IDs by HomoloGene
        target_e = set()
        target_r = set()

        for e in source_e:

            target_e.update(hge.get(e, ()))

        for r in source_r:

            target_r.update(hgr.get(r, ()))

        # step 3: target Entrez and RefSeq protein IDs -> target UniProt
        target_u = set()

        for e in target_e:

            target_u.update(
                mapping.map_name(e, 'entrez', 'uniprot', target)
            )

        for r in target_r:

            # each RefSeq ID has to be translated here (the loop variable
            # of the Entrez loop above must not be reused)
            target_u.update(
                mapping.map_name(r, 'refseqp', 'uniprot', target)
            )

        # step 4: UniProt -> UniProt mapping of the target IDs
        # (presumably normalizes them to primary accessions -- see
        # `mapping.map_name`); the set removes the duplicates arising
        # when several IDs map to the same one.
        target_final = set(
            itertools.chain.from_iterable(
                mapping.map_name(tu, 'uniprot', 'uniprot', target)
                for tu in target_u
            )
        )

        result[u] = sorted(target_final)

    return result