# Source code for pypath.inputs.homologene

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import collections
import itertools

import pypath.utils.mapping as mapping
import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.inputs.uniprot_db as uniprot_db
import pypath.share.session as session

# Module-level logger registered under the name 'homologene_input';
# `_log` is a shortcut to its logging method, used by the functions
# in this module to report problems (e.g. an unknown ID type).
_logger = session.Logger(name = 'homologene_input')
_log = _logger._log


def get_homologene():
    """
    Downloads the NCBI HomoloGene database from the address configured
    under ``urls.urls['homologene']['url_rescued']``.

    Returns
        The ``result`` attribute of the ``curl.Curl`` download object.
        ``homologene_dict`` in this module iterates over it line by
        line, so it is presumably a file pointer like iterable of
        tab separated text lines (to be confirmed against ``curl.Curl``).
    """

    curl_args = {
        'url': urls.urls['homologene']['url_rescued'],
        'silent': False,
        'large': True,
        # generous timeout (1800, presumably seconds) for this download
        'timeout': 1800,
        'ignore_content_length': True,
    }

    download = curl.Curl(**curl_args)

    return download.result
def homologene_dict(source, target, id_type):
    """
    Returns orthology translation table as dict, obtained
    from NCBI HomoloGene data.

    Every ID of the source organism within a homology group is paired
    with every ID of the target organism within the same group,
    independently of the order of the records inside the group.

    Args
        source (int): NCBI Taxonomy ID of the source species (keys).
        target (int): NCBI Taxonomy ID of the target species (values).
        id_type (str): ID type to be used in the dict. Possible values:
            'RefSeq', 'Entrez', 'GI', 'GeneSymbol' (case insensitive;
            'RefSeqP' is accepted as a synonym of 'RefSeq').

    Returns
        Dict of sets: keys are IDs of the source organism, values
        are sets of IDs of the target organism. Only source IDs with
        at least one target ID in their homology group are included.

    Raises
        KeyError: if `id_type` is not one of the supported ID types
            (the problem is also written to the log).
    """

    # index of the column holding each ID type in the tab separated data;
    # column 0 is the homology group ID, column 1 is the taxonomy ID
    ids = {
        'refseq': 5,
        'refseqp': 5,
        'genesymbol': 3,
        'gi': 4,
        'entrez': 2,
    }

    try:

        id_col = ids[id_type.lower()]

    except KeyError:

        _log(
            'Unknown ID type: `%s`. Please use RefSeq, '
            'Entrez, GI or GeneSymbol.' % id_type
        )
        raise

    result = collections.defaultdict(set)
    hgroup = None
    # IDs collected from the homology group currently being read
    group_sources = set()
    group_targets = set()

    def commit_group():
        # Adds all source-target pairs of the current group to the result.
        # Nothing is added unless both organisms are present in the group,
        # so the result never contains keys with empty sets.
        if group_targets:

            for source_id in group_sources:

                result[source_id].update(group_targets)

    for line in get_homologene():

        line = line.strip()

        if not line:

            # blank lines carry no record
            continue

        fields = line.split('\t')
        this_hgroup = fields[0].strip()

        if this_hgroup != hgroup:

            # a new homology group starts: close the previous one
            commit_group()
            group_sources.clear()
            group_targets.clear()
            hgroup = this_hgroup

        this_taxon = int(fields[1].strip())

        if this_taxon == source:

            group_ids = group_sources

        elif this_taxon == target:

            group_ids = group_targets

        else:

            continue

        # `strip` above removes an empty trailing field, hence a missing
        # ID column is handled the same way as an empty ID: it is ignored
        this_id = fields[id_col] if len(fields) > id_col else ''

        if this_id:

            group_ids.add(this_id)

    # the last group is not followed by a group change
    commit_group()

    return dict(result)
def homologene_uniprot_dict(source, target, only_swissprot = True):
    """
    Returns orthology translation table as dict from UniProt to Uniprot,
    obtained from NCBI HomoloGene data. Uses RefSeq and Entrez IDs for
    translation.

    Each source UniProt ID is translated to Entrez and RefSeq protein
    IDs, these are translated to their orthologues by HomoloGene, and
    the orthologues are translated back to UniProt IDs of the target
    organism.

    Args
        source (int): NCBI Taxonomy ID of the source species (keys).
        target (int): NCBI Taxonomy ID of the target species (values).
        only_swissprot (bool): Use only SwissProt IDs of the source
            organism. If False, the IDs returned by
            `uniprot_db.all_uniprots` with `swissprot = 'NO'` are
            included too (presumably TrEMBL; verify against
            `uniprot_db`).

    Returns
        Dict of lists: keys are UniProt IDs of the source organism,
        values are sorted lists of unique UniProt IDs of the target
        organism. Every source ID is present as a key, with an empty
        list if no orthologue has been found.
    """

    result = {}

    hge = homologene_dict(source, target, 'entrez')
    hgr = homologene_dict(source, target, 'refseq')

    all_source = set(
        uniprot_db.all_uniprots(
            organism = source,
            swissprot = 'YES',
        )
    )

    if not only_swissprot:

        all_source.update(
            uniprot_db.all_uniprots(
                organism = source,
                swissprot = 'NO',
            )
        )

    for u in all_source:

        source_e = mapping.map_name(u, 'uniprot', 'entrez', source)
        source_r = mapping.map_name(u, 'uniprot', 'refseqp', source)

        # orthologues by HomoloGene, still as Entrez and RefSeq IDs
        target_e = set()
        target_r = set()

        for e in source_e:

            target_e.update(hge.get(e, ()))

        for r in source_r:

            target_r.update(hgr.get(r, ()))

        # translation of the orthologues back to UniProt
        target_u = set()

        for e in target_e:

            target_u.update(
                mapping.map_name(e, 'entrez', 'uniprot', target)
            )

        for r in target_r:

            # each RefSeq ID itself has to be translated here, not the
            # variable left over from the Entrez loops
            target_u.update(
                mapping.map_name(r, 'refseqp', 'uniprot', target)
            )

        # uniprot-to-uniprot mapping, presumably to normalize the IDs
        # (verify against `mapping.map_name`); collected into a set
        # because several IDs may map to the same one
        target_u = set(
            itertools.chain.from_iterable(
                mapping.map_name(tu, 'uniprot', 'uniprot', target)
                for tu in target_u
            )
        )

        result[u] = sorted(target_u)

    return result