#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import collections
import itertools

import pypath.utils.mapping as mapping
import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.inputs.uniprot_db as uniprot_db
import pypath.share.session as session

# Module-level logger; `_log` is the shorthand used by the functions below.
_logger = session.Logger(name='homologene_input')
_log = _logger._log
def get_homologene():
    """
    Downloads the latest release of the NCBI HomoloGene database.

    The address is read from the `url_rescued` entry of the `homologene`
    record in the URL registry.

    Returns
        The `result` attribute of the download object (described as a
        file pointer by the original documentation).
    """

    download_options = {
        'url': urls.urls['homologene']['url_rescued'],
        'silent': False,
        'large': True,
        'timeout': 1800,
        'ignore_content_length': True,
    }

    download = curl.Curl(**download_options)

    return download.result
def homologene_dict(source, target, id_type):
    """
    Returns orthology translation table as dict, obtained
    from NCBI HomoloGene data.

    Every identifier of the source organism is paired with every
    identifier of the target organism that belongs to the same homology
    group, so groups containing several genes of one species (paralogs)
    contribute all of their source-target combinations.

    The rows of one homology group are expected to be on consecutive
    lines of the input.

    Args
        source (int): NCBI Taxonomy ID of the source species (keys).
        target (int): NCBI Taxonomy ID of the target species (values).
        id_type (str): ID type to be used in the dict. Possible values
            (case insensitive): 'RefSeq', 'RefSeqP', 'Entrez', 'GI',
            'GeneSymbol'.

    Returns
        Dict of sets: keys are IDs of the source organism, values
        are sets of IDs of the target organism.

    Raises
        KeyError: if `id_type` is not one of the supported ID types
            (the problem is logged before the exception is re-raised).
    """

    # zero based index of the column carrying each ID type
    id_columns = {
        'refseq': 5,
        'refseqp': 5,
        'genesymbol': 3,
        'gi': 4,
        'entrez': 2,
    }

    try:

        id_col = id_columns[id_type.lower()]

    except KeyError:

        _log(
            'Unknown ID type: `%s`. Please use RefSeq, '
            'Entrez, GI or GeneSymbol.' % id_type
        )
        raise

    result = collections.defaultdict(set)
    current_group = None
    # identifiers seen so far within the homology group being read
    group_sources = set()
    group_targets = set()

    for line in get_homologene():

        fields = line.strip().split('\t')

        # blank or truncated line: neither group nor taxon is available
        if len(fields) < 2:

            continue

        this_group = fields[0].strip()

        if this_group != current_group:

            current_group = this_group
            group_sources = set()
            group_targets = set()

        this_taxon = int(fields[1].strip())

        if this_taxon != source and this_taxon != target:

            continue

        # stripping the line drops trailing empty columns, hence a short
        # row means that the requested ID is missing for this gene
        this_id = fields[id_col] if len(fields) > id_col else ''

        if not this_id:

            continue

        if this_taxon == source:

            group_sources.add(this_id)

            # pair the new source with all targets already seen; the
            # guard avoids creating keys without any ortholog
            if group_targets:

                result[this_id].update(group_targets)

        else:

            group_targets.add(this_id)

            # pair the new target with all sources already seen
            for source_id in group_sources:

                result[source_id].add(this_id)

    return dict(result)
def homologene_uniprot_dict(source, target, only_swissprot=True):
    """
    Returns orthology translation table as dict from UniProt to Uniprot,
    obtained from NCBI HomoloGene data. Uses RefSeq and Entrez IDs for
    translation.

    Args
        source (int): NCBI Taxonomy ID of the source species (keys).
        target (int): NCBI Taxonomy ID of the target species (values).
        only_swissprot (bool): Use only SwissProt IDs.

    Returns
        Dict of lists: keys are UniProt IDs of the source organism, values
        are sorted lists of unique UniProt IDs of the target organism.
        Every source UniProt ID is present as a key; the list is empty
        if no ortholog has been found.
    """

    hge = homologene_dict(source, target, 'entrez')
    hgr = homologene_dict(source, target, 'refseq')

    all_source = set(
        uniprot_db.all_uniprots(
            organism=source,
            swissprot='YES',
        )
    )

    if not only_swissprot:

        all_source.update(
            uniprot_db.all_uniprots(
                organism=source,
                swissprot='NO',
            )
        )

    result = {}

    for u in all_source:

        source_e = mapping.map_name(u, 'uniprot', 'entrez', source)
        source_r = mapping.map_name(u, 'uniprot', 'refseqp', source)

        # orthologous Entrez and RefSeq IDs in the target organism
        target_e = set()
        target_r = set()

        for e in source_e:

            target_e.update(hge.get(e, ()))

        for r in source_r:

            target_r.update(hgr.get(r, ()))

        # translate the orthologs to UniProt IDs of the target organism
        target_u = set()

        for e in target_e:

            target_u.update(
                mapping.map_name(e, 'entrez', 'uniprot', target)
            )

        for r in target_r:

            # each RefSeq ID itself has to be translated here, not a
            # leftover Entrez ID from one of the loops above
            target_u.update(
                mapping.map_name(r, 'refseqp', 'uniprot', target)
            )

        # uniprot-to-uniprot translation of the collected IDs; gathered
        # in a set, so no ID is listed more than once
        target_final = set()

        for tu in target_u:

            target_final.update(
                mapping.map_name(tu, 'uniprot', 'uniprot', target)
            )

        result[u] = sorted(target_final)

    return result