#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#from__future__importannotationsfromfuture.utilsimportiteritemsimporttimeimportdatetimeimportitertoolsimporttimeloopimportpypath.share.commonascommonimportpypath.share.sessionassessionimportpypath.share.settingsassettingsimportpypath_common._constantsas_constimportpypath.inputs.uniprotasuniprot_inputimportpypath.inputs.ensemblasensembl_input_logger=session.Logger(name='taxonomy')_log=_logger._logdb={}_cleanup_period=settings.get('mapper_cleanup_interval')_lifetime=300_last_used={}NOT_ORGANISM_SPECIFIC=_const.NOT_ORGANISM_SPECIFIC# XXX: Shouldn't we keep all functions and variables separated# (together among them)?taxids={9606:'human',10090:'mouse',10116:'rat',9031:'chicken',9913:'cow',9986:'rabbit',9940:'sheep',10141:'guinea pig',10036:'hamster',7227:'fruit fly',9615:'dog',9823:'pig',8355:'frog',9091:'quail',9796:'horse',9925:'goat',89462:'water buffalo',9598:'monkey',9103:'turkey',9685:'cat',7604:'starfish',7609:'spiny starfish',1213717:'torpedo',9669:'ferret',8839:'duck',9593:'gorilla',7460:'honeybee',8407:'european common frog',9544:'rhesus macaque',}taxids2=dict((t.taxon_id,t.common_name.lower())fortinensembl_input.ensembl_organisms())taxa=common.swap_dict_simple(taxids)taxa2=common.swap_dict_simple(taxids2)taxa_synonyms={'bovine':'cow','western gorilla':'gorilla',}phosphoelm_taxids={9606:'Homo sapiens',10090:'Mus musculus',9913:'Bos taurus',9986:'Oryctolagus cuniculus',9615:'Canis familiaris',10029:'Cricetulus griseus',9267:'Didelphis virginiana',9031:'Gallus gallus',10036:'Mesocricetus auratus',9940:'Ovis aries',10116:'Rattus 
norvegicus',9823:'Sus scrofa',8355:'Xenopus laevis',10141:'Cavia porcellus',9796:'Equus caballus',7227:'Drosophila melanogaster',487:'Neisseria meningitidis',562:'Escherichia coli',5207:'Cryptococcus neoformans',470:'Acinetobacter baumannii',1280:'Staphylococcus aureus',4932:'Saccharomyces cerevisiae',34:'Myxococcus xanthus',1392:'Bacillus anthracis',210:'Helicobacter pylori',6239:'Caenorhabditis elegans',}phosphoelm_taxids.update([(t.taxon_id,t.scientific_name)fortinensembl_input.ensembl_organisms()])dbptm_taxids={9606:'HUMAN',10090:'MOUSE',7227:'DROME',10116:'RAT',559292:'YEAST',284812:'SCHPO',4081:'SOLLC',3702:'ARATH',9940:'SHEEP',9913:'BOVIN',9925:'CAPHI',44689:'DICDI',4577:'MAIZE',9823:'PIG',9615:'CANLF',6239:'CAEEL',8455:'XENLA',83333:'ECOLI',1891767:'SV40',}mirbase_taxids={9606:'hsa',10090:'mmu',10116:'rno',7227:'dme',}ensembl_taxids=dict((t.taxon_id,t.ensembl_name)fortinensembl_input.ensembl_organisms())nonstandard_taxids={'drosophila':7227,'c.elegans':6239,'xenopus':8355,'Synechocystis_sp.':1142,}
def shorten_latin_name(name: str, dot: bool = True) -> str | None:
    """
    For a complete latin name, returns its shortened version.

    In short latin names the genus name is marked only by its initial,
    e.g. `Homo sapiens` becomes `H. sapiens`, or `H sapiens` if `dot`
    is False. The words after the genus are kept as they are, separated
    by single spaces.

    Args:
        name: Full latin name, its words separated by whitespace.
        dot: Put a dot after the initial of the genus.

    Returns:
        The shortened name; None if `name` is empty or consists only of
        whitespace.
    """

    words = name.split() if name else []

    if not words:
        # nothing to shorten: empty, None or whitespace-only input
        return None

    genus, *epithets = words
    initial = f'{genus[0].upper()}{"." if dot else ""}'

    # the genus initial and each further word are separated by one space
    return ' '.join([initial, *epithets])
def short_latin_names(long_names: dict[str, int]) -> dict[str, int]:
    """
    Builds a lookup table of shortened latin names.

    Each full name in `long_names` is shortened by `shorten_latin_name`
    twice: first with, then without a dot after the genus initial. Both
    variants point to the value of the full name.

    Args:
        long_names: Full latin names as keys, with arbitrary values
            (annotated as integers).

    Returns:
        A new dict with the shortened names as keys.
    """

    shortened = {}

    for full_name, value in long_names.items():

        for with_dot in (True, False):

            shortened[shorten_latin_name(full_name, dot=with_dot)] = value

    return shortened
def ensure_common_name(
        taxon_id: str | int,
        lower: bool = False,
    ) -> str | None:
    """
    Common English name of an organism.

    The names in the module level `taxids` table have priority; for
    anything not found there the name is looked up by `_ensure_name`.

    Args:
        taxon_id: Organism name or NCBI Taxonomy ID.
        lower: Return lowercase name. Default is capitalized.

    Returns:
        The common name, or None if no name could be found for the
        organism.
    """

    common_name = (
        # priority for these common names
        taxids.get(taxon_id, None) or
        _ensure_name(taxon_id, 'common')
    )

    if not common_name:
        # Unknown organism: without this guard the case conversion below
        # would raise AttributeError on None.
        return None

    return common_name.lower() if lower else common_name.capitalize()
def _ensure_name(taxon_id, name_type):
    """
    Looks up one kind of name for an organism.

    Args:
        taxon_id: Organism name or NCBI Taxonomy ID, in any format
            accepted by `ensure_ncbi_tax_id`.
        name_type: Selects the table: the one called
            `ncbi_to_<name_type>` is requested from `get_db`.

    Returns:
        The name found in the table; None if the organism is not in the
        table, in which case a message is written to the log.
    """

    resolved_id = ensure_ncbi_tax_id(taxon_id)
    names_by_id = get_db('ncbi_to_%s' % name_type)

    if resolved_id not in names_by_id:

        _log(
            'Could not find %s taxon name for `%s`.' % (
                name_type,
                str(taxon_id),
            )
        )

        return None

    return names_by_id[resolved_id]
def ensure_ncbi_tax_id(taxon_id):
    """
    For taxon names of various formats returns NCBI Taxonomy ID if possible.

    Handles English names, scientific names and other common language
    synonyms and database specific codenames.

    Args:
        taxon_id: An integer is returned unchanged. A string may consist
            of digits, may be an organism name or code, or a name
            followed by another one in parentheses; in the last case the
            part before the parenthesis is tried first, then the part
            inside.

    Returns:
        The NCBI Taxonomy ID; a false value if none of the lookups
        succeeded, in which case a message is written to the log.
    """

    # integers are taken as taxonomy IDs without any check
    if isinstance(taxon_id, int):

        return taxon_id

    if hasattr(taxon_id, 'strip'):

        taxon_id = taxon_id.strip()

    if common.is_str(taxon_id) and '(' in taxon_id:

        outside, inside = taxon_id.split('(', maxsplit=1)
        resolved = ensure_ncbi_tax_id(outside)

        if not resolved:

            # fall back to the text between the parentheses
            resolved = ensure_ncbi_tax_id(inside.split(')', maxsplit=1)[0])

    elif hasattr(taxon_id, 'isdigit') and taxon_id.isdigit():

        resolved = int(taxon_id)

    else:

        # the first lookup giving a true value wins
        resolved = (
            taxid_from_dbptm_taxon_name(taxon_id) or
            taxid_from_nonstandard(taxon_id) or
            taxid_from_common_name(taxon_id) or
            taxid_from_latin_name(taxon_id) or
            taxid_from_ensembl_name(taxon_id)
        )

    if not resolved:

        _log('Could not map to NCBI Taxonomy ID: `%s`.' % str(taxon_id))

    return resolved
def uniprot_taxid(uniprot):
    """
    For a UniProt ID returns its NCBI Taxonomy ID.

    The ID is looked up in the `swissprot` table provided by `get_db`.

    Args:
        uniprot: A UniProt ID.

    Returns:
        The value stored for the ID in the table; None if the ID is not
        in the table.
    """

    taxid_by_uniprot = get_db('swissprot')

    return taxid_by_uniprot[uniprot] if uniprot in taxid_by_uniprot else None
# Reverse lookup tables, built from the ID -> name tables by exchanging
# keys and values (presumably name or code -> NCBI Taxonomy ID; see
# `common.swap_dict_simple`).
dbptm_to_ncbi_tax_id = common.swap_dict_simple(dbptm_taxids)
latin_name_to_ncbi_tax_id = common.swap_dict_simple(phosphoelm_taxids)
short_latin_name_to_ncbi_tax_id = short_latin_names(
    latin_name_to_ncbi_tax_id
)
ensembl_name_to_ncbi_tax_id = common.swap_dict_simple(ensembl_taxids)


# Periodic job removing the tables that have not been used recently.
_cleanup_timeloop = timeloop.Timeloop()
# 9999 is higher than any of the standard logging levels
_cleanup_timeloop.logger.setLevel(9999)


@_cleanup_timeloop.job(
    interval=datetime.timedelta(seconds=_cleanup_period)
)
def _cleanup():
    """
    Removes from `db` every table that has been unused for more than
    `_lifetime` seconds, according to the timestamps in `_last_used`.
    """

    # iterate over a snapshot of the keys, as `_remove` alters `db`
    for name in list(db):

        idle = time.time() - _last_used[name]

        if idle > _lifetime:

            _remove(name)


_cleanup_timeloop.start(block=False)


def _remove(key):
    """
    Deletes the table `key` from `db`, writing a message to the log,
    and deletes its timestamp from `_last_used`. Keys missing from
    either of the dicts are silently ignored.
    """

    if key in db:

        _logger._log('Removing taxonomy data `%s`.' % key)
        del db[key]

    _last_used.pop(key, None)