#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#from__future__importannotationsfromfuture.utilsimportiteritemsfromtypingimportIterableimportreimportjsonimportcollectionsimportitertoolsimportfunctoolsimporturllib.parseimportpandasaspdimportpypath.resources.urlsasurlsimportpypath.share.curlascurlimportpypath.share.settingsassettingsimportpypath.share.sessionassession_modimportpypath.share.commonascommonimportpypath_common._constantsas_constimportpypath.utils.taxonomyastaxonomyfrompypath.inputs.uniprot_idmappingimportidtypesasidmapping_idtypes_logger=session_mod.Logger(name='uniprot_input')_redatasheet=re.compile(r'([A-Z\s]{2})\s*([^\n\r]+)[\n\r]+')# regex for matching UniProt AC format# from https://www.uniprot.org/help/accession_numbersreac=re.compile(r'[OPQ][0-9][A-Z0-9]{3}[0-9]|'r'[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}')_rename=re.compile(r'Name=([\w\(\)-]+)\W')_retaxid=re.compile(r'=(\d+)[^\d]')def_all_uniprots(organism=9606,swissprot=None):swissprot=_swissprot_param(swissprot)rev=''ifswissprotisNoneelse' AND reviewed: %s'%swissproturl=urls.urls['uniprot_basic']['url']get={'query':'organism_id:%s%s'%(str(organism),rev),'format':'tsv','fields':'accession',}iforganism=='*':get['query']=rev.strip(' AND ')c=curl.Curl(url,get=get,silent=False,slow=True)data=c.resultreturn{l.strip()forlindata.split('\n')[1:]ifl.strip()}def_swissprot_param(swissprot):return('true'ifswissprotin{'true','True','yes','YES',True}else'false'ifswissprotin{'false','False','no','NO',False}elseNone)
def valid_uniprot(name):
    """
    Tells whether ``name`` looks like a UniProt accession number.

    The accession pattern is applied with ``match``, i.e. it is anchored
    only at the beginning of ``name``.

    Returns:
        ``True`` if the pattern matches, ``False`` otherwise.
    """

    return reac.match(name) is not None
def protein_datasheet(identifier):
    """
    Retrieves the datasheet of a UniProt entry.

    If the current datasheet comes back empty, the lookup falls back to
    ``uniprot_history_recent_datasheet``.

    Args:
        identifier: A UniProt ID; surrounding whitespace is stripped
            before it is inserted into the datasheet URL.

    Returns:
        The parsed datasheet as returned by ``_protein_datasheet`` or by
        the history based fallback.
    """

    datasheet_url = (
        urls.urls['uniprot_basic']['datasheet'] % identifier.strip()
    )
    records = _protein_datasheet(datasheet_url)

    if records:

        return records

    _logger._log(
        'UniProt ID `%s` returns empty response, it might be and an old '
        'ID which has been deleted from the database. Attempting to '
        'find its history and retrieve either an archived version or '
        'the find the new ID which replaced this one.' % identifier
    )

    return uniprot_history_recent_datasheet(identifier)
def deleted_uniprot_genesymbol(identifier):
    """
    Retrieves the archived datasheet for a deleted UniProt ID and returns
    the Gene Symbol and the NCBI Taxonomy ID from the datasheet.

    Args:
        identifier: A UniProt ID.

    Returns:
        A tuple of the gene symbol and the NCBI Taxonomy ID (int); either
        of them is ``None`` if it could not be found in the datasheet.
    """

    datasheet = uniprot_history_recent_datasheet(identifier)
    genesymbol = None
    ncbi_tax_id = None

    for tag, line in datasheet:

        if tag == 'GN':

            m = _rename.search(line.strip())

            if m:

                genesymbol = m.group(1)

        elif tag == 'OX':

            m = _retaxid.search(line)

            # an OX line without a parsable taxon ID used to raise
            # AttributeError; now the taxon simply remains None
            if m:

                ncbi_tax_id = int(m.group(1))

            # scanning stops at the first OX line
            break

    return genesymbol, ncbi_tax_id
def _protein_datasheet(url):
    """
    Downloads and parses a UniProt datasheet.

    The download is attempted up to three times. The first attempt may be
    served from the cache; after a failed attempt the cache is bypassed.
    An attempt counts as failed if the response is empty or starts with
    ``<!DOCTYPE`` (an HTML page instead of a datasheet).

    Args:
        url: URL of the datasheet.

    Returns:
        A list of tuples as produced by ``_redatasheet.findall``: the two
        character line tag and the rest of the line. An empty list if
        none of the attempts delivered a datasheet.
    """

    cache = True

    for _attempt in range(3):

        c = curl.Curl(
            url,
            silent = True,
            large = False,
            cache = cache,
            connect_timeout = (
                settings.get('uniprot_datasheet_connect_timeout')
            ),
            timeout = settings.get('uniprot_datasheet_timeout'),
        )

        if c.result and not c.result.startswith('<!DOCTYPE'):

            return _redatasheet.findall(c.result)

        # the cached copy (if any) is unusable, force a fresh download
        cache = False

    # Previously an HTML page left over from the last attempt was still
    # parsed as if it were a datasheet; now it is treated as a failure,
    # the same way as an empty response.
    _logger._log(
        'Could not retrieve UniProt datasheet by URL `%s`.' % url
    )

    return []
def uniprot_history_recent_datasheet(identifier):
    """
    Retrieves a datasheet for an obsolete UniProt ID.

    Looks up the most recent version of the record by
    ``uniprot_recent_version``. If that version names a replacement, the
    datasheet of the first replacing ID is retrieved; otherwise the
    archived version of the record itself.

    Args:
        identifier: A UniProt ID.

    Returns:
        A parsed datasheet (see ``_protein_datasheet``), or an empty list
        if no recent version is available.
    """

    recent_version = uniprot_recent_version(identifier)

    if not recent_version:

        return []

    if recent_version.replaced_by:

        new = recent_version.replaced_by.split(';')[0]
        url = urls.urls['uniprot_basic']['datasheet'] % new
        _logger._log(
            'UniProt ID `%s` is obsolete, has been replaced by '
            '`%s`: `%s`.' % (
                identifier,
                new,
                url,
            )
        )

        return protein_datasheet(new)

    version = int(recent_version.entry_version)
    url = '%s?version=%u' % (
        urls.urls['uniprot_basic']['datasheet'] % identifier,
        version,
    )
    _logger._log(
        'UniProt ID `%s` is obsolete, downloading archived '
        'version %u: `%s`.' % (
            identifier,
            version,
            url,
        )
    )

    # `_protein_datasheet` performs the download itself; the separate
    # request that used to be issued here was never used
    return _protein_datasheet(url)
def uniprot_history(identifier):
    """
    Retrieves the history of a record.

    Nothing is yielded if ``identifier`` does not pass ``valid_uniprot``,
    if the response is empty, or if it starts with ``<!DOCTYPE``.

    Args:
        identifier: A UniProt ID.

    Yields:
        ``UniprotRecordHistory`` records, one for each non-blank line
        after the first line of the response, built from the stripped,
        tab separated fields. According to the original documentation
        of this function the order is from most recent to the oldest.
    """

    if not valid_uniprot(identifier):

        return

    url_history = urls.urls['uniprot_basic']['history'] % identifier
    c_history = curl.Curl(
        url_history,
        silent = True,
        large = True,
    )

    if not c_history.result:

        return

    # A bare `next()` on an exhausted iterator raises StopIteration, which
    # inside a generator is converted to RuntimeError (PEP 479).
    line0 = next(c_history.result, None)

    if line0 is None or line0.startswith('<!DOCTYPE'):

        return

    for line in c_history.result:

        # a line consisting only of a line break is not a record
        if line.strip():

            yield UniprotRecordHistory(
                *(field.strip() for field in line.split('\t'))
            )
def _uniprot_deleted(swissprot = True, confirm = True):
    """
    Downloads the list of deleted UniProt accessions.

    Args:
        swissprot: Selects the ``deleted_sp`` URL if true, the
            ``deleted_tr`` URL otherwise.
        confirm: Ask for interactive confirmation on the standard input
            before loading the TrEMBL list. Not used for SwissProt.

    Returns:
        A set of accessions: from each line, the part matched by the
        UniProt accession pattern at the beginning of the stripped line.
        An empty set if the user declined the confirmation.
    """

    if not swissprot and confirm:

        resp = input(
            'Loading the list of deleted TrEMBL IDs requires '
            '>5GB memory. Do you want to proceed [y/n] '
        )

        if not resp or resp[0].lower() != 'y':

            return set()

    key = 'deleted_%s' % ('sp' if swissprot else 'tr')
    url = urls.urls['uniprot_basic'][key]
    c = curl.Curl(url, silent = False, large = True)

    # `group(0)` is the whole accession. `groups()[0]` is the only capture
    # group of the pattern: None for O/P/Q accessions and merely the last
    # four characters for the others.
    return {
        m.group(0)
        for line in c.result
        if (m := reac.match(line.strip()))
    }
def get_uniprot_sec(organism = 9606):
    """
    Downloads and processes the mapping between secondary and primary
    UniProt IDs.

    Yields pairs of secondary and primary UniProt IDs.

    Args:
        organism: NCBI Taxonomy ID of the organism. With ``None`` or
            ``_const.NOT_ORGANISM_SPECIFIC`` all pairs are yielded,
            otherwise only those where the second ID is in the set
            returned by ``uniprot_db.all_uniprots`` for the organism.

    Yields:
        Lists of two strings: the whitespace separated fields of each
        line which consists of exactly two fields.
    """

    # number of leading lines skipped in the downloaded file
    n_header_lines = 30

    _organism = organism not in (None, _const.NOT_ORGANISM_SPECIFIC)
    proteome = frozenset()

    if _organism:

        from pypath.inputs import uniprot_db

        proteome = set(uniprot_db.all_uniprots(organism = organism))

    url = urls.urls['uniprot_sec']['url']
    c = curl.Curl(url, silent = False, large = True, timeout = 2400)

    for line in itertools.islice(c.result, n_header_lines, None):

        pair = line.split()

        if len(pair) == 2 and (not _organism or pair[1] in proteome):

            yield pair
def __init__(
        self,
        *query,
        fields: str | Iterable[str] | None = None,
        **kwargs
    ):
    """
    Sets up a query for the UniProt REST API.

    Args:
        query:
            Elements of the query. Each element is either a string,
            used as it is, or a nested structure of lists, dicts and
            tuples which is translated to query syntax. The elements
            are joined by the top level operator (``AND`` unless the
            ``_op`` keyword argument says otherwise).
        fields:
            One or more UniProt return fields; the accession is always
            retrieved in addition to these.
        kwargs:
            Handled the same way as a dict passed among ``query``.

    Details:
        Ways of building the same query, `kinase AND organism_id:9606`:

            query_builder('kinase AND organism_id:9606')
            query_builder('kinase', 'organism_id:9606')
            query_builder('kinase', organism_id = 9606)

        Within lists the default operator is `OR`, within dicts `AND`:

            query_builder(organism = [9606, 10090, 10116])
            `organism_id:9606 OR organism_id:10090 OR organism_id:10116`

            query_builder({'organism_id': 9606, 'reviewed': True})
            `organism_id:9606 AND reviewed:true`

        The operator can be set by the `op` key in dicts and by an
        underscore prefixed operator element in lists:

            query_builder({'length': (500,), 'mass': (50000,), 'op': 'OR'})
            `length:[500 TO *] OR mass:[50000 TO *]`

            query_builder(lit_author = ['Huang', 'Kovac', '_AND'])
            `lit_author:Huang AND lit_author:Kovac`

        Nesting is rendered by parentheses:

            query_builder({'organism_id': [9606, 10090], 'reviewed': True})
            `(organism_id:9606 OR organism_id:10090) AND reviewed:true`

        Tuples are intervals, other values are converted to strings:

            query_builder({'length': (100, None), 'organism_id': 9606})
            `length:[100 TO *] AND organism_id:9606`

        The query fields and the search syntax are documented at
        https://www.uniprot.org/help/query-fields and
        https://www.uniprot.org/help/text-search, the return fields at
        https://www.uniprot.org/help/return_fields; see also the
        ``_FIELD_SYNONYMS`` attribute of this class.

    Methods:
        __iter__: Perform the query and iterate over the lines in the
            results, skipping the header and the empty lines, stripping
            the linebreaks and splitting by tab.

    Yields:
        A list of fields for each line.

    Attributes:
        fail_on_empty:
            If set to True, an error will be raised if the UniProt API
            returns empty response. By default no error is raised.
        name_process:
            If set to True, a different processing will be applied on
            the results. This is appropriate especially for identifier
            type fields.
    """

    self.fields = common.to_list(fields)
    self._args = (query, kwargs)
    # builds the `query` attribute from the arguments stored above
    self._process_main()

    # An empty response is tolerated by default: Curl returns None for
    # an empty file, and with UniProt, especially for under-researched
    # taxons, certain queries legitimately have no result.
    self.fail_on_empty = False
    self.name_process = False
@classmethod
def _value(
        cls,
        val: str | int | bool | tuple,
        field: str | None = None,
    ) -> str:
    """
    Renders a single value, optionally prefixed by its field name.

    Args:
        val: The value. A tuple is rendered as an interval
            ``[a TO b]`` where missing or ``None`` boundaries become
            ``*``; booleans are lowercased; ``None`` becomes ``*``,
            except for the ``reviewed`` field where it results an empty
            string (no restriction).
        field: Name of the query field, translated by ``_SYNONYMS``.
            Values of ``organism_id`` are passed through
            ``taxonomy.ensure_ncbi_tax_id``, falling back to the value
            as it is.

    Returns:
        ``field:value`` if a field applies, otherwise only the value.
    """

    field = cls._SYNONYMS.get(field, field)

    if field == 'organism_id':

        result = str(taxonomy.ensure_ncbi_tax_id(val) or val)

    elif isinstance(val, tuple):

        # padded by two wildcards, so also an empty tuple gives a
        # complete interval instead of a string formatting error
        val = (tuple(map(cls._value, val)) + ('*', '*'))[:2]
        result = '[%s TO %s]' % val

    elif val is None:

        if field == 'reviewed':

            result = ''
            field = None

        else:

            result = '*'

    elif isinstance(val, bool):

        result = str(val).lower()

    else:

        result = str(val)

    if field:

        result = f'{field}:{result}'

    return result


def _process_main(self):
    """
    Builds the ``query`` attribute from the constructor arguments.

    The positional elements and, as the last element, the keyword
    arguments are processed one by one and joined by the top level
    operator: the ``_op`` keyword argument (removed from the stored
    keyword arguments here), ``AND`` by default. No operator is
    inserted where the preceding part matches ``_OPEND`` or the
    following part matches ``_OPSTART``.
    """

    query, kwargs = self._args
    op = kwargs.pop('_op', 'AND')
    query = list(query)
    query.append(kwargs)

    result = []

    for q in query:

        q = self._process(q).strip()

        if (
            result and
            q and
            not self._OPEND.match(result[-1]) and
            not self._OPSTART.match(q)
        ):

            result.append(op)

        if q:

            result.append(q)

    self.query = ' '.join(result)


@classmethod
def _process(
        cls,
        query: str | list | dict,
        field: str | None = None,
    ) -> str:
    """
    Dispatches a query element to its processing method.

    The method name is looked up in ``_PROCESS`` by the type name of
    ``query``; ``_value`` is used for types not listed there.
    """

    method = cls._PROCESS.get(type(query).__name__, '_value')

    return getattr(cls, method)(query, field)


@classmethod
def _process_list(cls, query: list, field: str | None = None) -> str:
    """
    Renders a list of alternatives, in parentheses.

    The elements are joined by ``OR``, unless one of the markers in
    ``_OP`` is among them: the marker without its first character is
    used as the operator then (the last one found, if more are present).
    """

    # work on a copy: removing the operator marker must not alter
    # the list of the caller
    query = list(query)
    op = '_OR'

    for _op in cls._OP:

        if _op in query:

            op = query.pop(query.index(_op))

    op = f' {op[1:]} '

    # elements rendered to empty string are left out, just like in dicts;
    # otherwise they would leave a dangling operator in the query
    parts = [it for i in query if (it := cls._process(i, field))]

    return cls._par(op.join(parts))


@classmethod
def _process_dict(cls, query: dict, field: str | None = None) -> str:
    """
    Renders a dict of field names and values.

    The items are joined by the operator under the ``op`` key, ``AND``
    by default; items rendered to empty string are left out. The result
    is enclosed in parentheses if the dict has more than one item apart
    from ``op``.
    """

    query = query.copy()
    op = ' %s ' % query.pop('op', ' AND ').strip()
    result = op.join(
        it
        for k, v in query.items()
        if (it := cls._process(v, k))
    )

    return cls._par(result) if len(query) > 1 else result


@staticmethod
def _par(value: str) -> str:
    """
    Encloses a non-empty string in parentheses.
    """

    return f'({value})' if value else ''


@property
def _get(self) -> dict[str, str]:
    """
    The GET parameters of the request.

    The requested fields always start with ``accession``, followed by
    the fields of this query translated by ``_FIELD_SYNONYMS``.
    """

    field_qs = ','.join(
        ['accession'] +
        [self._FIELD_SYNONYMS.get(f, f) for f in self.fields]
    )

    return {
        'query': self.query,
        'format': 'tsv',
        'fields': field_qs,
        'compressed': 'true',
    }


@property
def _baseurl(self) -> str:
    """
    The URL of the UniProt endpoint, without parameters.
    """

    return urls.urls['uniprot_basic']['url']


@property
def url(self) -> str:
    """
    UniProt REST API URL (urlencoded).

    Returns:
        A valid query suitable for the UniProt REST API.
    """

    return f'{self._baseurl}?{urllib.parse.urlencode(self._get)}'


@property
def url_plain(self) -> str:
    """
    UniProt REST API URL (plain).

    Returns:
        The same as ``url``, with the URL encoding reverted by
        ``urllib.parse.unquote_plus``.
    """

    return urllib.parse.unquote_plus(self.url)
def perform(self) -> list[str] | dict[str, str] | dict[str, dict[str, str]]:
    """
    Perform the query and preprocess the result.

    Returns:
        - A list of UniProt IDs if no fields were provided.
        - A dict of UniProt IDs and corresponding field values if
          exactly one field was provided.
        - A dict with field names as top level keys and dicts of the
          kind described in the previous point as values.

        If the query has no result, the same structures are returned
        without contents.
    """

    rows = list(self)

    if rows:

        # transpose: the first column is the ID, the rest are the fields
        _id, *variables = zip(*rows)

    else:

        # Unpacking the transpose of nothing raises ValueError; an empty
        # result is a legitimate outcome, hence empty columns are created.
        _id, variables = (), [() for _ in self.fields]

    _id = list(map(common.sfirst, _id))

    if variables:

        result = {
            f: {i: v for i, v in zip(_id, vs) if i}
            for f, vs in zip(self.fields, variables)
        }

        result = (
            common.first(result.values())
                if len(result) == 1 else
            result
        )

    else:

        result = list(_id)

    return result
def query_builder(*query, **kwargs) -> str:
    """
    Compiles a query string for the UniProt web site and REST API.

    All arguments are handed over to ``UniprotQuery``; this function
    only returns the query string it has built.

    Args:
        query:
            Elements of the query: strings, used as they are, or nested
            structures of lists, dicts and tuples, translated to query
            syntax.
        kwargs:
            Handled the same way as a dict passed among ``query``.

    Details:
        Ways of building the same query, `kinase AND organism_id:9606`:

            query_builder('kinase AND organism_id:9606')
            query_builder('kinase', 'organism_id:9606')
            query_builder('kinase', organism_id = 9606)

        Within lists the default operator is `OR`, within dicts `AND`:

            query_builder(organism = [9606, 10090, 10116])
            `organism_id:9606 OR organism_id:10090 OR organism_id:10116`

            query_builder({'organism_id': 9606, 'reviewed': True})
            `organism_id:9606 AND reviewed:true`

        The operator can be set by the `op` key in dicts and by an
        underscore prefixed operator element in lists:

            query_builder({'length': (500,), 'mass': (50000,), 'op': 'OR'})
            `length:[500 TO *] OR mass:[50000 TO *]`

            query_builder(lit_author = ['Huang', 'Kovac', '_AND'])
            `lit_author:Huang AND lit_author:Kovac`

        Nesting is rendered by parentheses:

            query_builder({'organism_id': [9606, 10090], 'reviewed': True})
            `(organism_id:9606 OR organism_id:10090) AND reviewed:true`

        Tuples are intervals, other values are converted to strings:

            query_builder({'length': (100, None), 'organism_id': 9606})
            `length:[100 TO *] AND organism_id:9606`

        The query fields and the search syntax are documented at
        https://www.uniprot.org/help/query-fields and
        https://www.uniprot.org/help/text-search.

    Returns:
        A query that can be inserted into the UniProt search field.
    """

    builder = UniprotQuery(*query, **kwargs)

    return builder.query
def uniprot_data(
        *query,
        fields: str | Iterable[str] | None = None,
        organism: str | int | None = 9606,
        reviewed: bool | None = True,
        **kwargs
    ) -> list[str] | dict[str, str] | dict[str, dict[str, str]]:
    """
    Basic client for the UniProt REST API.

    Retrieves one or more fields from UniProt, by default for all
    reviewed (SwissProt) proteins of one organism. ``organism`` and
    ``reviewed`` are added to the keyword arguments of the query unless
    they are ``None``; everything is then passed to ``uniprot_query``.

    Args:
        query:
            Elements of the query: strings, used as they are, or nested
            structures of lists, dicts and tuples, translated to query
            syntax.
        fields:
            One or more UniProt field name. See details.
        organism:
            Organism name or identifier, e.g. "human", or
            "Homo sapiens", or 9606.
        reviewed:
            Restrict the query to SwissProt (True), to TrEMBL (False),
            or cover both (None).
        kwargs:
            Handled the same way as a dict passed among ``query``.

    Details:
        Ways of building the same query, `kinase AND organism_id:9606`:

            query_builder('kinase AND organism_id:9606')
            query_builder('kinase', 'organism_id:9606')
            query_builder('kinase', organism_id = 9606)

        Within lists the default operator is `OR`, within dicts `AND`:

            query_builder(organism = [9606, 10090, 10116])
            `organism_id:9606 OR organism_id:10090 OR organism_id:10116`

            query_builder({'organism_id': 9606, 'reviewed': True})
            `organism_id:9606 AND reviewed:true`

        The operator can be set by the `op` key in dicts and by an
        underscore prefixed operator element in lists:

            query_builder({'length': (500,), 'mass': (50000,), 'op': 'OR'})
            `length:[500 TO *] OR mass:[50000 TO *]`

            query_builder(lit_author = ['Huang', 'Kovac', '_AND'])
            `lit_author:Huang AND lit_author:Kovac`

        Nesting is rendered by parentheses:

            query_builder({'organism_id': [9606, 10090], 'reviewed': True})
            `(organism_id:9606 OR organism_id:10090) AND reviewed:true`

        Tuples are intervals, other values are converted to strings:

            query_builder({'length': (100, None), 'organism_id': 9606})
            `length:[100 TO *] AND organism_id:9606`

        The query fields and the search syntax are documented at
        https://www.uniprot.org/help/query-fields and
        https://www.uniprot.org/help/text-search, the return fields at
        https://www.uniprot.org/help/return_fields; see also the
        ``_FIELD_SYNONYMS`` attribute of the UniprotQuery class.

    Returns:
        - A list of UniProt IDs if no fields were provided.
        - A dict of UniProt IDs and corresponding field values if
          exactly one field was provided.
        - A dict with field names as top level keys and dicts of the
          kind described in the previous point as values.
    """

    # explicit forwarding instead of looking the arguments up in `locals()`
    if organism is not None:

        kwargs['organism'] = organism

    if reviewed is not None:

        kwargs['reviewed'] = reviewed

    return uniprot_query(*query, fields = fields, **kwargs)
def uniprot_query(
        *query,
        fields: str | Iterable[str] | None = None,
        **kwargs
    ) -> dict[str, str] | dict[str, dict[str, str]]:
    """
    Basic client for the UniProt REST API.

    Creates a ``UniprotQuery`` from the arguments and returns the outcome
    of its ``perform`` method.

    Args:
        query:
            Elements of the query: strings, used as they are, or nested
            structures of lists, dicts and tuples, translated to query
            syntax.
        fields:
            One or more UniProt field name. See details.
        kwargs:
            Handled the same way as a dict passed among ``query``.

    Details:
        Ways of building the same query, `kinase AND organism_id:9606`:

            query_builder('kinase AND organism_id:9606')
            query_builder('kinase', 'organism_id:9606')
            query_builder('kinase', organism_id = 9606)

        Within lists the default operator is `OR`, within dicts `AND`:

            query_builder(organism = [9606, 10090, 10116])
            `organism_id:9606 OR organism_id:10090 OR organism_id:10116`

            query_builder({'organism_id': 9606, 'reviewed': True})
            `organism_id:9606 AND reviewed:true`

        The operator can be set by the `op` key in dicts and by an
        underscore prefixed operator element in lists:

            query_builder({'length': (500,), 'mass': (50000,), 'op': 'OR'})
            `length:[500 TO *] OR mass:[50000 TO *]`

            query_builder(lit_author = ['Huang', 'Kovac', '_AND'])
            `lit_author:Huang AND lit_author:Kovac`

        Nesting is rendered by parentheses:

            query_builder({'organism_id': [9606, 10090], 'reviewed': True})
            `(organism_id:9606 OR organism_id:10090) AND reviewed:true`

        Tuples are intervals, other values are converted to strings:

            query_builder({'length': (100, None), 'organism_id': 9606})
            `length:[100 TO *] AND organism_id:9606`

        The query fields and the search syntax are documented at
        https://www.uniprot.org/help/query-fields and
        https://www.uniprot.org/help/text-search, the return fields at
        https://www.uniprot.org/help/return_fields; see also the
        ``_FIELD_SYNONYMS`` attribute of the UniprotQuery class.

    Returns:
        - A list of UniProt IDs if no fields were provided.
        - A dict of UniProt IDs and corresponding field values if
          exactly one field was provided.
        - A dict with field names as top level keys and dicts of the
          kind described in the previous point as values.
    """

    client = UniprotQuery(*query, fields = fields, **kwargs)

    return client.perform()
def uniprot_tissues(organism = 9606, reviewed = True):
    """
    Extracts tissue names and expression levels from the free text
    "tissue specificity" annotations of UniProt.

    The text of each protein is cleaned from references, isoform and
    level remarks, split into sentences at dots, then into tokens at
    conjunctions and punctuation. A token containing one of the level
    keywords sets the level for the subsequent tokens of the sentence;
    the other tokens (and the level tokens about wide or ubiquitous
    expression) are considered tissue names after removing leading
    function words and filtering by the exclusion lists.

    Args:
        organism: Passed to ``uniprot_data``.
        reviewed: Passed to ``uniprot_data``.

    Returns:
        A dict with UniProt IDs as keys and sets of ``UniprotTissue``
        named tuples as values. The ``level`` is one of `low`, `high`,
        `basal`, `none` and `undefined`.
    """

    # references in curly braces
    reref = re.compile(r'\s?\{.*\}\s?')
    # separators between the tokens of a sentence
    resep = re.compile(
        r',?(?:'
            r' in almost all |'
            r' but also in |'
            r' but also at |'
            r' within the |'
            r', in |'
            r' in |'
            r' but |'
            r', and |'
            r' and |'
            r' such as |'
            r' \(both |'
            r' as well as |'
            r' as |'
            r' or |'
            r' at the |'
            r' at |'
            r' including |'
            r' during |'
            r' especially |'
            r' to |'
            r' into |'
            r' = |'
            r' > |'
            r'; |'
            r', '
        r')(?=[^\d])'
    )
    relabel = re.compile(r'^TISSUE SPECIFICITY: ')
    repubmed = re.compile(r'\(?PubMed:?\d+\)?')
    respeci = re.compile(r'(\w+)[-\s]specific')
    # leading function words to be removed from tissue names
    # NOTE(review): there is no `|` after `[Aa] variety of `, hence it
    # forms one alternative with `[Aa] ` in the next line. Kept as it is,
    # because `tissue_exclude` contains names starting with `A `, i.e.
    # it seems to be adjusted to the current behaviour -- confirm.
    rethe = re.compile(
        r'\s?(?:'
            r'[Tt]he |'
            r'[Ii]n |'
            r'[Ss]ome|'
            r'[Ii]n the|'
            r'[Ww]ithin the|'
            r'[Ww]ithin|'
            r'[Ii]nto|'
            r'[Ww]ith only|'
            r'[Ww]ith the|'
            r'[Ww]ith an|'
            r'[Ww]ith |'
            r'[Ii]s |'
            r'[Mm]any |'
            r'[Aa] variety of '
            r'[Aa] |'
            r'[Ii]t |'
            r'[Tt]o |'
            r'[Oo]n |'
            r'[Oo]f |'
            r'[Tt]hose |'
            r'[Ff]rom |'
            r'[Aa]lso|'
            r'[Bb]y |'
            r'[Pp]articularly|'
            r'[Pp]articular|'
            r'[Pp]atients|'
            r'[Aa]n |'
            r'\'|'
            r':|'
            r'/'
        r')?(.*)'
    )
    # dangling conjunctions at the end of tissue names
    reand = re.compile(r'(?: and| of| from| or| than)$')
    replevel = re.compile(r'\(at \w+ levels?\)')
    reiso = re.compile(r'[Ii]soform \w+')
    reindef = re.compile(
        r'\w'
        r'(?:'
            r'ifferent parts of |'
            r'ariety of tissues |'
            r' variety of tissues |'
            r' number of |'
            r'everal regions of '
        r')'
    )

    # keywords and the levels they imply; the first one found in
    # a token applies, hence the order matters
    level_kw = (
        ('low', 'low'),
        ('weak', 'low'),
        ('lesser extent', 'low'),
        ('minimal level', 'low'),
        ('decrease', 'low'),
        ('moderate', 'low'),
        ('barely', 'low'),
        ('minor level', 'low'),
        ('reduced', 'low'),
        ('lesser', 'low'),
        ('down-regulated', 'low'),
        ('high', 'high'),
        ('elevated', 'high'),
        ('strong', 'high'),
        ('prominent', 'high'),
        ('greatest level', 'high'),
        ('concentrated', 'high'),
        ('predominant', 'high'),
        ('increase', 'high'),
        ('enrich', 'high'),
        ('abundant', 'high'),
        ('primarily', 'high'),
        ('induced', 'high'),
        ('up-regulated', 'high'),
        ('up regulated', 'high'),
        ('expression is restricted', 'high'),
        ('amplified', 'high'),
        ('basal l', 'basal'),
        ('not detected', 'none'),
        ('absent', 'none'),
        ('expressed', 'undefined'),
        ('detect', 'undefined'),
        ('found', 'undefined'),
        ('present', 'undefined'),
        ('expression', 'undefined'),
        ('localized', 'undefined'),
        ('produced', 'undefined'),
        ('confined', 'undefined'),
        ('transcribed', 'undefined'),
        ('xpressed', 'undefined'),
        ('synthesized', 'undefined'),
        ('secreted', 'undefined'),
        ('seen', 'undefined'),
        ('prevalent', 'undefined'),
        ('released', 'undefined'),
        ('appears', 'undefined'),
        ('varying levels', 'undefined'),
        ('various levels', 'undefined'),
        ('identified', 'undefined'),
        ('observed', 'undefined'),
        ('occurs', 'undefined'),
    )

    # keywords of broad expression and the pseudo-tissue they result
    wide_kw = (
        ('widely', 'wide'),
        ('wide tissue distribution', 'wide'),
        ('wide range of tissues', 'wide'),
        ('wide range of adult tissues', 'wide'),
        ('wide range of cells', 'wide'),
        ('wide variety of normal adult tissues', 'wide'),
        ('widespread', 'wide'),
        ('ubiquitous', 'ubiquitous'),
        ('variety of tissues', 'wide'),
        ('many tissues', 'wide'),
        ('many organs', 'wide'),
        ('various organs', 'wide'),
        ('various tissues', 'wide'),
    )

    # processed names which are not tissues
    tissue_exclude = {
        'Adult',
        'All',
        'Apparently not',
        'Areas',
        'Are likely',
        'Both',
        'By contrast',
        'Normal cells',
        'Not only',
        'A',
        '[]: Localized',
        'Early',
        'Change from a quiescent',
        'Central',
        'Beta',
        'This layer',
        'With little',
        'Preferential occurrence',
        'Stage III',
        'Take up',
        'Hardly',
        'Only seen',
        'Prevalent',
        'Inner segment',
        'Memory',
        'Many fetal',
        'Tissues',
        '0 kb',
        '9 kb',
        'A 2',
        'A 3',
        'A 5',
        'A 6',
        '1-7',
        '1b-1',
        '2 is widely',
        '8 and 4',
        'Often amplified',
        'Other',
        'Others',
        'Those',
        'Tissues examined',
        'Tissues with',
        'Tissues (e)',
        'Probably shed',
        'Reports that',
        'Primitive',
        'Prolactin',
        'Overlap',
        'A smaller 0',
        'A smaller form',
        'A smaltissues',
        'Different levels',
        'Different amounts',
        'Disappears',
        'Digestion',
        'Very similar',
        'Vivo',
        'Contrary',
        'Contrast',
        'Not',
        'Not all',
        'Has it',
        'Has little',
        'All stages',
        'Soon',
        'Specific',
        'Stage',
        'Stage I',
        'Stage II',
        'Stages II',
        'Ends',
        'A minor degree',
        'A much smaller extent',
        'Lost',
        'Varies',
        'Various',
        'Mostly restricted',
        'Mostly',
        'Most probably',
        'Much more stable',
        'Naive',
        'Neither',
        'Nor',
        'None',
    }

    exclude_startswith = (
        'Were',
        'Where',
        'Which',
        'While',
        'When',
        'There',
        'Their',
        'Then',
        'These',
        'Level',
        'This',
        'Almost',
        'If',
        'Control',
        'Be ',
        'Although',
        'Than',
        'Addition',
    )

    exclude_in = (
        'kb transcript',
        'compared',
        'soform',
        'concentration of',
    )

    UniprotTissue = collections.namedtuple(
        'UniprotTissue',
        [
            'tissue',
            'level',
        ],
    )

    data = uniprot_data(
        fields = 'tissue_specificity',
        organism = organism,
        reviewed = reviewed,
    )

    result = collections.defaultdict(set)

    for uniprot, raw in data.items():

        raw = relabel.sub('', raw)
        raw = reref.sub('', raw)
        raw = replevel.sub('', raw)
        raw = reiso.sub('', raw)
        raw = repubmed.sub('', raw)
        raw = reindef.sub('', raw)
        raw = raw.replace('adult and fetal', '')

        for phrase in raw.split('.'):

            tokens = tuple(resep.split(phrase))
            # the level is carried over to the next tokens of the sentence
            level = None

            for token in tokens:

                level_token = False
                wide_token = False
                tissue = None

                token_lower = token.lower()

                for kw, lev in level_kw:

                    if kw in token_lower:

                        level = lev
                        level_token = True
                        break

                if level_token:

                    for kw, wide in wide_kw:

                        if kw in token_lower:

                            tissue = wide
                            wide_token = True
                            break

                # tokens which only tell the level are not tissues
                if level_token and not wide_token:

                    continue

                if not wide_token:

                    specific = respeci.search(token)

                    tissue = (
                        specific.groups()[0].lower()
                            if specific else
                        token
                    )

                    # "X-specific" without level means high expression in X
                    if specific and not level:

                        level = 'high'

                if not tissue.strip():

                    continue

                if any(e in tissue for e in exclude_in):

                    continue

                # up to three leading function words are removed
                tissue = rethe.match(tissue).groups()[0]
                tissue = rethe.match(tissue).groups()[0]
                tissue = rethe.match(tissue).groups()[0]

                if tissue.endswith('+'):

                    tissue = '%s cells' % tissue

                tissue = tissue.strip(')(.,;- ')

                if '(' in tissue and ')' not in tissue:

                    tissue = '%s)' % tissue

                tissue = reand.sub('', tissue)
                tissue = common.upper0(tissue)
                # Collapse the double spaces left behind by the removals
                # above; replacing a single space by a single space, as
                # this line stood, had no effect.
                tissue = tissue.replace('  ', ' ')

                if tissue.startswith(exclude_startswith):

                    continue

                if tissue in tissue_exclude or len(tissue) < 3:

                    continue

                result[uniprot].add(
                    UniprotTissue(
                        tissue = tissue,
                        level = level or 'undefined',
                    )
                )

    return dict(result)
def uniprot_taxonomy(
        ncbi_tax_ids: bool = False,
    ) -> dict[str, set[str]] | dict[str, int]:
    """
    From UniProt IDs to organisms

    Processes the species index file of UniProt. In this file the lines
    not starting with space carry the names of a taxon, the lines
    starting with space list the entries belonging to the taxon named
    most recently.

    Args:
        ncbi_tax_ids:
            Translate the names to NCBI Taxonomy numeric identifiers.

    Returns:
        A dictionary with SwissProt IDs as keys and sets of various taxon
        names as values. If ``ncbi_tax_ids`` is true, the values are
        single NCBI Taxonomy IDs instead: the outcome of
        ``taxonomy.ensure_ncbi_tax_id`` for the first name it is able to
        translate; IDs with no translatable name are left out.
    """

    # local names distinct from the module level `reac` and `_rename`
    re_taxon_name = re.compile(r'\(?(\w[\w\s\',/\.-]+\w)\)?')
    re_entry_ac = re.compile(r'\s*\w+\s+\(([A-Z\d]+)\)\s*,')

    url = urls.urls['uniprot_basic']['speindex']
    c = curl.Curl(url, large = True, silent = False)

    result = collections.defaultdict(set)
    # defined in advance, in case an entry line precedes all taxon lines
    names = set()

    for line in c.result:

        # slicing instead of indexing: tolerates the empty string
        if line[:1] != ' ':

            names = set(re_taxon_name.findall(line))

        else:

            for ac in re_entry_ac.findall(line):

                result[ac].update(names)

    if ncbi_tax_ids:

        new_result = {}

        for ac, ac_names in result.items():

            for name in ac_names:

                nti = taxonomy.ensure_ncbi_tax_id(name)

                if nti:

                    new_result[ac] = nti
                    break

        result = new_result

    return dict(result)