#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

from typing import List, Dict, Union, Literal

import collections
from lxml import etree
import gzip
import shutil
import math

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.session as session
import pypath.inputs.common as inputs_common

# Module-level logger; `_log` is the shorthand used by the functions below.
_logger = session.Logger(name='inputs.interpro')
_log = _logger._log

def _interpro_child_attrs(entry, list_tag, attr):
    """
    Collects one attribute from every child of a list-like sub-element.

    Returns an empty string (not an empty list) when the sub-element is
    absent; this placeholder is kept for backward compatibility.
    """

    container = entry.find(list_tag)

    if container is None:
        return ''

    return [child.attrib[attr] for child in container]


def interpro_entries() -> List[tuple]:
    """
    Downloads detailed InterPro entry information.

    Returns
        A list of named tuples, each representing information about
        one InterPro entry. The `publications`, `parent_list` and
        `child_list` fields are lists of identifiers, or an empty string
        if the entry has no such list. The `member_list` field is a dict
        of member database names to lists of keys; it is empty if the
        entry has no member list.
    """

    InterproEntry = collections.namedtuple(
        'InterproEntry',
        (
            'interpro_id',
            'protein_count',
            'name',
            'type',
            'publications',
            'parent_list',
            'child_list',
            'member_list',
        ),
    )

    result = []

    url = urls.urls['interpro']['entries']
    path = curl.Curl(url, silent=False, large=False).fileobj.name

    # Stream the XML straight out of the gzip archive: no decompressed
    # copy is left on disk, and the downloaded file can not be overwritten
    # if its name happens not to contain '.gz'.
    with gzip.open(path, 'rb') as xml_in:

        parser = etree.iterparse(xml_in, events=('end',), tag='interpro')

        for _, elem in parser:

            pubs = _interpro_child_attrs(elem, 'pub_list', 'id')
            parent_ids = _interpro_child_attrs(elem, 'parent_list', 'ipr_ref')
            child_ids = _interpro_child_attrs(elem, 'child_list', 'ipr_ref')

            member_ids = {}
            members = elem.find('member_list')

            # An entry without a member list yields an empty dict instead
            # of raising TypeError on iterating `None`.
            if members is not None:

                for member in members:

                    member_ids.setdefault(
                        member.attrib['db'],
                        [],
                    ).append(member.attrib['dbkey'])

            result.append(
                InterproEntry(
                    interpro_id=elem.attrib['id'],
                    protein_count=elem.attrib['protein_count'],
                    name=elem.attrib['short_name'],
                    type=elem.attrib['type'],
                    publications=pubs,
                    parent_list=parent_ids,
                    child_list=child_ids,
                    member_list=member_ids,
                )
            )

            # Everything needed has been copied out; release the subtree
            # so memory does not grow with the number of entries.
            elem.clear()

    return result

def interpro_xrefs(
        db_type: Literal[
            'go',
            'structural',
            'external',
        ],
    ) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
    """
    Downloads cross-references for each InterPro entry.

    Args
        db_type: Type of the cross-reference databases.

    Returns
        A dictionary; keys are InterPro IDs.
        If 'db_type' is 'go', values are list of GO terms related to
        the InterPro entry, or `None` if the entry has no GO
        classification list.
        Otherwise values are dictionaries, where keys are database names
        and the values are list of cross references related to the
        InterPro entry (an empty dictionary if the entry has none).

    Raises
        KeyError: If `db_type` is not one of the supported values.
    """

    db_type_dict = {
        'go': 'class_list',
        'structural': 'structure_db_links',
        'external': 'external_doc_list',
    }
    db_type_name = db_type_dict[db_type]

    result = {}

    url = urls.urls['interpro']['entries']
    path = curl.Curl(url, silent=False, large=False).fileobj.name

    # Stream the XML straight out of the gzip archive: no decompressed
    # copy is left on disk, and the downloaded file can not be overwritten
    # if its name happens not to contain '.gz'.
    with gzip.open(path, 'rb') as xml_in:

        parser = etree.iterparse(xml_in, events=('end',), tag='interpro')

        for _, elem in parser:

            interpro_id = elem.attrib['id']
            # Look the sub-element up once instead of once per use.
            links = elem.find(db_type_name)

            if db_type == 'go':

                result[interpro_id] = (
                    None
                    if links is None else
                    [go.attrib['id'] for go in links]
                )

            else:

                other_db_keys = {}

                if links is not None:

                    for link in links:

                        other_db_keys.setdefault(
                            link.attrib['db'],
                            [],
                        ).append(link.attrib['dbkey'])

                result[interpro_id] = other_db_keys

            # Everything needed has been copied out; release the subtree
            # so memory does not grow with the number of entries.
            elem.clear()

    return result

def interpro_annotations(
        page_size: int = 200,
        reviewed: bool = True,
        tax_id: str | int = 9606,
    ) -> dict:
    """
    Downloads UniProtKB proteins and the InterPro entries they match.

    The results are retrieved page by page: each response carries the URL
    of the next page, and the download stops when no such URL is given.

    Args
        page_size: Number of results returned at a time.
        reviewed: Downloads only reviewed UniprotKB proteins if True,
            downloads all UniprotKB proteins otherwise.
        tax_id: Taxonomy identifier inserted into the query URL.

    Returns
        A dictionary. Keys are Uniprot IDs (upper case), values are sets
        of `InterproAnnotation` named tuples, one for each matched
        fragment.
    """

    InterproAnnotation = collections.namedtuple(
        'InterproAnnotation',
        (
            'interpro_id',
            'organism',
            'start',
            'end',
        ),
    )

    annotations = collections.defaultdict(set)

    protein_set = 'reviewed' if reviewed else 'uniprot'
    next_page_url = (
        urls.urls['interpro']['annotations'] %
        (protein_set, tax_id, page_size)
    )
    page = 0

    while next_page_url:

        download = curl.Curl(next_page_url, silent=False, large=False)
        res = inputs_common.json_read(download.result)

        totalrec = int(res['count'])

        # A negative record count is reported as an unknown page total.
        if totalrec < 0:

            total_pages = 'unknown'

        else:

            total_pages = str(math.ceil(totalrec / page_size))

        _log('Downloading page %u (total: %s).' % (page + 1, total_pages))

        for entry in res['results']:

            entry_info = entry['metadata']

            for protein in entry['protein_subset']:

                # Flatten locations -> fragments into one lazy stream.
                fragments = (
                    fragment
                    for location in protein['entry_protein_locations']
                    for fragment in location['fragments']
                )

                for fragment in fragments:

                    annotations[protein['accession'].upper()].add(
                        InterproAnnotation(
                            interpro_id=entry_info['accession'],
                            organism=protein['organism'],
                            start=int(fragment['start']),
                            end=int(fragment['end']),
                        )
                    )

        next_page_url = res.get('next')
        page += 1

    return annotations

def interpro2go_annotations() -> dict[str, set[tuple]]:
    """
    Downloads GO term annotations for InterPro entries.

    Each data line is expected to contain an `InterPro:<id>` token and a
    `> GO:<term name> ; <term id>` part. Lines starting with `!` are
    treated as comments. Blank lines are skipped silently; lines that do
    not follow the expected layout are logged and skipped.

    Returns
        Dict of InterPro entries as keys and sets of GO terms as values.
        Each GO term is an `Interpro2GOAnnotation` named tuple with the
        fields `go_term_id` and `go_term_name`.
    """

    url = urls.urls['interpro']['interpro2go']
    c = curl.Curl(url, large=True, silent=False)

    Interpro2GOAnnotation = collections.namedtuple(
        'Interpro2GOAnnotation',
        (
            'go_term_id',
            'go_term_name',
        ),
    )

    annotations = collections.defaultdict(set)

    for raw_line in c.result:

        if raw_line.startswith('!'):

            continue

        line = raw_line.strip()

        if not line:

            continue

        interpro_parts = line.split('InterPro:')
        go_parts = line.split('> GO:')
        # The GO part holds the term name first, then the term ID.
        go_fields = go_parts[1].split(' ; ') if len(go_parts) > 1 else []

        if len(interpro_parts) < 2 or len(go_fields) < 2:

            # Previously such a line aborted the whole download with an
            # IndexError; report it and carry on with the remaining lines.
            _log('Skipping malformed interpro2go line: %s' % line)
            continue

        interpro_id = interpro_parts[1].split(' ')[0]

        annotations[interpro_id].add(
            Interpro2GOAnnotation(
                go_term_id=go_fields[1],
                go_term_name=go_fields[0],
            )
        )

    return annotations