#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#from__future__importannotations"""Interface to UniProt protein datasheets."""fromfuture.utilsimportiteritemsimportosimportsysimportreimportshutilimportimportlibasimpimportcollectionsimportitertoolsimportpypath.inputs.uniprotasuniprot_inputimportpypath.inputs.genecardsasgenecards_inputimportpypath.share.commonascommonimportpypath_common._constantsas_constimportpypath.share.settingsassettingsimportpypath.core.entityasentity
# NOTE(review): every definition below takes ``self``; they appear to be the
# methods of the UniProt datasheet class whose header is not visible in this
# part of the file -- confirm the enclosing scope before applying.


def reload(self):
    """
    Reloads the module this object's class was defined in and rebinds
    the instance to the freshly loaded class of the same name.
    """

    modname = self.__class__.__module__
    mod = __import__(modname, fromlist=[modname.split('.')[0]])
    imp.reload(mod)
    new = getattr(mod, self.__class__.__name__)
    setattr(self, '__class__', new)


def load(self):
    """
    Retrieves the datasheet for ``self.uniprot_id`` and stores it in
    ``self.raw``.
    """

    self.raw = uniprot_input.protein_datasheet(self.uniprot_id)


@property
def is_reviewed(self):
    """
    Tells if the first line of the datasheet contains ``Reviewed``.
    """

    return 'Reviewed' in self.raw[0][1]


@property
def id(self):
    """
    Returns the first whitespace separated field of the first line.
    """

    return self.raw[0][1].split()[0]


@property
def ac(self):
    """
    Returns the first accession from the first ``AC`` line.
    Raises ``StopIteration`` if the datasheet has no ``AC`` line.
    """

    return next(self.itertag('AC')).split(';')[0]


@property
def length(self):
    """
    Returns the length (number of residues) of the canonical sequence.
    """

    return int(self._relength.search(self.raw[0][1]).groups()[0])


@property
def organism(self):
    """
    Returns the taxonomy identifier from the first ``OX`` line as an
    integer.
    """

    return int(self._retaxid.search(next(self.itertag('OX'))).groups()[0])


@property
def full_name(self):
    """
    Returns the first group captured by ``_rerecname`` in the first
    ``DE`` line.
    """

    return self._rerecname.search(next(self.itertag('DE'))).groups()[0]


@property
def ec(self):
    """
    Returns the set of all matches of ``_reec`` in the concatenated
    ``DE`` lines.
    """

    return set(self._reec.findall(''.join(self.itertag('DE'))))


@property
def info(self):
    """
    Returns the comment sections as a dict; built on first access.
    """

    if not hasattr(self, '_info'):
        self.update_info()

    return self._info


def update_info(self):
    """
    Processes the ``CC`` lines into a dict of section titles and
    section texts, and stores it in ``self._info``.
    """

    result = collections.defaultdict(list)
    title = None

    for cc in self.itertag('CC'):

        # a line starting with dashes ends the processing of comments
        if cc.startswith('---'):
            break

        m = self._recc.match(cc)

        if m:
            # a new section starts: the rest of the line is its first part
            title, cc = m.groups()

        line = cc.strip()

        if line:
            # lines without a title of their own continue the last section
            result[title].append(line)

    self._info = {
        title: ' '.join(lines)
        for title, lines in result.items()
    }


@property
def function_genecards(self):
    """
    Returns the GeneCards summaries for the gene symbol of this protein
    as one string, each summary prefixed by the name of its resource.
    """

    summaries = genecards_input.genecards_summaries(self.genesymbol)

    return ' '.join(
        '%s: %s' % (resource, summary)
        for resource, summary in summaries.items()
    )


@property
def function_with_xrefs(self):
    """
    Returns the ``FUNCTION`` section with the cross-references kept.
    """

    return self.info_section('FUNCTION')


@property
def function(self):
    """
    Returns the ``FUNCTION`` section without cross-references.
    """

    return self.remove_xrefs(self.function_with_xrefs)


@property
def function_with_genecards(self):
    """
    Returns the ``FUNCTION`` section followed by the GeneCards summaries.
    If the datasheet has no ``FUNCTION`` section, only the GeneCards
    summaries are returned.
    """

    # `function` is None without a FUNCTION section; formatting that
    # directly would put the text "None" in front of the summaries
    return '%s%s' % (
        self.function or '',
        self.function_genecards,
    )


@property
def function_or_genecards(self):
    """
    Returns the ``FUNCTION`` section, or the GeneCards summaries if
    that is empty or missing.
    """

    return self.function or self.function_genecards


@property
def subcellular_location(self):
    """
    Returns the ``SUBCELLULAR LOCATION`` section without cross-references.
    """

    return self.remove_xrefs(self.subcellular_location_with_xrefs)


@property
def tissue_specificity(self):
    """
    Returns the ``TISSUE SPECIFICITY`` section without cross-references.
    """

    return self.remove_xrefs(self.tissue_specificity_with_xrefs)


@property
def subunit(self):
    """
    Returns the ``SUBUNIT`` section without cross-references.
    """

    return self.remove_xrefs(self.subunit_with_xrefs)


@property
def interaction(self):
    """
    Returns the ``INTERACTION`` section without cross-references.
    """

    return self.remove_xrefs(self.interaction_with_xrefs)


@property
def sequence_caution(self):
    """
    Returns the ``SEQUENCE CAUTION`` section without cross-references.
    """

    return self.remove_xrefs(self.sequence_caution_with_xrefs)


@property
def catalytic_activity(self):
    """
    Returns the ``CATALYTIC ACTIVITY`` section without cross-references.
    """

    return self.remove_xrefs(self.catalytic_activity_with_xrefs)


@property
def activity_regulation(self):
    """
    Returns the ``ACTIVITY REGULATION`` section without cross-references.
    """

    return self.remove_xrefs(self.activity_regulation_with_xrefs)


@property
def alternative_products(self):
    """
    Returns the ``ALTERNATIVE PRODUCTS`` section without cross-references.
    """

    return self.remove_xrefs(self.alternative_products_with_xrefs)


@property
def ptm(self):
    """
    Returns the ``PTM`` section without cross-references.
    """

    return self.remove_xrefs(self.ptm_with_xrefs)


@property
def disease(self):
    """
    Returns the ``DISEASE`` section without cross-references.
    """

    return self.remove_xrefs(self.disease_with_xrefs)


@property
def similarity(self):
    """
    Returns the ``SIMILARITY`` section without cross-references.
    """

    return self.remove_xrefs(self.similarity_with_xrefs)


@property
def web_resource(self):
    """
    Returns the ``WEB RESOURCE`` section without cross-references.
    """

    return self.remove_xrefs(self.web_resource_with_xrefs)


@property
def subcellular_location_with_xrefs(self):
    """
    Returns the ``SUBCELLULAR LOCATION`` section as it is.
    """

    return self.info_section('SUBCELLULAR LOCATION')


@property
def tissue_specificity_with_xrefs(self):
    """
    Returns the ``TISSUE SPECIFICITY`` section as it is.
    """

    return self.info_section('TISSUE SPECIFICITY')


@property
def subunit_with_xrefs(self):
    """
    Returns the ``SUBUNIT`` section as it is.
    """

    return self.info_section('SUBUNIT')


@property
def interaction_with_xrefs(self):
    """
    Returns the ``INTERACTION`` section as it is.
    """

    return self.info_section('INTERACTION')


@property
def sequence_caution_with_xrefs(self):
    """
    Returns the ``SEQUENCE CAUTION`` section as it is.
    """

    return self.info_section('SEQUENCE CAUTION')


@property
def catalytic_activity_with_xrefs(self):
    """
    Returns the ``CATALYTIC ACTIVITY`` section as it is.
    """

    return self.info_section('CATALYTIC ACTIVITY')


@property
def activity_regulation_with_xrefs(self):
    """
    Returns the ``ACTIVITY REGULATION`` section as it is.
    """

    return self.info_section('ACTIVITY REGULATION')


@property
def alternative_products_with_xrefs(self):
    """
    Returns the ``ALTERNATIVE PRODUCTS`` section as it is.
    """

    return self.info_section('ALTERNATIVE PRODUCTS')


@property
def ptm_with_xrefs(self):
    """
    Returns the ``PTM`` section as it is.
    """

    return self.info_section('PTM')


@property
def disease_with_xrefs(self):
    """
    Returns the ``DISEASE`` section as it is.
    """

    return self.info_section('DISEASE')


@property
def similarity_with_xrefs(self):
    """
    Returns the ``SIMILARITY`` section as it is.
    """

    return self.info_section('SIMILARITY')


@property
def web_resource_with_xrefs(self):
    """
    Returns the ``WEB RESOURCE`` section as it is.
    """

    return self.info_section('WEB RESOURCE')


@property
def lengths(self):
    """
    Returns the length of all isoforms as a list.
    """

    return [
        int(self._relength.search(sq).groups()[0])
        for sq in self.itertag('SQ')
    ]


@property
def weight(self):
    """
    Returns the molecular weight of the canonical isoform in Daltons.
    Returns ``None`` if the datasheet has no ``SQ`` line.
    """

    try:
        return int(self._remw.search(next(self.itertag('SQ'))).groups()[0])

    except StopIteration:
        return None


@property
def weights(self):
    """
    Returns the molecular weights of all isoforms as a list.
    """

    return [
        int(self._remw.search(sq).groups()[0])
        for sq in self.itertag('SQ')
    ]


@property
def databases(self):
    """
    Returns the database identifiers (cross-references) as a dict of
    database names and identifiers. Built on first access.
    """

    if not hasattr(self, '_databases'):
        self.update_databases()

    return self._databases


def update_databases(self):
    """
    Processes the ``DR`` lines into a dict of database names and sets of
    identifiers, and stores it in ``self._databases``. A record with one
    identifier is stored as a string, otherwise as a tuple.
    """

    result = collections.defaultdict(set)

    for db in self.itertag('DR'):

        m = self._redb.match(db)

        if m:
            dbname, ids, subtype = m.groups()
            ids = self._redbsep.split(ids)
            # dashes stand for empty fields
            ids = tuple(_id for _id in ids if _id != '-')

            if subtype:
                ids += (subtype,)

            ids = ids[0] if len(ids) == 1 else ids
            result[dbname].add(ids)

    self._databases = dict(result)
def info_section(self, title):
    """
    Looks up one section of the description by its title.

    Returns the text of the section, or ``None`` if the datasheet has
    no section with this title.
    """

    sections = self.info

    if title not in sections:
        return None

    return sections[title]
# NOTE(review): every definition below takes ``self`` or ``cls``; they appear
# to be the methods of the UniProt datasheet class whose header is not visible
# in this part of the file -- confirm the enclosing scope before applying.


@property
def genesymbol(self):
    """
    Returns the gene name from the first ``GN`` line. Falls back to the
    accession if there is no ``GN`` line or no name is found in it.
    """

    try:
        gn_line = next(self.itertag('GN'))

    except StopIteration:
        return self.ac

    m = self._rename.search(gn_line)

    return m.groups()[0] if m else self.ac


@property
def keywords_with_xrefs(self):
    """
    Returns the keywords as a list with keeping the cross-references.
    """

    return [
        kw
        for kw in itertools.chain.from_iterable(
            self._redbsep.split(line.strip('.'))
            for line in self.itertag('KW')
        )
        if kw
    ]


@property
def keywords(self):
    """
    Returns the keywords as a list, without cross-references.
    The list is empty if the datasheet has no keywords.
    """

    # cross-references are removed keyword by keyword: joining first would
    # turn "no keywords" into [''] and let one pattern match span several
    # keywords
    return [self.remove_xrefs(kw) for kw in self.keywords_with_xrefs]


@classmethod
def remove_xrefs(cls, value):
    """
    Removes everything matched by ``_rexref`` from ``value``. Empty
    values and ``None`` are returned unchanged.
    """

    return cls._rexref.sub('', value) if value else value


@property
def sequence(self):
    """
    Returns the canonical sequence (the first one) as a string of
    standard capital letter residue symbols.
    """

    result = []
    collect = False

    for tag, line in self:

        if not collect and tag == 'SQ':
            # the residues start on the line after the first SQ line
            collect = True

        elif collect:

            # NOTE(review): tag literal copied as it appears in the source;
            # confirm it equals the tag the datasheet reader gives to
            # sequence lines
            if tag == ' ':
                result.append(line)

            else:
                # any other tag ends the first sequence
                break

    return ''.join(x.replace(' ', '') for x in result)


def __iter__(self):
    """
    Iterates the records of the raw datasheet.
    """

    return self.raw.__iter__()


def itertag(self, tag):
    """
    Yields the contents of the lines labeled with ``tag``.
    """

    for _tag, line in self:

        if _tag == tag:
            yield line


def has_tag(self, tag):
    """
    Tells if at least one line of the datasheet is labeled with ``tag``.
    """

    return any(line[0] == tag for line in self)


def __repr__(self):

    return '<UniProt datasheet %s (%s)>' % (self.ac, self.genesymbol)
def query(*uniprot_ids):
    """
    Queries the datasheet of one or more UniProt IDs.

    The IDs can be given either as separate arguments or as one list-like
    object in the first argument. Records with an empty datasheet are
    dropped.

    Returns a single ``UniprotProtein`` object if exactly one protein ID
    was requested, otherwise a list of those objects.
    """

    first_is_collection = (
        len(uniprot_ids) > 0 and
        isinstance(uniprot_ids[0], _const.LIST_LIKE)
    )

    if first_is_collection:
        uniprot_ids = uniprot_ids[0]

    proteins = entity.Entity.only_proteins(common.to_list(uniprot_ids))
    # decided before the empty records are dropped
    single_id = len(proteins) == 1

    datasheets = [UniprotProtein(uniprot_id) for uniprot_id in proteins]
    non_empty = [datasheet for datasheet in datasheets if datasheet.raw]

    if single_id:
        return common.first(non_empty)

    return non_empty
def collect(uniprot_ids, *features):
    """
    Collects data about one or more UniProt IDs.

    :param str,list uniprot_ids:
        One or more UniProt IDs.
    :param *str,list features:
        Features to query: these must be method (property) names of the
        ``UniprotProtein`` class. E.g. ``['ac', 'genesymbol', 'function']``.
        If none given, ``default_features`` is used; ``ac`` is added to
        the front if not requested.

    :return:
        A ``collections.OrderedDict`` object with feature names as keys
        and list of values for each UniProt ID as values.
    """

    proteins = entity.Entity.only_proteins(uniprot_ids)
    datasheets = [UniprotProtein(uniprot_id) for uniprot_id in proteins]
    # drops the records with an empty datasheet, e.g. obsolete records
    # where the response from the server is empty;
    # most of the times nothing is removed here
    datasheets = [datasheet for datasheet in datasheets if datasheet.raw]

    if not features:
        features = default_features

    if 'ac' not in features:
        features = ['ac'] + list(features)

    table = collections.OrderedDict()

    for feature_name in features:

        table[feature_name] = [
            getattr(datasheet, feature_name)
            for datasheet in datasheets
        ]

    return table
def features_table(
        uniprot_ids,
        *features,
        width=40,
        maxlen=None,
        tablefmt='fancy_grid',
        **kwargs
    ):
    """
    Returns a table with the requested features of a list of UniProt IDs.

    The underlying table formatting module is ``tabulate``, a versatile
    module to export various ascii tables as well as HTML or LaTeX --
    check the docs for formatting options:
    https://github.com/astanin/python-tabulate

    Args
        uniprot_ids: UniProt IDs, passed to ``collect``.
        features: Feature names, passed to ``collect``; if none given,
            ``default_features`` is used.
        width: Passed to ``common.table_format``.
        maxlen: Passed to ``common.table_format``; if not given, the
            ``uniprot_info_maxlen`` setting is used.
        tablefmt: Passed to ``common.table_format``.
        kwargs: Passed to ``tabulate.tabulate``.

    Returns
        The table as a string.
    """

    if not maxlen:
        maxlen = settings.get('uniprot_info_maxlen')

    if not features:
        features = default_features

    table = collect(uniprot_ids, *features)

    return common.table_format(
        table,
        width=width,
        maxlen=maxlen,
        tablefmt=tablefmt,
        **kwargs
    )
def print_features(
        uniprot_ids,
        *features,
        fileobj=None,
        width=None,
        maxlen=None,
        tablefmt='fancy_grid',
        **kwargs
    ):
    """
    Prints a table with the requested features of a list of UniProt IDs.

    The underlying table formatting module is ``tabulate``, a versatile
    module to export various ascii tables as well as HTML or LaTeX --
    check the docs for formatting options:
    https://github.com/astanin/python-tabulate

    Args
        uniprot_ids: UniProt IDs, passed to ``features_table``.
        features: Feature names; if none given, ``default_features``
            is used.
        fileobj: Object with ``write`` and ``flush`` methods; by default
            ``sys.stdout``.
        width: Column width; if not given, derived from the width of the
            terminal and the number of features.
        maxlen: If not given, the ``uniprot_info_maxlen`` setting is used.
        tablefmt: Passed to ``features_table``.
        kwargs: Passed to ``tabulate.tabulate``.
    """

    maxlen = maxlen or settings.get('uniprot_info_maxlen')
    features = features or default_features

    if not width:
        # An explicit `width` always wins. Previously the conditional
        # expression bound looser than `or`, and the provided value was
        # replaced by 40 whenever the computed terminal width was zero.
        term_width = (shutil.get_terminal_size().columns - 60) * 2 + 40
        # very narrow terminals give zero or negative values here,
        # these fall back to the fixed default
        width = int(term_width / len(features)) if term_width > 0 else 40

    fileobj = fileobj or sys.stdout

    fileobj.write(
        features_table(
            uniprot_ids,
            *features,
            width=width,
            maxlen=maxlen,
            tablefmt=tablefmt,
            **kwargs
        )
    )
    fileobj.write(os.linesep)
    fileobj.flush()
def info(*uniprot_ids, features=None, fileobj=None, header=None, **kwargs):
    """
    Prints a table with the most important (or the requested) features of
    a list of UniProt IDs.

    Args
        uniprot_ids: UniProt IDs, either as separate arguments or as
            one collection.
        features: Feature names; by default ``default_features``.
        fileobj: Object to write to; by default ``sys.stdout``.
        header: Text written before the table; by default a line with
            the number of proteins.
        kwargs: Passed to ``print_features``.
    """

    # NOTE(review): the type check below looks at the tuple of arguments
    # itself, not at its first element (compare `query`); as written, a
    # single argument is unpacked whenever the tuple passes the check --
    # confirm this is intended.
    if len(uniprot_ids) == 1 and isinstance(uniprot_ids, _const.LIST_LIKE):
        uniprot_ids = uniprot_ids[0]

    if not features:
        features = default_features

    if not fileobj:
        fileobj = sys.stdout

    if not header:
        proteins = entity.Entity.filter_entity_type(
            common.to_list(uniprot_ids),
            entity_type='protein',
        )
        header = '=====> [%u proteins] <=====\n' % len(list(proteins))

    fileobj.write(header)

    print_features(
        common.to_list(uniprot_ids),
        *features,
        fileobj=fileobj,
        **kwargs
    )
def browse(groups, start=0, fileobj=None, **kwargs):
    """
    Browses through a series of protein groups, printing an information
    table for each group.

    After each table a line is read from the standard input: ``q`` quits,
    a number prints the same group again with that number as ``maxlen``,
    anything else moves on to the next group.

    ``kwargs`` passed to ``info`` and then to ``print_features``.
    Parameters for ``common.table_format`` can be provided.

    Args
        groups: Dict-like object with labels as keys and collections of
            UniProt IDs (or objects with a ``members`` attribute) as values.
        start: One based index of the first group to show, in the sorted
            order of the labels.
        fileobj: Object to write to; by default ``sys.stdout``.
    """

    out = fileobj or sys.stdout
    labels = sorted(groups.keys())
    n_groups = len(labels)
    maxlen_default = kwargs.get('maxlen', 500)
    quit_requested = False

    for position, label in enumerate(labels, start=1):

        if position < start:
            continue

        if quit_requested:
            break

        # each group starts with the original limit
        kwargs['maxlen'] = maxlen_default

        while True:

            uniprots = groups[label]

            if hasattr(uniprots, 'members'):
                uniprots = uniprots.members

            header = '[%u/%u] =====> %s <===== [%u proteins]\n' % (
                position,
                n_groups,
                label,
                len(uniprots),
            )

            info(uniprots, fileobj=out, header=header, **kwargs)

            answer = input()

            if answer == 'q':
                quit_requested = True
                break

            if answer.isdigit():
                # show the same group again with the new limit
                kwargs['maxlen'] = int(answer)
                continue

            out.write(os.linesep * 2)
            break

    # NOTE(review): this final line break goes to the standard output even
    # if another `fileobj` was provided -- confirm this is intended.
    sys.stdout.write(os.linesep)
    sys.stdout.flush()