#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from past.builtins import xrange, range
from future.utils import iteritems

import os
import pickle
import re
import itertools

# The C accelerated alias `xml.etree.cElementTree` does not exist any more
# in Python 3.9+; fall back to the regular module, which provides the same
# API, so this module stays importable on recent interpreters.
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

import pypath.share.progress as progress
import pypath.utils.taxonomy as taxonomy
import pypath.internals.intera as intera
import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.inputs.uniprot_db as uniprot_db
import pypath.inputs.common as inputs_common
import pypath.inputs.homologene as homologene
import pypath.utils.mapping as mapping
import pypath.share.common as common
import pypath.share.session as session

# module level logger used by the functions below
_logger = session.Logger(name='phosphosite_input')
def phosphosite_enzyme_substrate(
        raw=True,
        organism='human',
        strict=True,
):
    """
    Downloads and preprocesses phosphorylation site data from PhosphoSitePlus.

    :param bool raw:
        If True, the result is a list of the table rows (dicts), extended
        with the keys ``resaa``, ``resnum``, ``start``, ``end`` and
        ``instance``. Otherwise a list of ``intera.DomainMotif`` objects
        is returned.
    :param str organism:
        Organism name as it appears in the organism columns of the table.
        With ``None`` no filtering and no translation happens.
    :param bool strict:
        If True, both the kinase and the substrate must belong to
        ``organism``. If False, only the substrate has to; kinases of
        other organisms are translated by orthology.

    :return:
        List of records, one for each kinase-substrate-site combination.
    """

    url = urls.urls['psite_kin']['url']
    c = curl.Curl(
        url,
        silent=False,
        compr='gz',
        encoding='iso-8859-1',
        large=True,
    )
    cols = {
        'kinase': 2,
        'kinase_org': 3,
        'substrate': 6,
        'substrate_org': 8,
        'residue': 9,
        'motif': 11,
    }
    data = inputs_common.read_table(
        cols=cols,
        fileObject=c.result,
        sep='\t',
        hdr=4,
    )

    result = []
    # orthology tables by the organism of the kinase, loaded on demand
    orto = {}
    non_digit = re.compile(r'[^\d.-]+')
    motre = re.compile(r'(_*)([A-Za-z]+)(_*)')

    for r in data:

        if organism is not None and (
            r['substrate_org'] != organism or
            (strict and r['kinase_org'] != organism)
        ):

            continue

        if organism is None or r['kinase_org'] == organism:

            kin_uniprot = [r['kinase']]

        else:

            # Start from an empty list for every record: previously this
            # variable kept the value from an earlier row (or was unbound)
            # when the orthology translation was not possible.
            kin_uniprot = []
            korg = r['kinase_org']

            # attempting to map by orthology:
            if korg in taxonomy.taxa and organism in taxonomy.taxa:

                ktaxid = taxonomy.taxa[korg]
                taxid = taxonomy.taxa[organism]

                if korg not in orto:

                    orto[korg] = homologene.homologene_dict(
                        ktaxid,
                        taxid,
                        'refseqp',
                    )

                korg_refseq = mapping.map_name(
                    r['kinase'],
                    'uniprot',
                    'refseqp',
                    ktaxid,
                )

                # UniProt -> RefSeq (source) -> RefSeq (target) -> UniProt
                kin_uniprot = [
                    uniprot
                    for refseq in korg_refseq
                    if refseq in orto[korg]
                    for ortholog in orto[korg][refseq]
                    for uniprot in mapping.map_name(
                        ortholog,
                        'refseqp',
                        'uniprot',
                        taxid,
                    )
                ]

        if not kin_uniprot:

            continue

        # The attributes of the site do not depend on the kinase, hence
        # these are processed once, before looping through the kinases.
        # Doing this inside the loop lost the substrate isoform from the
        # second kinase on.
        r['resaa'] = r['residue'][0]
        r['resnum'] = int(non_digit.sub('', r['residue'][1:]))
        motif_match = motre.match(r['motif'])

        # excluding e.g. Q12809_VAR_014388
        substrate = r['substrate'].split('_')[0]
        sisoform = int(substrate.split('-')[1]) if '-' in substrate else 1
        r['substrate'] = substrate.split('-')[0]

        if motif_match:

            # underscores pad the motif where it reaches over the
            # termini of the sequence
            r['start'] = r['resnum'] - 7 + len(motif_match.groups()[0])
            r['end'] = r['resnum'] + 7 - len(motif_match.groups()[2])
            r['instance'] = r['motif'].replace('_', '').upper()

        else:

            r['start'] = None
            r['end'] = None
            r['instance'] = None

        for kinase in kin_uniprot:

            kisoform = int(kinase.split('-')[1]) if '-' in kinase else 1
            kinase = kinase.split('-')[0]

            if raw:

                # one independent record for each kinase: appending the
                # same dict repeatedly made all of them show the last one
                rec = dict(r)
                rec['kinase'] = kinase
                result.append(rec)

            else:

                res = intera.Residue(
                    r['resnum'],
                    r['resaa'],
                    r['substrate'],
                    isoform=sisoform,
                )
                mot = intera.Motif(
                    r['substrate'],
                    r['start'],
                    r['end'],
                    instance=r['instance'],
                    isoform=sisoform,
                )
                ptm = intera.Ptm(
                    protein=r['substrate'],
                    residue=res,
                    motif=mot,
                    typ='phosphorylation',
                    source='PhosphoSite',
                    isoform=sisoform,
                )
                dom = intera.Domain(protein=kinase, isoform=kisoform)
                dommot = intera.DomainMotif(
                    domain=dom,
                    ptm=ptm,
                    sources=['PhosphoSite'],
                )
                result.append(dommot)

    return result
def phosphosite_ptm_orthology():
    """
    Returns an orthology translation dict of phosphosites based on
    phosphorylation sites table from PhosphoSitePlus.

    In the result all PTMs are represented by a tuple of the following
    6 elements: UniProt ID, isoform (int), residue one letter code,
    residue number (int), NCBI Taxonomy ID (int), modification type.

    :return:
        Dict of dicts of sets: the top level keys are site tuples, the
        second level keys are the taxon IDs of other organisms, and the
        values are sets of the site tuples of that organism which belong
        to the same site group.
    """

    result = {}
    nondigit = re.compile(r'[^\d]+')
    unknown_taxa = set()

    for typ in common.psite_mod_types:

        groups = {}
        url = urls.urls['psite_%s' % typ[0]]['url']
        c = curl.Curl(url, silent=False, large=True)
        data = c.result

        # the first four lines of the file are not data rows
        for _ in range(4):

            next(data)

        for r in data:

            r = common.decode(r, 'utf-8').split('\t')

            if len(r) < 10:

                continue

            uniprot = r[2]
            isoform = (
                1
                    if '-' not in uniprot else
                int(uniprot.split('-')[1])
            )
            uniprot = uniprot.split('-')[0]
            aa = r[4][0]
            # The field consists of the residue and the modification code,
            # separated by a dash. Only the part before the dash is used
            # here, otherwise the digits of a code like `m2` would be
            # appended to the residue number.
            num = int(nondigit.sub('', r[4].split('-')[0]))

            if r[6] not in taxonomy.taxa:

                unknown_taxa.add(r[6])
                continue

            tax = taxonomy.taxa[r[6]]
            group = int(r[5])
            this_site = (uniprot, isoform, aa, num, tax, typ[1])
            groups.setdefault(group, set()).add(this_site)

        for sites in groups.values():

            for site1 in sites:

                for site2 in sites:

                    # only sites in different organisms are orthologs
                    if site1[4] == site2[4]:

                        continue

                    by_taxon = result.setdefault(site1, {})
                    by_taxon.setdefault(site2[4], set()).add(site2)

    if unknown_taxa:

        _logger._log(
            'Unknown organisms encountered: %s' %
            ', '.join(sorted(unknown_taxa))
        )

    return result
def phosphosite_ptms(organism='human'):
    """
    Downloads the phosphorylation site dataset from PhosphoSitePlus.

    :param str organism:
        Organism name as it appears in the organism column of the table.
        With ``None`` the sites of all organisms are returned.

    :return:
        List of ``intera.Ptm`` objects.
    """

    result = []
    url = urls.urls['psite_p']['url']
    nondigit = re.compile(r'[^\d]+')
    remot = re.compile(r'(_*)([A-Za-z]+)(_*)')

    c = curl.Curl(url, silent=False, large=True)
    data = c.result

    # the first four lines of the file are not data rows
    for _ in range(4):

        next(data)

    for r in data:

        # decoding the same way as `phosphosite_ptm_orthology` does with
        # the same kind of download
        r = common.decode(r, 'utf-8').split('\t')

        if len(r) > 9 and (organism is None or r[6] == organism):

            uniprot = r[2]
            isoform = (
                1
                    if '-' not in uniprot else
                int(uniprot.split('-')[1])
            )
            uniprot = uniprot.split('-')[0]
            typ = r[3].lower()

            if len(typ) == 0:

                # falling back to the code after the dash in the
                # residue field
                typ = r[4].split('-')[1] if '-' in r[4] else None

            aa = r[4][0]
            # only the part before the dash is the residue: the digits of
            # a modification code must not end up in the residue number
            num = int(nondigit.sub('', r[4].split('-')[0]))
            motif = remot.match(r[9])

            if motif:

                # underscores pad the motif where it reaches over the
                # termini of the sequence
                start = num - 7 + len(motif.groups()[0])
                end = num + 7 - len(motif.groups()[2])
                instance = r[9].replace('_', '').upper()

            else:

                start = None
                end = None
                instance = None

            res = intera.Residue(
                num,
                aa,
                uniprot,
                isoform=isoform,
            )
            mot = intera.Motif(
                uniprot,
                start,
                end,
                instance=instance,
                isoform=isoform,
            )
            ptm = intera.Ptm(
                uniprot,
                typ=typ,
                motif=mot,
                residue=res,
                evidences='PhosphoSite',
                isoform=isoform,
            )
            result.append(ptm)

    return result
def phosphosite_regsites():
    """
    Downloads and preprocesses the regulatory sites dataset from
    PhosphoSitePlus. This data provides information about which
    proteins a PTM disrupts or induces the interaction with.

    :return:
        Dict with UniProt IDs (without isoform suffix) as keys and lists
        of dicts as values, each dict describing one regulatory site.
    """

    kwds_pos = {
        'enzymatic activity, induced',
        'activity, induced',
        'protein stabilization',
        'receptor inactivation, inhibited',
        'receptor desensitization, inhibited',
        'receptor internalization, inhibited',
        'receptor recycling, induced',
    }

    kwds_neg = {
        'enzymatic activity, inhibited',
        'activity, inhibited',
        'protein degradation',
        'receptor inactivation, induced',
        'receptor desensitization, induced',
        'receptor internalization, induced',
        'receptor recycling, inhibited',
    }

    def split_field(value):
        # semicolon separated field to a set of stripped items
        return {item.strip() for item in value.split(';')}

    url = urls.urls['psite_reg']['url']
    c = curl.Curl(
        url,
        silent=False,
        compr='gz',
        encoding='iso-8859-1',
        large=True,
    )
    cols = {
        'uniprot': 3,
        'organism': 6,
        'mod': 7,
        'on_function': 11,
        'on_process': 12,
        'on_interact': 13,
        'pmids': 15,
        'comments': 19,
    }
    table = inputs_common.read_table(
        cols=cols,
        fileObject=c.result,
        sep='\t',
        hdr=4,
    )

    regsites = {}

    for row in table:

        # each item looks like `PARTNER(EFFECT)`: splitting it into
        # the partner and the effect
        partners = []

        for item in row['on_interact'].strip().split(';'):

            if len(item) > 0:

                partners.append([
                    part.replace(')', '').strip()
                    for part in item.split('(')
                ])

        induces = [p[0] for p in partners if p[1] == 'INDUCES']
        disrupts = [p[0] for p in partners if p[1] == 'DISRUPTS']

        # the modification consists of the residue and the type code,
        # separated by a dash
        mod_fields = row['mod'].split('-')
        residue = list(mod_fields[0])
        aa = residue.pop(0)
        modt = mod_fields[1]
        res = ''.join(residue)

        if '-' in row['uniprot']:

            isoform = int(row['uniprot'].split('-')[1])

        else:

            isoform = 1

        uniprot = row['uniprot'].split('-')[0]
        function = split_field(row['on_function'])

        regsites.setdefault(uniprot, []).append({
            'aa': aa,
            'res': res,
            'modt': modt,
            'organism': row['organism'],
            'pmids': split_field(row['pmids']),
            'induces': induces,
            'disrupts': disrupts,
            'isoform': isoform,
            'function': function,
            'process': split_field(row['on_process']),
            'comments': row['comments'],
            'positive': bool(kwds_pos & function),
            'negative': bool(kwds_neg & function),
        })

    return regsites
def phosphosite_regsites_one_organism(organism=9606):
    """
    Returns PhosphoSitePlus regulatory sites translated to one organism
    by orthology. Residue numbers will be translated where necessary,
    while gene symbols will be translated to UniProt IDs of the given
    organism.
    This works with human, mouse or rat.

    :param int organism:
        NCBI Taxonomy ID of the target organism. In this method possible
        values are human, mouse or rat, as these species provide the
        vast majority of the data, and are close enough to each other
        that the sites can be safely translated between orthologous
        proteins by sequence alignment.

    :return:
        Dict of dicts: the top level keys are UniProt IDs of the target
        organism, the second level keys are tuples of residue letter,
        residue number and modification type, the values are dicts with
        the merged annotations of the site.
    """

    def genesymbols2uniprots(genesymbols, tax):
        # all UniProt IDs the gene symbols map to in the given organism

        return {
            uniprot
            for gs in genesymbols
            for uniprot in mapping.map_name(
                gs,
                'genesymbol',
                'uniprot',
                ncbi_tax_id=tax,
            )
        }

    def translate_uniprots(uniprots, homo):
        # orthologs of the UniProt IDs; the ones without ortholog
        # are dropped

        return {
            utgt
            for usrc in uniprots
            if usrc in homo
            for utgt in homo[usrc]
        }

    result = {}

    organisms = set([9606, 10090, 10116])

    mod_types = dict(common.psite_mod_types2)

    regsites = phosphosite_regsites()

    other_organisms = organisms - set([organism])

    # protein orthology tables from the other organisms to the target
    homology = {
        other: homologene.homologene_uniprot_dict(
            source=other,
            target=organism,
        )
        for other in other_organisms
    }

    ptm_homology = phosphosite_ptm_orthology()

    proteome = uniprot_db.all_uniprots(
        organism=organism,
        swissprot='YES',
    )

    for substrate, regs in iteritems(regsites):

        subs = []

        if substrate in proteome:

            subs = [substrate]

        else:

            for other, homo in iteritems(homology):

                if substrate in homo:

                    subs = homo[substrate]

        for sub in subs:

            if sub not in result:

                result[sub] = {}

            for reg in regs:

                # Organisms missing from the taxonomy table are skipped,
                # the same way as in `phosphosite_ptm_orthology`; a plain
                # lookup would raise KeyError and abort the processing.
                if reg['organism'] not in taxonomy.taxa:

                    continue

                reg_organism = taxonomy.taxa[reg['organism']]

                if reg_organism not in organisms:

                    continue

                if reg['modt'] not in mod_types:

                    _logger._log(
                        'Unknown PhosphoSite modification '
                        'type code: %s' % reg['modt']
                    )
                    continue

                mod_type = mod_types[reg['modt']]
                resnum = int(reg['res'])

                psite_key = (
                    substrate,
                    reg['isoform'],
                    reg['aa'],
                    resnum,
                    reg_organism,
                    mod_type,
                )

                if reg_organism != organism:

                    # the site is from another organism: both the site
                    # and the interacting partners have to be translated
                    regs_target = []
                    disrupts = []
                    induces = []

                    if psite_key in ptm_homology:

                        if organism in ptm_homology[psite_key]:

                            regs_target = ptm_homology[psite_key][organism]

                    if len(regs_target):

                        disrupts = genesymbols2uniprots(
                            reg['disrupts'],
                            reg_organism,
                        )
                        disrupts = translate_uniprots(
                            disrupts,
                            homology[reg_organism],
                        )
                        induces = genesymbols2uniprots(
                            reg['induces'],
                            reg_organism,
                        )
                        induces = translate_uniprots(
                            induces,
                            homology[reg_organism],
                        )

                else:

                    regs_target = [psite_key]

                    disrupts = genesymbols2uniprots(reg['disrupts'], organism)
                    induces = genesymbols2uniprots(reg['induces'], organism)

                for regt in regs_target:

                    # residue letter, residue number, modification type
                    modkey = (regt[2], regt[3], regt[5])

                    if modkey not in result[sub]:

                        result[sub][modkey] = {
                            'induces': set([]),
                            'disrupts': set([]),
                            'pmids': set([]),
                            'isoforms': set([]),
                            'process': set([]),
                            'function': set([]),
                            'positive': False,
                            'negative': False,
                            'comments': [],
                        }

                    site = result[sub][modkey]

                    site['induces'].update(induces)
                    site['disrupts'].update(disrupts)
                    site['process'].update(reg['process'])
                    site['function'].update(reg['function'])
                    site['isoforms'].update([regt[1]])
                    site['pmids'].update(reg['pmids'])
                    site['positive'] = site['positive'] or reg['positive']
                    site['negative'] = site['negative'] or reg['negative']

                    if len(reg['comments']):

                        site['comments'].append(reg['comments'])

    return result
def regsites_tab(regsites, outfile=None):
    """
    Exports PhosphoSite regulatory sites as a tabular file, all
    IDs translated to UniProt.

    :param dict regsites:
        Regulatory sites: dict with UniProt IDs as keys (optionally with
        isoform suffix after a dash) and lists of dicts as values, with
        the keys ``organism``, ``aa``, ``res``, ``modt``, ``induces``,
        ``disrupts`` and ``pmids``.
    :param str outfile:
        Path to write the table to. With ``None`` no file is written.

    :return:
        List of rows, each a list of 8 fields, in the order of the
        header: uniprot_a, isoform_a, a_res_aa, a_res_num, a_mod_type,
        effect, uniprot_b, references. Only the human records are
        included.
    """

    header = [
        'uniprot_a',
        'isoform_a',
        'a_res_aa',
        'a_res_num',
        'a_mod_type',
        'effect',
        'uniprot_b',
        'references',
    ]
    result = []

    for uniprot, regsite in regsites.items():

        isoform = '1'
        uniprot = uniprot.split('-')

        if len(uniprot) > 1:

            isoform = uniprot[1]

        uniprot = uniprot[0]

        for r in regsite:

            if r['organism'] != 'human':

                continue

            # sorted, so the output is the same at each run
            references = ';'.join(sorted(r.get('pmids', ())))

            # The rows of the induced interactions used to lack the
            # references, resulting in rows shorter than the header.
            for effect, key in (('+', 'induces'), ('-', 'disrupts')):

                for genesymbol in r[key]:

                    others = mapping.map_name(
                        genesymbol,
                        'genesymbol',
                        'uniprot',
                    )

                    for other in others:

                        if other != 'unmapped':

                            result.append([
                                uniprot,
                                isoform,
                                r['aa'],
                                r['res'],
                                r['modt'],
                                effect,
                                other,
                                references,
                            ])

    if outfile is not None:

        lines = ['\t'.join(header)]
        lines.extend(
            '\t'.join(str(field) for field in row)
            for row in result
        )

        with open(outfile, 'w') as fp:

            fp.write('\n'.join(lines) + '\n')

    return result
def phosphosite_interactions(cache=True, ncbi_tax_id=9606):
    """
    Downloads curated and HTP data from Phosphosite,
    from preprocessed cache file if available.
    Processes BioPAX format.
    Returns list of interactions.

    :param bool cache:
        Read the result from the cache files if both of them exist.
    :param int ncbi_tax_id:
        If not empty, only interactions with both partners among the
        UniProt IDs of this organism are kept.

    :return:
        Tuple of two lists: interactions with and without literature
        references. Each interaction is a list of source ID, target ID,
        source species, target species, evidence codes and PubMed IDs,
        the last two joined by semicolons.
    """

    curated_cache = urls.files['phosphosite']['curated']
    noref_cache = urls.files['phosphosite']['noref']

    if cache and os.path.exists(curated_cache) and os.path.exists(noref_cache):

        # NOTE: pickle is safe only with trusted files; these are
        # the cache files written by this function.
        # NOTE(review): the cache paths do not depend on `ncbi_tax_id`,
        # the cached content belongs to the organism of the call which
        # created the files.
        with open(curated_cache, 'rb') as fp:

            cached_curated = pickle.load(fp)

        with open(noref_cache, 'rb') as fp:

            cached_noref = pickle.load(fp)

        return cached_curated, cached_noref

    result_curated = []
    result_noref = []
    url = urls.urls['psite_bp']['url']
    c = curl.Curl(url, silent=False, large=True)
    bpax = c.gzfile
    xml = ET.parse(bpax)
    xmlroot = xml.getroot()
    bpprefix = '{http://www.biopax.org/release/biopax-level3.owl#}'
    rdfprefix = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
    id_attr = rdfprefix + 'ID'
    res_attr = rdfprefix + 'resource'

    proteins = {}

    for p in xmlroot.iter(bpprefix + 'ProteinReference'):

        psid = p.attrib[id_attr]
        uxref = (
            p.find(bpprefix + 'xref').
            find(bpprefix + 'UnificationXref')
        )
        db = uxref.find(bpprefix + 'db').text
        up = uxref.find(bpprefix + 'id').text
        tax = ''
        organism = p.find(bpprefix + 'organism')

        if organism is not None and res_attr in organism.attrib:

            tax = organism.attrib[res_attr].split('_')[1]

        if db == 'UniProtKB':

            # NOTE(review): only the first 6 characters are kept,
            # presumably to drop isoform suffixes; this would truncate
            # 10 characters long UniProt accessions too -- confirm.
            up = up[0:6]

        proteins[psid] = {
            'id': up,
            'db': db,
            'species': tax,
            'psid': psid,
        }

    ev_short = {'0113': 'WB', '0427': 'MS', '0074': 'MA', '0421': 'AB'}
    edges = []

    for catalysis in xmlroot.findall(bpprefix + 'Catalysis'):

        # Both ends are reset for every reaction: the source used to keep
        # its value from the previous reaction (or was unbound at the
        # first one) if no controller protein was found.
        src = None
        tgt = None

        controller = catalysis.find(bpprefix + 'controller')

        if res_attr in controller.attrib:

            src = 'po_' + controller.attrib[res_attr].split('_')[1]

        else:

            src_prot = controller.find(bpprefix + 'Protein')

            if src_prot is not None:

                src = 'po_' + src_prot.attrib[id_attr].split('_')[1]

        controlled = catalysis.find(bpprefix + 'controlled')

        # the target is looked up at three places, the first element
        # found is used
        tgt_elem = next(controlled.iter(bpprefix + 'ProteinReference'), None)

        if tgt_elem is not None:

            tgt = tgt_elem.attrib[id_attr]

        else:

            tgt_elem = next(
                controlled.iter(bpprefix + 'entityReference'),
                None
            )

            if tgt_elem is not None:

                if res_attr in tgt_elem.attrib:

                    # removing the leading hash mark
                    tgt = tgt_elem.attrib[res_attr][1:]

            else:

                tgt_elem = next(controlled.iter(bpprefix + 'left'), None)

                if tgt_elem is not None and res_attr in tgt_elem.attrib:

                    tgt = 'po_' + tgt_elem.attrib[res_attr].split('_')[1]

        pmids = []

        for ref in catalysis.iter(bpprefix + 'PublicationXref'):

            pm = ref.attrib[id_attr].split('_')

            if pm[0] == 'pmid':

                pmids.append(pm[1])

        for evidence in catalysis.iter(bpprefix + 'evidence'):

            for xref in evidence.iter(bpprefix + 'xref'):

                if res_attr in xref.attrib:

                    pm = xref.attrib[res_attr].split('_')

                    if pm[0] == 'pubmed':

                        pmids.append(pm[1])

        evs = []

        for evcode in catalysis.iter(bpprefix + 'evidenceCode'):

            if res_attr in evcode.attrib:

                evs.append(ev_short[evcode.attrib[res_attr].split('_')[1]])

            else:

                ev = evcode.find(bpprefix + 'EvidenceCodeVocabulary')
                evs.append(ev_short[ev.attrib[id_attr].split('_')[1]])

        for evidence in catalysis.iter(bpprefix + 'evidence'):

            if res_attr in evidence.attrib:

                ev = evidence.attrib[res_attr].split('_')

                if len(ev) == 4 and len(ev[3]) == 4:

                    evs.append(ev_short[ev[3]])

        if (
            src is not None and
            tgt is not None and
            src in proteins and
            tgt in proteins and
            proteins[src]['id'] is not None and
            proteins[tgt]['id'] is not None
        ):

            edges.append({
                'src': proteins[src],
                'tgt': proteins[tgt],
                # sorted, so the output is the same at each run
                'pmids': sorted(set(pmids)),
                'evs': sorted(set(evs)),
            })

    if ncbi_tax_id:

        all_uniprots = uniprot_db.all_uniprots(organism=ncbi_tax_id)

    for e in edges:

        if (
            ncbi_tax_id and (
                e['src']['id'] not in all_uniprots or
                e['tgt']['id'] not in all_uniprots
            )
        ):

            continue

        this_iaction = [
            e['src']['id'],
            e['tgt']['id'],
            e['src']['species'],
            e['tgt']['species'],
            ';'.join(e['evs']),
            ';'.join(e['pmids']),
        ]

        # interactions without references go to a separate list
        if len(this_iaction[-1]) > 0:

            result_curated.append(this_iaction)

        else:

            result_noref.append(this_iaction)

    with open(curated_cache, 'wb') as fp:

        pickle.dump(result_curated, fp)

    with open(noref_cache, 'wb') as fp:

        pickle.dump(result_noref, fp)

    return result_curated, result_noref
def phosphosite_interactions_new(cache=True):
    """
    Downloads curated and HTP data from Phosphosite,
    from preprocessed cache file if available.
    Processes BioPAX format.
    Returns list of interactions.

    :param bool cache:
        Read the result from the cache files if both of them exist.

    :return:
        Tuple of two lists: interactions with and without literature
        references. Each interaction is a list of source ID, target ID,
        source species, target species, evidence codes and PubMed IDs,
        the last two joined by semicolons.
    """

    curated_cache = urls.files['phosphosite']['curated']
    noref_cache = urls.files['phosphosite']['noref']

    if cache and os.path.exists(curated_cache) and os.path.exists(noref_cache):

        # NOTE: pickle is safe only with trusted files; these are
        # the cache files written by this module.
        with open(curated_cache, 'rb') as fp:

            data_curated = pickle.load(fp)

        with open(noref_cache, 'rb') as fp:

            data_noref = pickle.load(fp)

        return data_curated, data_noref

    bpprefix = '{http://www.biopax.org/release/biopax-level3.owl#}'
    rdfprefix = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
    id_attr = '%sID' % rdfprefix
    res_attr = '%sresource' % rdfprefix

    def collect_items(tagname, process_method):
        # dict of the processed elements with the given tag

        result = {}

        for p in xmlroot.iter(tagname):

            key, value = process_method(p)
            result[key] = value

        return result

    def process_protein(protein):
        # ID of a protein reference and a dict with its identifier,
        # database and organism

        protein_id = protein.attrib[id_attr]
        uxref = (
            protein.find('%sxref' % bpprefix).
            find('%sUnificationXref' % bpprefix)
        )
        database = uxref.find('%sdb' % bpprefix).text
        identifier = uxref.find('%sid' % bpprefix).text
        organism = None
        e_organism = protein.find('%sorganism' % bpprefix)

        if e_organism is not None and res_attr in e_organism.attrib:

            organism = e_organism.attrib[res_attr].split('_')[1]

        # Below the proteins are accessed by the keys `id` and `species`,
        # hence a dict is returned here instead of a tuple.
        return protein_id, {
            'id': identifier,
            'db': database,
            'species': organism,
            'psid': protein_id,
        }

    def process_site(site):
        # ID and sequence position of a sequence site

        site_id = site.attrib[id_attr]
        position = site.find('%ssequencePosition' % bpprefix)
        site_offset = None if position is None else position.text

        return site_id, site_offset

    def process_modification(seqmodvoc):
        # ID of a modification, and its residue and modification code

        mod_id = seqmodvoc.attrib[id_attr]
        # NOTE(review): the ID is assumed to be a prefix, an underscore
        # and the residue and the code separated by a dash -- confirm
        # against the data.
        residue, _, mod = mod_id.split('_', 1)[-1].partition('-')

        return mod_id, (residue, mod)

    def get_resource(elem, resource_tag):
        # ID of the resource an element refers to, either by attribute
        # or by an embedded element; None if neither is available

        if elem is None:

            return None

        if res_attr in elem.attrib:

            # removing the leading hash mark
            return elem.attrib[res_attr][1:]

        child = elem.find(resource_tag)

        return None if child is None else child.attrib[id_attr]

    def process_feature(feature):
        # ID of a modification feature, and the IDs of its site and
        # modification type

        feature_id = feature.attrib[id_attr]
        site = get_resource(
            feature.find('%sfeatureLocation' % bpprefix),
            '%sSequenceSite' % bpprefix,
        )
        modification = get_resource(
            feature.find('%smodificationType' % bpprefix),
            '%sSequenceModificationVocabulary' % bpprefix,
        )

        return feature_id, (site, modification)

    result_curated = []
    result_noref = []

    url = urls.urls['psite_bp']['url']
    c = curl.Curl(url, silent=False, large=True)
    bpax = c.gzfile
    xml = ET.parse(bpax)
    xmlroot = xml.getroot()

    # NOTE(review): the sources and targets below are keys of the form
    # `po_<number>`, these are the IDs of the `ProteinReference` elements
    # in `phosphosite_interactions`, hence the proteins are collected
    # from those elements -- confirm against the data.
    proteins = collect_items(
        '%sProteinReference' % bpprefix,
        process_method=process_protein,
    )
    # the sites, modifications and features are collected but not yet
    # included in the result
    sites = collect_items(
        '%sSequenceSite' % bpprefix,
        process_method=process_site,
    )
    modifications = collect_items(
        '%sSequenceModificationVocabulary' % bpprefix,
        process_method=process_modification,
    )
    features = collect_items(
        '%sModificationFeature' % bpprefix,
        process_method=process_feature,
    )

    ev_short = {'0113': 'WB', '0427': 'MS', '0074': 'MA', '0421': 'AB'}
    edges = []

    for catalysis in xmlroot.findall(bpprefix + 'Catalysis'):

        # Both ends are reset for every reaction: the source used to keep
        # its value from the previous reaction (or was unbound at the
        # first one) if no controller protein was found.
        src = None
        tgt = None

        controller = catalysis.find(bpprefix + 'controller')

        if res_attr in controller.attrib:

            src = 'po_' + controller.attrib[res_attr].split('_')[1]

        else:

            src_prot = controller.find(bpprefix + 'Protein')

            if src_prot is not None:

                src = 'po_' + src_prot.attrib[id_attr].split('_')[1]

        controlled = catalysis.find(bpprefix + 'controlled')

        # the target is looked up at three places, the first element
        # found is used
        tgt_elem = next(controlled.iter(bpprefix + 'ProteinReference'), None)

        if tgt_elem is not None:

            tgt = tgt_elem.attrib[id_attr]

        else:

            tgt_elem = next(
                controlled.iter(bpprefix + 'entityReference'),
                None
            )

            if tgt_elem is not None:

                if res_attr in tgt_elem.attrib:

                    # removing the leading hash mark
                    tgt = tgt_elem.attrib[res_attr][1:]

            else:

                tgt_elem = next(controlled.iter(bpprefix + 'left'), None)

                if tgt_elem is not None and res_attr in tgt_elem.attrib:

                    tgt = 'po_' + tgt_elem.attrib[res_attr].split('_')[1]

        pmids = []

        for ref in catalysis.iter(bpprefix + 'PublicationXref'):

            pm = ref.attrib[id_attr].split('_')

            if pm[0] == 'pmid':

                pmids.append(pm[1])

        for evidence in catalysis.iter(bpprefix + 'evidence'):

            for xref in evidence.iter(bpprefix + 'xref'):

                if res_attr in xref.attrib:

                    pm = xref.attrib[res_attr].split('_')

                    if pm[0] == 'pubmed':

                        pmids.append(pm[1])

        evs = []

        for evcode in catalysis.iter(bpprefix + 'evidenceCode'):

            if res_attr in evcode.attrib:

                evs.append(ev_short[evcode.attrib[res_attr].split('_')[1]])

            else:

                ev = evcode.find(bpprefix + 'EvidenceCodeVocabulary')
                evs.append(ev_short[ev.attrib[id_attr].split('_')[1]])

        for evidence in catalysis.iter(bpprefix + 'evidence'):

            if res_attr in evidence.attrib:

                ev = evidence.attrib[res_attr].split('_')

                if len(ev) == 4 and len(ev[3]) == 4:

                    evs.append(ev_short[ev[3]])

        if (
            src is not None and
            tgt is not None and
            src in proteins and
            tgt in proteins and
            proteins[src]['id'] is not None and
            proteins[tgt]['id'] is not None
        ):

            edges.append({
                'src': proteins[src],
                'tgt': proteins[tgt],
                # sorted, so the output is the same at each run
                'pmids': sorted(set(pmids)),
                'evs': sorted(set(evs)),
            })

    for e in edges:

        this_iaction = [
            e['src']['id'],
            e['tgt']['id'],
            e['src']['species'],
            e['tgt']['species'],
            ';'.join(e['evs']),
            ';'.join(e['pmids']),
        ]

        # interactions without references go to a separate list
        if len(this_iaction[-1]) > 0:

            result_curated.append(this_iaction)

        else:

            result_noref.append(this_iaction)

    with open(curated_cache, 'wb') as fp:

        pickle.dump(result_curated, fp)

    with open(noref_cache, 'wb') as fp:

        pickle.dump(result_noref, fp)

    return result_curated, result_noref
def phosphosite_interactions_curated(ncbi_tax_id=9606):
    """
    Loads literature curated PhosphoSite data,
    from preprocessed cache file if available.
    Returns list of interactions.

    :param int ncbi_tax_id:
        Organism passed to `phosphosite_interactions` if the cache file
        does not exist, and to the organism filter applied on the result.
    """

    curated_cache = urls.files['phosphosite']['curated']

    if not os.path.exists(curated_cache):

        curated, noref = phosphosite_interactions(ncbi_tax_id=ncbi_tax_id)
        result = curated

    else:

        # NOTE: pickle is safe only with trusted files; this is a cache
        # file written by this module. The context manager makes sure the
        # file gets closed.
        with open(curated_cache, 'rb') as fp:

            result = pickle.load(fp)

    return _phosphosite_filter_organism(result, ncbi_tax_id)
def phosphosite_interactions_noref(ncbi_tax_id=9606):
    """
    Loads HTP PhosphoSite data,
    from preprocessed cache file if available.
    Returns list of interactions.

    :param int ncbi_tax_id:
        Organism passed to `phosphosite_interactions` if the cache file
        does not exist, and to the organism filter applied on the result.
    """

    noref_cache = urls.files['phosphosite']['noref']

    if not os.path.exists(noref_cache):

        curated, noref = phosphosite_interactions(ncbi_tax_id=ncbi_tax_id)
        result = noref

    else:

        # NOTE: pickle is safe only with trusted files; this is a cache
        # file written by this module. The context manager makes sure the
        # file gets closed.
        with open(noref_cache, 'rb') as fp:

            result = pickle.load(fp)

    return _phosphosite_filter_organism(result, ncbi_tax_id)
def phosphosite_directions(organism='human'):
    """
    From curated and HTP PhosphoSite data generates a
    list of directions.

    :param str organism:
        Only the interactions with this value in both of their species
        fields are included.

    :return:
        List of the first two fields (source and target) of the
        interactions.
    """

    curated, noref = phosphosite_interactions()
    directions = []

    for iaction in curated + noref:

        # both partners must belong to the requested organism
        if iaction[2] != organism or iaction[3] != organism:

            continue

        directions.append(iaction[:2])

    return directions