# Source code for pypath.inputs.phosphosite

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from past.builtins import xrange, range
from future.utils import iteritems

import os
import pickle
import re
import itertools

import xml.etree.cElementTree as ET

import pypath.share.progress as progress
import pypath.utils.taxonomy as taxonomy
import pypath.internals.intera as intera
import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.inputs.uniprot_db as uniprot_db
import pypath.inputs.common as inputs_common
import pypath.inputs.homologene as homologene
import pypath.utils.mapping as mapping
import pypath.share.common as common
import pypath.share.session as session

_logger = session.Logger(name = 'phosphosite_input')


def phosphosite_enzyme_substrate(
        raw = True,
        organism = 'human',
        strict = True,
    ):
    """
    Downloads and preprocesses phosphorylation site data from PhosphoSitePlus.

    Reads the kinase-substrate table (``psite_kin`` URL) and keeps the rows
    whose substrate organism equals ``organism``.

    :param bool raw:
        If True, each record is a dict with the table fields ``kinase``,
        ``kinase_org``, ``substrate``, ``substrate_org``, ``residue`` and
        ``motif``, plus the derived keys ``resaa``, ``resnum``, ``start``,
        ``end`` and ``instance``. If False, ``intera.DomainMotif`` objects
        are created instead.
    :param str organism:
        Organism name as it is written in the table. If None, rows are not
        filtered by organism and kinases are kept untranslated.
    :param bool strict:
        If True, the kinase has to be from ``organism`` too. If False,
        kinases from other organisms are translated to ``organism`` by
        HomoloGene orthology through RefSeq protein IDs; rows where the
        kinase can not be translated are skipped.

    :return:
        List of dicts (``raw = True``) or list of ``intera.DomainMotif``
        objects, one element for each kinase-site pair.
    """

    url = urls.urls['psite_kin']['url']
    c = curl.Curl(
        url,
        silent = False,
        compr = 'gz',
        encoding = 'iso-8859-1',
        large = True,
    )
    cols = {
        'kinase': 2,
        'kinase_org': 3,
        'substrate': 6,
        'substrate_org': 8,
        'residue': 9,
        'motif': 11
    }
    data = inputs_common.read_table(
        cols = cols,
        fileObject = c.result,
        sep = '\t',
        hdr = 4,
    )
    result = []
    # orthology tables by kinase organism, loaded only once for each
    orto = {}
    non_digit = re.compile(r'[^\d.-]+')
    motre = re.compile(r'(_*)([A-Za-z]+)(_*)')

    for r in data:

        if organism is not None and (
            r['substrate_org'] != organism or
            (strict and r['kinase_org'] != organism)
        ):

            continue

        if organism is None or r['kinase_org'] == organism:

            kin_uniprot = [r['kinase']]

        else:

            # always start from an empty list: without this, a kinase
            # from an unknown organism would reuse the kinases of the
            # previous row (or raise NameError at the first row)
            kin_uniprot = []
            korg = r['kinase_org']

            # attempting to map by orthology:
            if korg in taxonomy.taxa and organism in taxonomy.taxa:

                ktaxid = taxonomy.taxa[korg]
                taxid = taxonomy.taxa[organism]

                if korg not in orto:

                    orto[korg] = homologene.homologene_dict(
                        ktaxid,
                        taxid,
                        'refseqp',
                    )

                korg_refseq = mapping.map_name(
                    r['kinase'],
                    'uniprot',
                    'refseqp',
                    ktaxid,
                )

                # UniProt -> RefSeq (source) -> RefSeq (target) -> UniProt
                kin_uniprot = [
                    uniprot
                    for rs in korg_refseq
                    if rs in orto[korg]
                    for ors in orto[korg][rs]
                    for uniprot in mapping.map_name(
                        ors,
                        'refseqp',
                        'uniprot',
                        taxid,
                    )
                ]

        if not kin_uniprot:

            continue

        # the site is processed once for each row, before iterating the
        # kinases: stripping the isoform from r['substrate'] inside the
        # kinase loop made every kinase after the first see isoform 1
        r['resaa'] = r['residue'][0]
        r['resnum'] = int(non_digit.sub('', r['residue'][1:]))
        mot = motre.match(r['motif'])
        # excluding e.g. Q12809_VAR_014388
        substrate = r['substrate'].split('_')[0]
        sisoform = (
            1 if '-' not in substrate else int(substrate.split('-')[1])
        )
        r['substrate'] = substrate.split('-')[0]

        if mot:

            # leading and trailing underscores pad the 15 residue window
            # where it reaches beyond the end of the sequence
            r['start'] = r['resnum'] - 7 + len(mot.groups()[0])
            r['end'] = r['resnum'] + 7 - len(mot.groups()[2])
            r['instance'] = r['motif'].replace('_', '').upper()

        else:

            r['start'] = None
            r['end'] = None
            r['instance'] = None

        for kinase in kin_uniprot:

            kisoform = (
                1 if '-' not in kinase else int(kinase.split('-')[1])
            )
            kinase = kinase.split('-')[0]

            if raw:

                # a separate copy for each kinase, otherwise all records
                # of this row would end up with the last kinase
                # NOTE(review): assumes `read_table` yields plain dicts
                rec = dict(r)
                rec['kinase'] = kinase
                result.append(rec)

            else:

                res = intera.Residue(
                    r['resnum'],
                    r['resaa'],
                    r['substrate'],
                    isoform = sisoform,
                )
                motif = intera.Motif(
                    r['substrate'],
                    r['start'],
                    r['end'],
                    instance = r['instance'],
                    isoform = sisoform,
                )
                ptm = intera.Ptm(
                    protein = r['substrate'],
                    residue = res,
                    motif = motif,
                    typ = 'phosphorylation',
                    source = 'PhosphoSite',
                    isoform = sisoform,
                )
                dom = intera.Domain(protein = kinase, isoform = kisoform)
                dommot = intera.DomainMotif(
                    domain = dom,
                    ptm = ptm,
                    sources = ['PhosphoSite'],
                )
                result.append(dommot)

    return result
def phosphosite_ptm_orthology():
    """
    Builds an orthology translation dict of PTM sites from the site tables
    of PhosphoSitePlus.

    One table is downloaded for each modification type listed in
    ``common.psite_mod_types``. Sites which share the same site group ID
    within a table but belong to different organisms are considered
    orthologous to each other.

    Every site is represented by a tuple of 6 elements: UniProt ID,
    isoform (int), residue one letter code, residue number (int),
    taxon as provided by ``taxonomy.taxa``, modification type.

    :return:
        Dict of dicts of sets: the top level keys are sites, the second
        level keys are the taxons of their orthologous sites, the values
        are sets of the orthologous sites in that taxon.
    """

    strip_nondigit = re.compile(r'[^\d]+')
    translation = {}
    taxa_not_found = set([])

    for mod_type in common.psite_mod_types:

        table_url = urls.urls['psite_%s' % mod_type[0]]['url']
        download = curl.Curl(table_url, silent = False, large = True)
        lines = download.result
        site_groups = {}

        # the first four lines are not data
        for _ in xrange(4):

            next(lines)

        for line in lines:

            fields = common.decode(line, 'utf-8').split('\t')

            if len(fields) < 10:

                continue

            accession = fields[2]
            isoform = (
                int(accession.split('-')[1])
                if '-' in accession else
                1
            )
            accession = accession.split('-')[0]
            residue = fields[4][0]
            offset = int(strip_nondigit.sub('', fields[4]))
            organism_name = fields[6]

            if organism_name not in taxonomy.taxa:

                taxa_not_found.add(organism_name)
                continue

            taxon = taxonomy.taxa[organism_name]
            group_id = int(fields[5])
            site = (
                accession,
                isoform,
                residue,
                offset,
                taxon,
                mod_type[1],
            )
            site_groups.setdefault(group_id, set([])).add(site)

        for members in site_groups.values():

            # all ordered pairs of distinct sites within the group
            for site_a, site_b in itertools.permutations(members, 2):

                if site_a[4] == site_b[4]:

                    continue

                by_taxon = translation.setdefault(site_a, {})
                by_taxon.setdefault(site_b[4], set([])).add(site_b)

    if taxa_not_found:

        _logger._log(
            'Unknown organisms encountered: %s' %
            ', '.join(sorted(taxa_not_found))
        )

    return translation
def phosphosite_ptms(organism = 'human'):
    """
    Downloads the phosphorylation site dataset from PhosphoSitePlus.

    :param str organism:
        Organism name as it is written in the table; only the rows with
        this organism are processed. If None, all rows are processed.

    :return:
        List of ``intera.Ptm`` objects, one for each processed row.
    """

    result = []
    url = urls.urls['psite_p']['url']
    nondigit = re.compile(r'[^\d]+')
    remot = re.compile(r'(_*)([A-Za-z]+)(_*)')

    c = curl.Curl(url, silent = False, large = True)
    data = c.result

    # the first four lines are not data
    for _ in xrange(4):

        next(data)

    for r in data:

        # The sibling reader of the same kind of download decodes each
        # line before splitting; splitting undecoded bytes by a text
        # separator would raise TypeError. Text lines pass unchanged.
        if isinstance(r, bytes):

            r = r.decode('utf-8')

        r = r.split('\t')

        if len(r) <= 9 or (organism is not None and r[6] != organism):

            continue

        uniprot = r[2]
        isoform = 1 if '-' not in uniprot else int(uniprot.split('-')[1])
        uniprot = uniprot.split('-')[0]
        typ = r[3].lower()

        if not typ:

            # no explicit type: fall back to the suffix of the site label
            typ = r[4].split('-')[1] if '-' in r[4] else None

        aa = r[4][0]
        num = int(nondigit.sub('', r[4]))
        motif = remot.match(r[9])

        if motif:

            # leading and trailing underscores pad the 15 residue window
            start = num - 7 + len(motif.groups()[0])
            end = num + 7 - len(motif.groups()[2])
            instance = r[9].replace('_', '').upper()

        else:

            start = None
            end = None
            instance = None

        res = intera.Residue(
            num,
            aa,
            uniprot,
            isoform = isoform,
        )
        mot = intera.Motif(
            uniprot,
            start,
            end,
            instance = instance,
            isoform = isoform,
        )
        ptm = intera.Ptm(
            uniprot,
            typ = typ,
            motif = mot,
            residue = res,
            evidences = 'PhosphoSite',
            isoform = isoform,
        )
        result.append(ptm)

    return result
def phosphosite_regsites():
    """
    Downloads and preprocesses the regulatory sites dataset from
    PhosphoSitePlus. This data provides information about which
    proteins a PTM disrupts or induces the interaction with.

    :return:
        Dict with UniProt IDs (without isoform suffix) as keys and lists
        of dicts as values. Each dict describes one regulatory site with
        the keys ``aa``, ``res``, ``modt``, ``organism``, ``pmids``,
        ``induces``, ``disrupts``, ``isoform``, ``function``, ``process``,
        ``comments``, ``positive`` and ``negative``. The last two are
        True if any of the ``function`` terms is among the keywords
        considered positive or negative, respectively.
    """

    kwds_pos = {
        'enzymatic activity, induced',
        'activity, induced',
        'protein stabilization',
        'receptor inactivation, inhibited',
        'receptor desensitization, inhibited',
        'receptor internalization, inhibited',
        'receptor recycling, induced'
    }

    kwds_neg = {
        'enzymatic activity, inhibited',
        'activity, inhibited',
        'protein degradation',
        'receptor inactivation, induced',
        'receptor desensitization, induced',
        'receptor internalization, induced',
        'receptor recycling, inhibited'
    }

    def split_terms(value):
        """Splits a semicolon separated field to a set of terms."""

        return set(term.strip() for term in value.split(';'))

    url = urls.urls['psite_reg']['url']
    c = curl.Curl(
        url,
        silent = False,
        compr = 'gz',
        encoding = 'iso-8859-1',
        large = True,
    )
    cols = {
        'uniprot': 3,
        'organism': 6,
        'mod': 7,
        'on_function': 11,
        'on_process': 12,
        'on_interact': 13,
        'pmids': 15,
        'comments': 19
    }
    data = inputs_common.read_table(
        cols = cols,
        fileObject = c.result,
        sep = '\t',
        hdr = 4,
    )
    regsites = {}

    for r in data:

        # each item is split at the opening parenthesis: the partner
        # first, then the effect, e.g. ['PARTNER', 'INDUCES']
        interact = [
            [part.replace(')', '').strip() for part in item.split('(')]
            for item in r['on_interact'].strip().split(';')
            # blank items (e.g. after a trailing semicolon) are dropped
            if item.strip()
        ]
        # items without an effect in parentheses have only one element,
        # these are ignored instead of raising IndexError
        induces = [
            i[0] for i in interact
            if len(i) > 1 and i[1] == 'INDUCES'
        ]
        disrupts = [
            i[0] for i in interact
            if len(i) > 1 and i[1] == 'DISRUPTS'
        ]

        # the modification label is the residue letter and number,
        # followed by a dash and the modification type code
        mod_parts = r['mod'].split('-')
        site = mod_parts[0]
        aa = site[0]
        res = site[1:]
        modt = mod_parts[1]

        isoform = (
            int(r['uniprot'].split('-')[1])
            if '-' in r['uniprot'] else
            1
        )
        uniprot = r['uniprot'].split('-')[0]

        function = split_terms(r['on_function'])

        regsites.setdefault(uniprot, []).append({
            'aa': aa,
            'res': res,
            'modt': modt,
            'organism': r['organism'],
            'pmids': split_terms(r['pmids']),
            'induces': induces,
            'disrupts': disrupts,
            'isoform': isoform,
            'function': function,
            'process': split_terms(r['on_process']),
            'comments': r['comments'],
            'positive': bool(kwds_pos & function),
            'negative': bool(kwds_neg & function)
        })

    return regsites
def phosphosite_regsites_one_organism(organism = 9606):
    """
    Returns PhosphoSitePlus regulatory sites translated to one organism
    by orthology. Residue numbers will be translated where necessary,
    while gene symbols will be translated to UniProt IDs of the given
    organism.
    This works with human, mouse or rat.

    :param int organism:
        NCBI Taxonomy ID of the target organism. In this method possible
        values are human, mouse or rat, as these species provide the
        vast majority of the data, and are close enough to each other
        that the sites can be safely translated between orthologous
        proteins by sequence alignment.

    :return:
        Dict of dicts: the top level keys are UniProt IDs of the target
        organism, the second level keys are tuples of residue letter,
        residue number and modification type. The values are dicts with
        the keys ``induces``, ``disrupts``, ``pmids``, ``isoforms``,
        ``process``, ``function`` (sets), ``positive``, ``negative``
        (bool) and ``comments`` (list).
    """

    def genesymbols2uniprots(genesymbols, tax):
        """Translates gene symbols to a set of UniProt IDs."""

        return set(
            uniprot
            for gs in genesymbols
            for uniprot in mapping.map_name(
                gs,
                'genesymbol',
                'uniprot',
                ncbi_tax_id = tax,
            )
        )

    def translate_uniprots(uniprots, homo):
        """Translates UniProt IDs by the orthology dict ``homo``."""

        return set(
            target
            for usrc in uniprots
            if usrc in homo
            for target in homo[usrc]
        )

    result = {}

    organisms = set([9606, 10090, 10116])

    mod_types = dict(common.psite_mod_types2)

    regsites = phosphosite_regsites()

    other_organisms = organisms - set([organism])

    homology = dict(
        (
            other,
            homologene.homologene_uniprot_dict(
                source = other,
                target = organism,
            ),
        )
        for other in other_organisms
    )

    ptm_homology = phosphosite_ptm_orthology()

    proteome = uniprot_db.all_uniprots(
        organism = organism,
        swissprot = 'YES',
    )

    for substrate, regs in iteritems(regsites):

        subs = []

        if substrate in proteome:

            subs = [substrate]

        else:

            for other, homo in iteritems(homology):

                if substrate in homo:

                    subs = homo[substrate]

        for sub in subs:

            if sub not in result:

                result[sub] = {}

            for reg in regs:

                # organisms unknown to `taxonomy.taxa` are skipped the
                # same way as the known but unsupported ones, instead
                # of raising KeyError
                if reg['organism'] not in taxonomy.taxa:

                    continue

                reg_organism = taxonomy.taxa[reg['organism']]

                if reg_organism not in organisms:

                    continue

                if reg['modt'] not in mod_types:

                    _logger._log(
                        'Unknown PhosphoSite modification '
                        'type code: %s' % reg['modt']
                    )
                    continue

                mod_type = mod_types[reg['modt']]
                resnum = int(reg['res'])

                # same layout as the keys of the PTM orthology dict
                psite_key = (
                    substrate,
                    reg['isoform'],
                    reg['aa'],
                    resnum,
                    reg_organism,
                    mod_type,
                )

                if reg_organism != organism:

                    # the site has to be translated to the target
                    # organism; if it has no ortholog, it is dropped
                    regs_target = []
                    disrupts = []
                    induces = []

                    if (
                        psite_key in ptm_homology and
                        organism in ptm_homology[psite_key]
                    ):

                        regs_target = ptm_homology[psite_key][organism]

                    if len(regs_target):

                        disrupts = genesymbols2uniprots(
                            reg['disrupts'],
                            reg_organism,
                        )
                        disrupts = translate_uniprots(
                            disrupts,
                            homology[reg_organism],
                        )
                        induces = genesymbols2uniprots(
                            reg['induces'],
                            reg_organism,
                        )
                        induces = translate_uniprots(
                            induces,
                            homology[reg_organism],
                        )

                else:

                    regs_target = [psite_key]

                    disrupts = genesymbols2uniprots(
                        reg['disrupts'],
                        organism,
                    )
                    induces = genesymbols2uniprots(
                        reg['induces'],
                        organism,
                    )

                for regt in regs_target:

                    # residue letter, residue number, modification type
                    modkey = (regt[2], regt[3], regt[5])

                    if modkey not in result[sub]:

                        result[sub][modkey] = {
                            'induces': set([]),
                            'disrupts': set([]),
                            'pmids': set([]),
                            'isoforms': set([]),
                            'process': set([]),
                            'function': set([]),
                            'positive': False,
                            'negative': False,
                            'comments': []
                        }

                    entry = result[sub][modkey]

                    entry['induces'].update(induces)
                    entry['disrupts'].update(disrupts)
                    entry['process'].update(reg['process'])
                    entry['function'].update(reg['function'])
                    entry['isoforms'].update([regt[1]])
                    entry['pmids'].update(reg['pmids'])
                    entry['positive'] = (
                        entry['positive'] or reg['positive']
                    )
                    entry['negative'] = (
                        entry['negative'] or reg['negative']
                    )

                    if len(reg['comments']):

                        entry['comments'].append(reg['comments'])

    return result
def regsites_tab(regsites, outfile = None):
    """
    Exports PhosphoSite regulatory sites as a tabular file, all
    IDs translated to UniProt.

    Only the records with organism ``human`` are exported. One row is
    created for each UniProt ID which the gene symbol of an induced
    (effect ``+``) or disrupted (effect ``-``) interaction partner maps
    to; partners mapped to ``unmapped`` are left out.

    :param dict regsites:
        Dict with UniProt IDs (optionally with isoform suffix, e.g.
        ``P12345-2``) as keys and lists of dicts as values. From the
        dicts the keys ``organism``, ``induces``, ``disrupts``, ``aa``,
        ``res``, ``modt`` and ``pmids`` are used.
    :param str outfile:
        Path of the output file. If None, no file is written.

    :return:
        List of rows, each a list of 8 strings, in the order of the
        header: ``uniprot_a``, ``isoform_a``, ``a_res_aa``, ``a_res_num``,
        ``a_mod_type``, ``effect``, ``uniprot_b``, ``references``.
    """

    header = [
        'uniprot_a', 'isoform_a', 'a_res_aa', 'a_res_num', 'a_mod_type',
        'effect', 'uniprot_b', 'references'
    ]
    effects = (('induces', '+'), ('disrupts', '-'))
    result = []

    for uniprot, regsite in regsites.items():

        parts = uniprot.split('-')
        isoform = parts[1] if len(parts) > 1 else '1'
        uniprot = parts[0]

        for r in regsite:

            if r['organism'] != 'human':

                continue

            for key, sign in effects:

                for genesymbol in r[key]:

                    others = mapping.map_name(
                        genesymbol,
                        'genesymbol',
                        'uniprot',
                    )

                    for other in others:

                        if other == 'unmapped':

                            continue

                        # the references are included for both effects:
                        # formerly the `induces` rows had only 7 fields,
                        # one less than the header; sorted to have a
                        # reproducible order
                        result.append([
                            uniprot,
                            isoform,
                            r['aa'],
                            r['res'],
                            r['modt'],
                            sign,
                            other,
                            ';'.join(sorted(r['pmids'])),
                        ])

    if outfile is not None:

        with open(outfile, 'w') as fp:

            fp.write('\t'.join(header) + '\n')
            fp.writelines('\t'.join(row) + '\n' for row in result)

    return result
def phosphosite_interactions(cache = True, ncbi_tax_id = 9606):
    """
    Downloads curated and HTP data from Phosphosite,
    from preprocessed cache file if available.
    Processes BioPAX format.
    Returns list of interactions.

    :param bool cache:
        If True and both cache files exist, their contents are returned
        without downloading and processing the BioPAX file. The cached
        data is returned as it is, without filtering by ``ncbi_tax_id``.
    :param int ncbi_tax_id:
        If it evaluates to True, only the interactions with both partners
        in ``uniprot_db.all_uniprots`` of this organism are kept.

    :return:
        Tuple of two lists: interactions with and without literature
        references. Each interaction is a list of source ID, target ID,
        source species, target species, evidence codes and PubMed IDs,
        the last two joined by semicolons.

    :raises KeyError:
        If an evidence code is not among the four known ones.
    """

    curated_cache = urls.files['phosphosite']['curated']
    noref_cache = urls.files['phosphosite']['noref']

    if cache and os.path.exists(curated_cache) and os.path.exists(noref_cache):

        # NOTE: pickle must be loaded only from trusted, locally
        # created cache files
        with open(curated_cache, 'rb') as fp:

            cached_curated = pickle.load(fp)

        with open(noref_cache, 'rb') as fp:

            cached_noref = pickle.load(fp)

        return cached_curated, cached_noref

    result_curated = []
    result_noref = []

    bpprefix = '{http://www.biopax.org/release/biopax-level3.owl#}'
    rdfprefix = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
    rdf_id = rdfprefix + 'ID'
    rdf_resource = rdfprefix + 'resource'
    ev_short = {'0113': 'WB', '0427': 'MS', '0074': 'MA', '0421': 'AB'}

    url = urls.urls['psite_bp']['url']
    c = curl.Curl(url, silent = False, large = True)
    xmlroot = ET.parse(c.gzfile).getroot()

    proteins = {}

    for p in xmlroot.iter(bpprefix + 'ProteinReference'):

        psid = p.attrib[rdf_id]
        uxref = p.find(bpprefix + 'xref').find(bpprefix + 'UnificationXref')
        db = uxref.find(bpprefix + 'db').text
        up = uxref.find(bpprefix + 'id').text
        tax = ''
        e_organism = p.find(bpprefix + 'organism')

        if e_organism is not None and rdf_resource in e_organism.attrib:

            tax = e_organism.attrib[rdf_resource].split('_')[1]

        if db == 'UniProtKB':

            # only the first 6 characters of the accession are kept
            up = up[0:6]

        proteins[psid] = {'id': up, 'db': db, 'species': tax, 'psid': psid}

    edges = []

    for catalysis in xmlroot.findall(bpprefix + 'Catalysis'):

        # reset for each record: without this, a record with no source
        # or target would silently reuse the one from the previous record
        src = None
        tgt = None

        controller = catalysis.find(bpprefix + 'controller')

        if rdf_resource in controller.attrib:

            src = 'po_' + controller.attrib[rdf_resource].split('_')[1]

        else:

            src_prot = controller.find(bpprefix + 'Protein')

            if src_prot is not None:

                src = 'po_' + src_prot.attrib[rdf_id].split('_')[1]

        # the target is looked up in three ways, the first element
        # found decides which one is used
        controlled = catalysis.find(bpprefix + 'controlled')
        tgt_elem = next(controlled.iter(bpprefix + 'ProteinReference'), None)

        if tgt_elem is not None:

            tgt = tgt_elem.attrib[rdf_id]

        else:

            tgt_elem = next(
                controlled.iter(bpprefix + 'entityReference'),
                None,
            )

            if tgt_elem is not None:

                if rdf_resource in tgt_elem.attrib:

                    # removing the leading `#` of the reference
                    tgt = tgt_elem.attrib[rdf_resource][1:]

            else:

                tgt_elem = next(controlled.iter(bpprefix + 'left'), None)

                if tgt_elem is not None and rdf_resource in tgt_elem.attrib:

                    tgt = (
                        'po_' +
                        tgt_elem.attrib[rdf_resource].split('_')[1]
                    )

        pmids = []

        for ref in catalysis.iter(bpprefix + 'PublicationXref'):

            pm = ref.attrib[rdf_id].split('_')

            if pm[0] == 'pmid':

                pmids.append(pm[1])

        for evidence in catalysis.iter(bpprefix + 'evidence'):

            for xref in evidence.iter(bpprefix + 'xref'):

                if rdf_resource in xref.attrib:

                    pm = xref.attrib[rdf_resource].split('_')

                    if pm[0] == 'pubmed':

                        pmids.append(pm[1])

        evs = []

        for e in catalysis.iter(bpprefix + 'evidenceCode'):

            if rdf_resource in e.attrib:

                evs.append(ev_short[e.attrib[rdf_resource].split('_')[1]])

            else:

                ev = e.find(bpprefix + 'EvidenceCodeVocabulary')
                evs.append(ev_short[ev.attrib[rdf_id].split('_')[1]])

        for e in catalysis.iter(bpprefix + 'evidence'):

            if rdf_resource in e.attrib:

                ev = e.attrib[rdf_resource].split('_')

                if len(ev) == 4 and len(ev[3]) == 4:

                    evs.append(ev_short[ev[3]])

        if (
            src is not None and
            tgt is not None and
            src in proteins and
            tgt in proteins and
            proteins[src]['id'] is not None and
            proteins[tgt]['id'] is not None
        ):

            edges.append({
                'src': proteins[src],
                'tgt': proteins[tgt],
                'pmids': list(set(pmids)),
                'evs': list(set(evs))
            })

    if ncbi_tax_id:

        all_uniprots = uniprot_db.all_uniprots(organism = ncbi_tax_id)

    for e in edges:

        if (
            ncbi_tax_id and (
                e['src']['id'] not in all_uniprots or
                e['tgt']['id'] not in all_uniprots
            )
        ):

            continue

        this_iaction = [
            e['src']['id'],
            e['tgt']['id'],
            e['src']['species'],
            e['tgt']['species'],
            ';'.join(e['evs']),
            ';'.join(e['pmids'])
        ]

        # interactions with at least one reference are the curated ones
        if len(this_iaction[-1]) > 0:

            result_curated.append(this_iaction)

        else:

            result_noref.append(this_iaction)

    # closing the files makes sure the caches are completely written
    with open(curated_cache, 'wb') as fp:

        pickle.dump(result_curated, fp)

    with open(noref_cache, 'wb') as fp:

        pickle.dump(result_noref, fp)

    return result_curated, result_noref
def phosphosite_interactions_new(cache = True):
    """
    Downloads curated and HTP data from Phosphosite,
    from preprocessed cache file if available.
    Processes BioPAX format.
    Returns list of interactions.

    :param bool cache:
        If True and both cache files exist, their contents are returned
        without downloading and processing the BioPAX file.

    :return:
        Tuple of two lists: interactions with and without literature
        references. Each interaction is a list of source ID, target ID,
        source species, target species, evidence codes and PubMed IDs,
        the last two joined by semicolons.

    :raises KeyError:
        If an evidence code is not among the four known ones.
    """

    curated_cache = urls.files['phosphosite']['curated']
    noref_cache = urls.files['phosphosite']['noref']

    if (
        cache and
        os.path.exists(curated_cache) and
        os.path.exists(noref_cache)
    ):

        # NOTE: pickle must be loaded only from trusted, locally
        # created cache files
        with open(curated_cache, 'rb') as fp:

            data_curated = pickle.load(fp)

        with open(noref_cache, 'rb') as fp:

            data_noref = pickle.load(fp)

        return data_curated, data_noref


    def collect_items(tagname, process_method):
        """Processes all `tagname` elements into a dict."""

        result = {}

        for p in xmlroot.iter(tagname):

            key, value = process_method(p)
            result[key] = value

        return result


    def process_protein(protein):
        """Extracts database, identifier and organism of a protein."""

        protein_id = protein.attrib['%sID' % rdfprefix]
        uxref = (
            protein.find(
                '%sxref' % bpprefix
            ).find(
                '%sUnificationXref' % bpprefix
            )
        )
        database = uxref.find('%sdb' % bpprefix).text
        identifier = uxref.find('%sid' % bpprefix).text
        organism = None
        e_organism = protein.find('%sorganism' % bpprefix)

        if (
            e_organism is not None and
            '%sresource' % rdfprefix in e_organism.attrib
        ):

            organism = (
                e_organism.attrib['%sresource' % rdfprefix].split('_')[1]
            )

        # a dict with these keys is what the processing of the
        # interactions below looks up; formerly a tuple was returned
        # here (built with a misspelled variable name)
        return protein_id, {
            'id': identifier,
            'db': database,
            'species': organism,
            'psid': protein_id,
        }


    def process_site(site):
        """Extracts the sequence position of a site."""

        site_id = site.attrib['%sID' % rdfprefix]
        position = site.find('%ssequencePosition' % bpprefix)
        site_offset = None if position is None else position.text

        return site_id, site_offset


    def process_modification(seqmodvoc):
        """Extracts residue and modification type from the ID."""

        mod_id = seqmodvoc.attrib['%sID' % rdfprefix]
        # NOTE(review): the IDs are assumed to look like
        # `<prefix>_<residue>-<modification>` -- confirm on real data;
        # formerly `split` was called on a list here
        label = mod_id.split('_', 1)[-1]
        residue, _, mod = label.partition('-')

        return mod_id, (residue, mod)


    def get_resource(elem, resource_tag):
        """ID of the resource referred by or nested in `elem`."""

        if elem is None:

            return None

        res_attr = '%sresource' % rdfprefix

        if res_attr in elem.attrib:

            # removing the leading `#` of the reference
            return elem.attrib[res_attr][1:]

        nested = elem.find(resource_tag)

        return None if nested is None else nested.attrib['%sID' % rdfprefix]


    def process_feature(feature):
        """Extracts site and modification IDs of a feature."""

        feature_id = feature.attrib['%sID' % rdfprefix]
        site = get_resource(
            feature.find('%sfeatureLocation' % bpprefix),
            '%sSequenceSite' % bpprefix,
        )
        modification = get_resource(
            feature.find('%smodificationType' % bpprefix),
            '%sSequenceModificationVocabulary' % bpprefix,
        )

        return feature_id, (site, modification)


    result_curated = []
    result_noref = []

    bpprefix = '{http://www.biopax.org/release/biopax-level3.owl#}'
    rdfprefix = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
    rdf_id = rdfprefix + 'ID'
    rdf_resource = rdfprefix + 'resource'
    ev_short = {'0113': 'WB', '0427': 'MS', '0074': 'MA', '0421': 'AB'}

    url = urls.urls['psite_bp']['url']
    c = curl.Curl(url, silent = False, large = True)
    xmlroot = ET.parse(c.gzfile).getroot()

    # The sources and targets below are identified by the IDs of
    # `ProteinReference` elements, hence the proteins are collected
    # from those, the same way as in `phosphosite_interactions`.
    proteins = collect_items(
        '%sProteinReference' % bpprefix,
        process_method = process_protein,
    )
    # NOTE(review): sites, modifications and features are collected
    # but not yet included in the returned records
    sites = collect_items(
        '%sSequenceSite' % bpprefix,
        process_method = process_site,
    )
    modifications = collect_items(
        '%sSequenceModificationVocabulary' % bpprefix,
        process_method = process_modification,
    )
    features = collect_items(
        '%sModificationFeature' % bpprefix,
        process_method = process_feature,
    )

    edges = []

    for catalysis in xmlroot.findall(bpprefix + 'Catalysis'):

        # reset for each record: without this, a record with no source
        # or target would silently reuse the one from the previous record
        src = None
        tgt = None

        controller = catalysis.find(bpprefix + 'controller')

        if rdf_resource in controller.attrib:

            src = 'po_' + controller.attrib[rdf_resource].split('_')[1]

        else:

            src_prot = controller.find(bpprefix + 'Protein')

            if src_prot is not None:

                src = 'po_' + src_prot.attrib[rdf_id].split('_')[1]

        # the target is looked up in three ways, the first element
        # found decides which one is used
        controlled = catalysis.find(bpprefix + 'controlled')
        tgt_elem = next(controlled.iter(bpprefix + 'ProteinReference'), None)

        if tgt_elem is not None:

            tgt = tgt_elem.attrib[rdf_id]

        else:

            tgt_elem = next(
                controlled.iter(bpprefix + 'entityReference'),
                None,
            )

            if tgt_elem is not None:

                if rdf_resource in tgt_elem.attrib:

                    tgt = tgt_elem.attrib[rdf_resource][1:]

            else:

                tgt_elem = next(controlled.iter(bpprefix + 'left'), None)

                if tgt_elem is not None and rdf_resource in tgt_elem.attrib:

                    tgt = (
                        'po_' +
                        tgt_elem.attrib[rdf_resource].split('_')[1]
                    )

        pmids = []

        for ref in catalysis.iter(bpprefix + 'PublicationXref'):

            pm = ref.attrib[rdf_id].split('_')

            if pm[0] == 'pmid':

                pmids.append(pm[1])

        for evidence in catalysis.iter(bpprefix + 'evidence'):

            for xref in evidence.iter(bpprefix + 'xref'):

                if rdf_resource in xref.attrib:

                    pm = xref.attrib[rdf_resource].split('_')

                    if pm[0] == 'pubmed':

                        pmids.append(pm[1])

        evs = []

        for e in catalysis.iter(bpprefix + 'evidenceCode'):

            if rdf_resource in e.attrib:

                evs.append(ev_short[e.attrib[rdf_resource].split('_')[1]])

            else:

                ev = e.find(bpprefix + 'EvidenceCodeVocabulary')
                evs.append(ev_short[ev.attrib[rdf_id].split('_')[1]])

        for e in catalysis.iter(bpprefix + 'evidence'):

            if rdf_resource in e.attrib:

                ev = e.attrib[rdf_resource].split('_')

                if len(ev) == 4 and len(ev[3]) == 4:

                    evs.append(ev_short[ev[3]])

        if (
            src is not None and
            tgt is not None and
            src in proteins and
            tgt in proteins and
            proteins[src]['id'] is not None and
            proteins[tgt]['id'] is not None
        ):

            edges.append({
                'src': proteins[src],
                'tgt': proteins[tgt],
                'pmids': list(set(pmids)),
                'evs': list(set(evs))
            })

    for e in edges:

        this_iaction = [
            e['src']['id'],
            e['tgt']['id'],
            e['src']['species'],
            e['tgt']['species'],
            ';'.join(e['evs']),
            ';'.join(e['pmids'])
        ]

        # interactions with at least one reference are the curated ones
        if len(this_iaction[-1]) > 0:

            result_curated.append(this_iaction)

        else:

            result_noref.append(this_iaction)

    # closing the files makes sure the caches are completely written
    with open(curated_cache, 'wb') as fp:

        pickle.dump(result_curated, fp)

    with open(noref_cache, 'wb') as fp:

        pickle.dump(result_noref, fp)

    return result_curated, result_noref
def _phosphosite_filter_organism(psite_data, ncbi_tax_id = 9606):
    """
    Keeps only the records where both the first and the second element
    are among the UniProt IDs of the organism ``ncbi_tax_id``.
    """

    proteome = uniprot_db.all_uniprots(organism = ncbi_tax_id)
    kept = []

    for record in psite_data:

        if record[0] not in proteome or record[1] not in proteome:

            continue

        kept.append(record)

    return kept
def phosphosite_interactions_curated(ncbi_tax_id = 9606):
    """
    Loads literature curated PhosphoSite data,
    from preprocessed cache file if available.
    Returns list of interactions.

    :param int ncbi_tax_id:
        Only the interactions with both partners among the UniProt IDs
        of this organism are returned.
    """

    curated_cache = urls.files['phosphosite']['curated']

    if os.path.exists(curated_cache):

        # the context manager closes the cache file after reading;
        # NOTE: pickle must be loaded only from trusted, locally
        # created cache files
        with open(curated_cache, 'rb') as fp:

            result = pickle.load(fp)

    else:

        result, _ = phosphosite_interactions(ncbi_tax_id = ncbi_tax_id)

    return _phosphosite_filter_organism(result, ncbi_tax_id)
def phosphosite_interactions_noref(ncbi_tax_id = 9606):
    """
    Loads HTP PhosphoSite data,
    from preprocessed cache file if available.
    Returns list of interactions.

    :param int ncbi_tax_id:
        Only the interactions with both partners among the UniProt IDs
        of this organism are returned.
    """

    noref_cache = urls.files['phosphosite']['noref']

    if os.path.exists(noref_cache):

        # the context manager closes the cache file after reading;
        # NOTE: pickle must be loaded only from trusted, locally
        # created cache files
        with open(noref_cache, 'rb') as fp:

            result = pickle.load(fp)

    else:

        _, result = phosphosite_interactions(ncbi_tax_id = ncbi_tax_id)

    return _phosphosite_filter_organism(result, ncbi_tax_id)
def phosphosite_directions(organism = 'human'):
    """
    Generates a list of directions from the curated and HTP PhosphoSite
    interactions: the first two elements (source and target) of each
    interaction where both species fields are equal to ``organism``.
    """

    # NOTE(review): the species fields are compared to `organism` as
    # they are -- confirm that they carry the same kind of label
    curated, noref = phosphosite_interactions()
    directions = []

    for iaction in curated + noref:

        if iaction[2] != organism or iaction[3] != organism:

            continue

        directions.append(iaction[:2])

    return directions
def phosphosite_interactions_all():
    """
    Returns the literature curated PhosphoSite interactions followed by
    the ones without references, both with the default organism.
    """

    curated = phosphosite_interactions_curated()
    noref = phosphosite_interactions_noref()

    return curated + noref