# Source code for pypath.inputs.signor

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import sys
import re
import collections
import itertools
import bs4
import csv

import pypath.inputs.common as inputs_common
import pypath.share.progress as progress
import pypath.utils.taxonomy as taxonomy
import pypath.utils.mapping as mapping
import pypath.internals.intera as intera
import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.share.common as common


def signor_interactions(
        organism = 9606,
        raw_records = False,
        expand_families = 0
    ):
    """
    Downloads the full dataset from SIGNOR (https://signor.uniroma2.it/).
    Returns the records with the most important fields.
    If ``raw_records`` is `True` it returns the rows as they are read by
    ``csv.DictReader``, with unchanged content.

    Args
        organism (int, str): The NCBI Taxonomy ID or name of the organism.
            Human (9606), mouse (10090) and rat (10116) are available.
        raw_records (bool): Process the records or return them raw, as
            they are.
        expand_families (int): Expand protein families up to this size.
            Families with more members than this are dropped, hence
            with the default zero no family is expanded.

    Return
        list: A list with processed records as named tuples or dicts of
            raw records if ``raw_records`` is True. An empty list if the
            organism is unknown or not one of human, mouse and rat.
    """

    SignorInteraction = collections.namedtuple(
        'SignorInteraction',
        (
            'source',
            'target',
            'source_isoform',
            'target_isoform',
            'source_type',
            'target_type',
            'effect',
            'mechanism',
            'ncbi_tax_id',
            'pubmeds',
            'direct',
            'ptm_type',
            'ptm_residue',
            'ptm_motif',
        )
    )

    # Validate the organism first: for an unsupported organism we can
    # return without any download.
    if isinstance(organism, int):

        if organism in taxonomy.taxids:

            _organism = taxonomy.taxids[organism]

        else:

            sys.stdout.write('\t:: Unknown organism: `%u`.\n' % organism)
            return []

    else:

        _organism = organism

    if _organism not in {'human', 'rat', 'mouse'}:

        return []

    url = urls.urls['signor']['all_url_new']
    binary_data = [
        (b'organism', _organism.encode('utf-8')),
        (b'format', b'csv'),
        (b'submit', b'Download'),
    ]

    c = curl.Curl(
        url,
        silent = False,
        large = True,
        follow = True,
        timeout = 180,
        binary_data = binary_data,
        return_headers = True,
    )

    reader = csv.DictReader(c.result, delimiter = '\t')

    if raw_records:

        # the raw table needs neither the families nor the complexes
        return list(reader)

    # Families and complexes are required only for translating the
    # identifiers in the processed records.
    families = signor_protein_families(organism = organism)
    complexes = signor_complexes(organism = organism)
    complexes_by_id = collections.defaultdict(set)

    for cplex in complexes.values():

        for cplex_id in cplex.ids['SIGNOR']:

            complexes_by_id[cplex_id].add(cplex)

    def process_name(name):
        """
        Translates one SIGNOR entity ID to a collection of entities
        (family members, complexes or a single ID) and an isoform.
        """

        isoform = None

        if name in families:

            # too large families result no entities, i.e. no records
            main = (
                families[name]
                    if len(families[name]) <= expand_families else
                ()
            )

        elif name in complexes_by_id:

            main = complexes_by_id[name]

        else:

            main, isoform = inputs_common._try_isoform(name)
            main = (main,)

        return main, isoform

    result = []

    for line in reader:

        sources, source_isoform = process_name(line['IDA'])
        targets, target_isoform = process_name(line['IDB'])

        # one record for each combination of the expanded partners
        for source, target in itertools.product(sources, targets):

            this_record = SignorInteraction(
                source = source,
                target = target,
                source_isoform = source_isoform,
                target_isoform = target_isoform,
                source_type = line['TYPEA'],
                target_type = line['TYPEB'],
                effect = line['EFFECT'],
                mechanism = line['MECHANISM'],
                ncbi_tax_id = line['TAX_ID'],
                pubmeds = line['PMID'],
                direct = line['DIRECT'] == 'YES',
                ptm_type = line['MECHANISM'],
                ptm_residue = line['RESIDUE'],
                ptm_motif = line['SEQUENCE'],
            )

            result.append(this_record)

    return result


def signor_enzyme_substrate(organism = 9606):
    """
    Loads and processes Signor PTMs.

    Args
        organism (int, str): Passed to ``signor_interactions``.

    Return
        list: A list of dicts, one for each SIGNOR interaction record
            whose residue field starts with a three letter code
            followed by a number.
    """

    reres = re.compile(r'([A-Za-z]{3})([0-9]+)')

    # lookup by capitalized code, e.g. `Ser`
    aalet = {
        k.lower().capitalize(): v
        for k, v in common.aaletters.items()
    }

    result = []

    for d in signor_interactions(organism = organism):

        # csv.DictReader fills the missing fields of short rows with
        # None, hence the fallback to empty string
        resm = reres.match(d.ptm_residue or '')

        if resm is None:

            continue

        aa_code, aa_num = resm.groups()
        aa = aalet[aa_code.capitalize()]
        aanum = int(aa_num)
        inst = (d.ptm_motif or '').upper()

        result.append({
            'typ': d.ptm_type,
            'resnum': aanum,
            'instance': inst,
            'substrate': d.target,
            'start': aanum - 7,
            'end': aanum + 7,
            'kinase': d.source,
            'resaa': aa,
            'motif': inst,
            'enzyme_isoform': d.source_isoform,
            'substrate_isoform': d.target_isoform,
            'references': {d.pubmeds} if d.pubmeds != 'Other' else set()
        })

    return result


def signor_pathways(**kwargs):
    """
    Obtains pathway annotations from Signor.

    The pathways are collected from the ``pathway_list`` select element
    of the SIGNOR download page, then the table of each pathway is
    downloaded one by one.

    Args
        kwargs: Not used.

    Return
        tuple: Two dicts, both with the full names of the pathways as
            keys. Values of the first are sets of the identifiers
            returned by ``mapping.map_name`` for the partners, values of
            the second are sets of tuples of these identifiers, one for
            each interaction.
    """

    # temporary field separator
    sep = '@#@#@'
    # the partners are read from columns 4 and 8, shorter rows can not
    # be processed
    min_fields = 9

    url = urls.urls['signor']['list_url']
    baseurl = urls.urls['signor']['all_url_new']

    proteins_pathways = {}
    interactions_pathways = {}

    c = curl.Curl(url, silent = True)

    soup = bs4.BeautifulSoup(c.result, 'html.parser')

    pathway_names = [
        (opt['value'], opt.text)
        for opt in soup.find(
            'select', {'name': 'pathway_list'}
        ).find_all('option')
    ]

    prg = progress.Progress(
        len(pathway_names),
        'Downloading data from Signor',
        1,
        percent = False
    )

    for short, full in pathway_names:

        prg.step()

        # options without value do not refer to any pathway
        if not short:

            continue

        binary_data = [
            (b'pathway_list', short.encode('ascii')),
            (b'submit', b'Download')
        ]

        c_pw = curl.Curl(
            baseurl,
            silent = True,
            binary_data = binary_data,
            encoding = 'utf-8',
        )

        # the first line is the header
        lines = inputs_common.csv_sep_change(
            c_pw.result,
            '\t',
            sep
        ).split('\n')[1:]

        data = [
            row
            for row in (l.strip().split(sep) for l in lines)
            if len(row) >= min_fields
        ]

        proteins_pathways[full] = set()
        interactions_pathways[full] = set()

        for row in data:

            for uniprot1, uniprot2 in itertools.product(
                mapping.map_name(row[4], 'uniprot', 'uniprot'),
                mapping.map_name(row[8], 'uniprot', 'uniprot'),
            ):

                proteins_pathways[full].add(uniprot1)
                proteins_pathways[full].add(uniprot2)
                interactions_pathways[full].add((uniprot1, uniprot2))

    prg.terminate()

    return proteins_pathways, interactions_pathways


def signor_pathway_annotations():
    """
    Signor pathway annotations organized by protein.

    Return
        dict: The keys are the members of the protein sets returned by
            ``signor_pathways``, the values are sets of
            ``SignorPathway`` named tuples, one for each pathway the
            protein belongs to.
    """

    SignorPathway = collections.namedtuple('SignorPathway', ['pathway'])

    # only the protein level annotations are used here
    proteins_by_pathway, _ = signor_pathways()

    annotations = {}

    for pathway_name, members in iteritems(proteins_by_pathway):

        annot = SignorPathway(pathway = pathway_name)

        for member in members:

            annotations.setdefault(member, set()).add(annot)

    return annotations


def signor_protein_families(organism = 9606):
    """
    Downloads the protein family definitions from SIGNOR.

    Args
        organism: Not used at the moment.

    Return
        dict: The first field of each record as key and the list of
            the members as value. The members are read from the third
            field, split by comma and stripped of whitespace and
            double quotes.
    """

    #TODO: implement organism

    families = {}

    url = urls.urls['signor']['complexes']

    c = curl.Curl(
        url,
        binary_data = [(b'submit', b'Download protein family data')],
        large = True,
    )

    # skip the header; with the default an empty download gives an
    # empty dict instead of StopIteration
    _ = next(c.result, None)

    for rec in c.result:

        rec = rec.split(';')

        # blank or truncated lines have no members field
        if len(rec) < 3:

            continue

        components = [u.strip('\n\r" ') for u in rec[2].split(',')]
        families[rec[0]] = components

    return families


def signor_complexes(organism = 9606):
    """
    Downloads the complex definitions from SIGNOR and creates
    ``intera.Complex`` objects from them.

    Protein families among the components are expanded to all
    combinations of their members, using ``signor_protein_families``.
    Complexes with other complexes among their components (IDs starting
    with ``SIGNOR-C``) are put on hold and resolved in repeated passes,
    until the number of the records on hold does not change any more.

    Args
        organism: Passed to ``signor_protein_families``, otherwise not
            used at the moment.

    Return
        dict: ``intera.Complex`` objects with their string
            representations as keys.
    """

    #TODO: implement organism

    def process_on_hold(on_hold, complexes_by_id, complexes):
        """
        One pass over the records on hold: the already known complexes
        among the components are replaced by their components; records
        still referring to complexes are returned for the next pass.
        """

        on_hold_next = []

        for name, components, id_ in on_hold:

            # for each component the list of alternatives: the
            # components of each variant of a known complex, or the
            # component itself
            components = [
                [comp.components for comp in complexes_by_id[comp_id]]
                    if comp_id in complexes_by_id else
                ((comp_id,),)
                for comp_id in components
            ]

            for components0 in itertools.product(*components):

                this_components = list(itertools.chain(*components0))

                if any(
                    comp.startswith('SIGNOR-C')
                    for comp in this_components
                ):

                    on_hold_next.append((name, this_components, id_))

                else:

                    cplex = intera.Complex(
                        name = name.replace('"', '').strip(),
                        components = this_components,
                        sources = 'SIGNOR',
                        ids = id_,
                    )

                    complexes[cplex.__str__()] = cplex
                    complexes_by_id[id_].add(cplex)

        return on_hold_next, complexes_by_id, complexes

    complexes = {}
    on_hold = []

    families = signor_protein_families(organism = organism)

    url = urls.urls['signor']['complexes']

    c = curl.Curl(
        url,
        binary_data = [(b'submit', b'Download complex data')],
        large = True,
    )

    # skip the header; with the default an empty download gives an
    # empty dict instead of StopIteration
    _ = next(c.result, None)

    complexes_by_id = collections.defaultdict(set)

    for rec in c.result:

        rec = rec.split(';')

        # blank or truncated lines have no components field
        if len(rec) < 3:

            continue

        components = [u.strip('\n\r" ') for u in rec[2].split(',')]

        # families are replaced by the lists of their members
        components = [
            families[comp] if comp in families else [comp]
            for comp in components
        ]

        for this_components in itertools.product(*components):

            # some complex contains other complexes
            if any(comp.startswith('SIGNOR-C') for comp in this_components):

                on_hold.append((rec[1], this_components, rec[0]))

            else:

                cplex = intera.Complex(
                    name = rec[1].replace('"', '').strip(),
                    components = this_components,
                    sources = 'SIGNOR',
                    ids = rec[0],
                )

                complexes[cplex.__str__()] = cplex
                complexes_by_id[rec[0]].add(cplex)

    while True:

        # complexes are defined recursively
        count_on_hold = len(on_hold)
        on_hold, complexes_by_id, complexes = (
            process_on_hold(on_hold, complexes_by_id, complexes)
        )

        # no change since the previous pass: nothing more to resolve
        if len(on_hold) == count_on_hold:

            break

    return complexes