#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of the `pypath` python module
#
# Copyright 2014-2023
# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
# Authors: see the file `README.rst`
# Contact: Dénes Türei (turei.denes@gmail.com)
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# https://www.gnu.org/licenses/gpl-3.0.html
#
# Website: https://pypath.omnipathdb.org/
#
from future.utils import iteritems
import sys
import re
import collections
import itertools
import bs4
import csv
import pypath.inputs.common as inputs_common
import pypath.share.progress as progress
import pypath.utils.taxonomy as taxonomy
import pypath.utils.mapping as mapping
import pypath.internals.intera as intera
import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.share.common as common
[docs]
def signor_interactions(
        organism = 9606,
        raw_records = False,
        expand_families = 0
    ):
    """
    Downloads the full dataset from SIGNOR (https://signor.uniroma2.it/).

    Returns the records with the most important fields. If ``raw_records``
    is `True`, the rows are returned with unchanged content, as they are
    yielded by ``csv.DictReader``.

    The organism is validated before anything is downloaded, hence an
    unknown or unsupported organism results in an empty list without
    any network access.

    Args
        organism (int, str): The NCBI Taxonomy ID or name of the organism.
            Human (9606), mouse (10090) and rat (10116) are available.
        raw_records (bool): Process the records or return them raw,
            as they are.
        expand_families (int): Expand protein families up to this size.
            Interactions of families larger than this are omitted from
            the processed records, hence zero means no family member
            appears in the result.

    Return
        list: A list with processed records as ``SignorInteraction``
            named tuples, or dict-like raw rows if ``raw_records`` is
            True. An empty list if the organism is unknown or not
            available.
    """

    # validating the organism first: no need to download anything
    # if we are going to return an empty list anyways
    if isinstance(organism, int):

        if organism in taxonomy.taxids:

            _organism = taxonomy.taxids[organism]

        else:

            sys.stdout.write('\t:: Unknown organism: `%u`.\n' % organism)

            return []

    else:

        _organism = organism

    if _organism not in {'human', 'rat', 'mouse'}:

        return []

    families = {}
    complexes_by_id = collections.defaultdict(set)

    # families and complexes are used only for processing the names,
    # the raw records do not need them
    if not raw_records:

        families = signor_protein_families(organism = organism)
        complexes = signor_complexes(organism = organism)

        for cplex in complexes.values():

            for cplex_id in cplex.ids['SIGNOR']:

                complexes_by_id[cplex_id].add(cplex)

    def process_name(name):
        """
        Translates one SIGNOR entity ID to a collection of entities
        (family members, complexes or a single ID) and an isoform.
        """

        isoform = None

        if name in families:

            # families above the size limit yield no entities at all
            main = (
                families[name]
                    if len(families[name]) <= expand_families else
                ()
            )

        elif name in complexes_by_id:

            main = complexes_by_id[name]

        else:

            main, isoform = inputs_common._try_isoform(name)
            main = (main,)

        return main, isoform

    SignorInteraction = collections.namedtuple(
        'SignorInteraction',
        (
            'source',
            'target',
            'source_isoform',
            'target_isoform',
            'source_type',
            'target_type',
            'effect',
            'mechanism',
            'ncbi_tax_id',
            'pubmeds',
            'direct',
            'ptm_type',
            'ptm_residue',
            'ptm_motif',
        )
    )

    url = urls.urls['signor']['all_url_new']
    binary_data = [
        (b'organism', _organism.encode('utf-8')),
        (b'format', b'csv'),
        (b'submit', b'Download'),
    ]

    c = curl.Curl(
        url,
        silent = False,
        large = True,
        follow = True,
        timeout = 180,
        binary_data = binary_data,
        return_headers = True,
    )

    # despite the `csv` format label, the fields are read as tab separated
    reader = csv.DictReader(c.result, delimiter = '\t')

    if raw_records:

        return list(reader)

    result = []

    for line in reader:

        sources, source_isoform = process_name(line['IDA'])
        targets, target_isoform = process_name(line['IDB'])

        # one record for each combination of the source and target
        # entities; nothing if any of the sides is empty
        for source, target in itertools.product(sources, targets):

            result.append(
                SignorInteraction(
                    source = source,
                    target = target,
                    source_isoform = source_isoform,
                    target_isoform = target_isoform,
                    source_type = line['TYPEA'],
                    target_type = line['TYPEB'],
                    effect = line['EFFECT'],
                    mechanism = line['MECHANISM'],
                    ncbi_tax_id = line['TAX_ID'],
                    pubmeds = line['PMID'],
                    direct = line['DIRECT'] == 'YES',
                    # the PTM type is read from the mechanism column
                    ptm_type = line['MECHANISM'],
                    ptm_residue = line['RESIDUE'],
                    ptm_motif = line['SEQUENCE'],
                )
            )

    return result
[docs]
def signor_enzyme_substrate(organism = 9606):
    """
    Loads and processes Signor PTMs.

    Only the interactions where the residue field starts with a three
    letter amino acid code followed by a residue number are included.

    Args
        organism (int, str): Passed to ``signor_interactions``.

    Return
        list: A list of dicts, one for each enzyme-substrate interaction,
            with the keys `typ`, `resnum`, `instance`, `substrate`,
            `start`, `end`, `kinase`, `resaa`, `motif`, `enzyme_isoform`,
            `substrate_isoform` and `references`.
    """

    reres = re.compile(r'([A-Za-z]{3})([0-9]+)')

    # three letter codes in capitalized form (e.g. `Ser`) to the values
    # of `common.aaletters`
    aalet = {
        code.lower().capitalize(): letter
        for code, letter in iteritems(common.aaletters)
    }

    result = []

    for d in signor_interactions(organism = organism):

        resm = reres.match(d.ptm_residue)

        if resm is None:

            # no residue information: not an enzyme-substrate record
            continue

        aa_code, aa_num = resm.groups()
        aa = aalet[aa_code.capitalize()]
        aanum = int(aa_num)
        inst = d.ptm_motif.upper()

        result.append({
            'typ': d.ptm_type,
            'resnum': aanum,
            'instance': inst,
            'substrate': d.target,
            # a window of 7 residues in both directions around the site
            'start': aanum - 7,
            'end': aanum + 7,
            'kinase': d.source,
            'resaa': aa,
            'motif': inst,
            'enzyme_isoform': d.source_isoform,
            'substrate_isoform': d.target_isoform,
            'references': {d.pubmeds} if d.pubmeds != 'Other' else set()
        })

    return result
[docs]
def signor_pathways(**kwargs):
    """
    Obtains pathway annotations from Signor.

    The list of the pathways is read from the `pathway_list` selector of
    the Signor web page, then the data is downloaded for each pathway
    one by one.

    Args
        kwargs: Accepted for compatibility, not used.

    Return
        tuple: Two dicts with the full pathway names as keys: in the
            first the values are sets of identifiers, in the second
            sets of pairs of identifiers (interactions), as returned by
            ``mapping.map_name``.
    """

    url = urls.urls['signor']['list_url']
    baseurl = urls.urls['signor']['all_url_new']

    proteins_pathways = {}
    interactions_pathways = {}

    c = curl.Curl(url, silent = True)

    soup = bs4.BeautifulSoup(c.result, 'html.parser')

    pathway_selector = soup.find('select', {'name': 'pathway_list'})

    pathway_names = [
        (opt['value'], opt.text)
        for opt in pathway_selector.find_all('option')
    ]

    prg = progress.Progress(
        len(pathway_names),
        'Downloading data from Signor',
        1,
        percent = False
    )

    # temporary field separator for splitting the lines
    sep = '@#@#@'

    for short, full in pathway_names:

        prg.step()

        if not short:

            # options with empty value are skipped
            continue

        binary_data = [
            (b'pathway_list', short.encode('ascii')),
            (b'submit', b'Download')
        ]

        c_pw = curl.Curl(
            baseurl,
            silent = True,
            binary_data = binary_data,
            encoding = 'utf-8',
        )

        # the first line is dropped
        lines = inputs_common.csv_sep_change(
            c_pw.result,
            '\t',
            sep
        ).split('\n')[1:]

        rows = (line.strip().split(sep) for line in lines)

        # fields 4 and 8 are used below, shorter rows can not be processed
        data = [row for row in rows if len(row) > 8]

        proteins_pathways[full] = set()
        interactions_pathways[full] = set()

        for row in data:

            for uniprot1, uniprot2 in itertools.product(
                mapping.map_name(row[4], 'uniprot', 'uniprot'),
                mapping.map_name(row[8], 'uniprot', 'uniprot'),
            ):

                proteins_pathways[full].add(uniprot1)
                proteins_pathways[full].add(uniprot2)
                interactions_pathways[full].add((uniprot1, uniprot2))

    prg.terminate()

    return proteins_pathways, interactions_pathways
[docs]
def signor_pathway_annotations():
    """
    Signor pathway annotations organized by the annotated entities.

    Return
        dict: The members of the pathways (from the first dict returned
            by ``signor_pathways``) as keys, sets of ``SignorPathway``
            named tuples as values.
    """

    SignorPathway = collections.namedtuple('SignorPathway', ['pathway'])

    # the interactions (second element) are not used here
    members_by_pathway, _ = signor_pathways()

    annotations = {}

    for pathway_name, members in iteritems(members_by_pathway):

        annot = SignorPathway(pathway = pathway_name)

        for member in members:

            annotations.setdefault(member, set()).add(annot)

    return annotations
[docs]
def signor_protein_families(organism = 9606):
    """
    Downloads the protein family definitions from SIGNOR.

    Args
        organism (int, str): Not implemented yet, has no effect.

    Return
        dict: The first field of each line (family ID) as keys, lists of
            the comma separated values from the third field (members)
            as values.
    """

    #TODO: implement organism

    families = {}

    url = urls.urls['signor']['complexes']

    c = curl.Curl(
        url,
        binary_data = [(b'submit', b'Download protein family data')],
        large = True,
    )

    # the first line is discarded (presumably the header);
    # the default avoids `StopIteration` on empty content
    _ = next(c.result, None)

    for line in c.result:

        fields = line.split(';')

        if len(fields) < 3:

            # blank or truncated line: no members to read
            continue

        families[fields[0]] = [
            member.strip('\n\r" ')
            for member in fields[2].split(',')
        ]

    return families
[docs]
def signor_complexes(organism = 9606):
    """
    Downloads and processes the protein complexes from SIGNOR.

    Protein families among the components are expanded to their members,
    resulting one complex for each combination. Complexes which contain
    other complexes are resolved iteratively, until no more of them can
    be resolved.

    Args
        organism (int, str): Not implemented yet, only passed to
            ``signor_protein_families``.

    Return
        dict: String representations of the complexes as keys,
            ``intera.Complex`` objects as values.
    """

    #TODO: implement organism

    complexes = {}
    complexes_by_id = collections.defaultdict(set)

    def has_unresolved(components):
        """
        Tells if any of the components is a SIGNOR complex ID.
        """

        return any(comp.startswith('SIGNOR-C') for comp in components)

    def add_complex(name, components, id_):
        """
        Creates a complex and registers it both by its string
        representation and its SIGNOR ID.
        """

        cplex = intera.Complex(
            name = name.replace('"', '').strip(),
            components = components,
            sources = 'SIGNOR',
            ids = id_,
        )
        complexes[cplex.__str__()] = cplex
        complexes_by_id[id_].add(cplex)

    def process_on_hold(on_hold):
        """
        One pass over the complexes on hold: replaces the already known
        complexes among the components by their components. Returns the
        complexes still waiting for some of their components.
        """

        on_hold_next = []

        for name, components, id_ in on_hold:

            # for each component: either the components of all the known
            # variants of the complex, or the component itself
            alternatives = [
                [cplex.components for cplex in complexes_by_id[comp_id]]
                    if comp_id in complexes_by_id else
                ((comp_id,),)
                for comp_id in components
            ]

            for combination in itertools.product(*alternatives):

                this_components = tuple(itertools.chain(*combination))

                if has_unresolved(this_components):

                    on_hold_next.append((name, this_components, id_))

                else:

                    add_complex(name, list(this_components), id_)

        return on_hold_next

    on_hold = []

    families = signor_protein_families(organism = organism)

    url = urls.urls['signor']['complexes']

    c = curl.Curl(
        url,
        binary_data = [(b'submit', b'Download complex data')],
        large = True,
    )

    # the first line is discarded (presumably the header);
    # the default avoids `StopIteration` on empty content
    _ = next(c.result, None)

    for line in c.result:

        fields = line.split(';')

        if len(fields) < 3:

            # blank or truncated line: no components to read
            continue

        components = [u.strip('\n\r" ') for u in fields[2].split(',')]

        # families are replaced by the list of their members
        components = [
            families[comp] if comp in families else [comp]
            for comp in components
        ]

        for this_components in itertools.product(*components):

            # some complex contains other complexes
            if has_unresolved(this_components):

                on_hold.append((fields[1], this_components, fields[0]))

            else:

                add_complex(fields[1], this_components, fields[0])

    # complexes are defined recursively: iterating until a pass leaves
    # the complexes on hold unchanged. Comparing only the number of them
    # could stop too early, if some got resolved while others expanded
    # to more combinations in the same pass.
    while on_hold:

        on_hold_next = process_on_hold(on_hold)

        if on_hold_next == on_hold:

            break

        on_hold = on_hold_next

    return complexes