#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of the `pypath` python module
#
# Copyright 2014-2023
# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
# Authors: see the file `README.rst`
# Contact: Dénes Türei (turei.denes@gmail.com)
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# https://www.gnu.org/licenses/gpl-3.0.html
#
# Website: https://pypath.omnipathdb.org/
#
from future.utils import iteritems
import os
import re
import collections
import itertools
import shutil
try:
import cPickle as pickle
except:
import pickle
import bs4
import pypath.share.curl as curl
import pypath.share.common as common
import pypath.share.progress as progress
import pypath.share.cache as cache
import pypath.share.settings as settings
import pypath.share.session as session_mod
import pypath.resources.urls as urls
import pypath.internals.intera as intera
import pypath.utils.mapping as mapping
# Module-level logger named `hpmr_input`; `_log` is a shortcut to its
# logging method and is what the functions of this module call.
_logger = session_mod.Logger(name = 'hpmr_input')
_log = _logger._log
# Record of a single HPMR interaction. The fields, in positional order:
# the receptor, the role of the partner, the partner itself, the
# references and the flag telling whether the record is unambiguous.
HpmrInteraction = collections.namedtuple(
    'HpmrInteraction',
    'receptor, partner_role, partner, references, unambiguous',
)
[docs]
def get_hpmr(use_cache = None):
    """
    Downloads ligand-receptor and receptor-receptor interactions from the
    Human Plasma Membrane Receptome database.

    The family hierarchy and the list of receptor pages are read from the
    HPMR browse page. Then the page of each receptor is looked up in the
    archive of gene pages and the interactions listed on it are extracted.
    Gene symbols are passed through `mapping.map_name0` (`genesymbol` to
    `uniprot`); receptors and interactors without a mapping result are
    skipped.

    Args
        use_cache (bool): Use the intermediate cache (pickle file of
            processed data). If not a bool, the `use_intermediate_cache`
            setting decides.

    Returns
        (dict): Two elements: "interactions" and "families".
            "interactions" is a list of `HpmrInteraction` tuples;
            "families" is a dict with the mapped receptor identifiers as
            keys and tuples of the family, subfamily and sub-subfamily
            link texts as values.
    """

    def get_partner(interactors, typ, recname = None, references = None):
        """
        Selects the interactors of one role and, if more than one is
        found, combines them into a complex.

        interactors : list
            Tuples of role and identifier.
        typ : str
            `Receptor` or `Ligand`.
        recname : str
            Identifier of the receptor of the current page; added to the
            components if `typ` is `Receptor`.
        references : list
            Passed to `intera.Complex` if a complex is created.

        Returns the only component, an `intera.Complex` of more than one
        components, or None if there is no component at all.
        """

        components = [i[1] for i in interactors if i[0] == typ]

        if typ == 'Receptor' and recname:

            components.append(recname)

        if len(components) == 1:

            return components[0]

        elif len(components) > 1:

            return intera.Complex(
                components = components,
                sources = 'HPMR',
                references = references,
            )


    cachefile = cache.cache_item('hpmr_preprocessed')
    use_cache = (
        use_cache
            if isinstance(use_cache, bool) else
        settings.get('use_intermediate_cache')
    )

    if os.path.exists(cachefile) and use_cache:

        _log('Reading HPMR data from cache file `%s`.' % cachefile)

        # the context manager makes sure the file handle is closed
        with open(cachefile, 'rb') as fp:

            return pickle.load(fp)

    # NOTE(review): the `A-z` range in these patterns also covers the
    # punctuation characters between `Z` and `a` ([, \, ], ^, _, `);
    # left unchanged in order not to alter what is matched
    rerecname = re.compile(r'Receptor ([A-z0-9]+) interacts with:')
    reint = re.compile(r'(Receptor|Ligand) ([A-z0-9]+) -')
    rerefid = re.compile(r'list_uids=([- \.:,0-9A-z]+)')
    refamid = re.compile(r'.*FamId=([0-9\.]+)')
    a_family_title = 'Open Family Page'
    a_receptor_title = 'Open Receptor Page'
    a_titles = {a_family_title, a_receptor_title}

    interactions = []
    families = {}
    # complexes are collected here but are not part of the returned dict
    complexes = set()
    recpages = []

    c = curl.Curl(urls.urls['hpmri']['browse_rescued'])
    soup = bs4.BeautifulSoup(c.result, 'html.parser')

    # the current position in the family hierarchy: tuples of family ID
    # and link text; ID '0' means the level is not set
    this_family = ('0', None)
    this_subfamily = ('0', None)
    this_subsubfamily = ('0', None)

    for a in soup.find_all('a'):

        a_title = a.attrs.get('title')

        if a_title not in a_titles:

            continue

        if a_title == a_family_title:

            family_id = refamid.match(a.attrs['href']).groups()[0]

            # the level of a family link is decided by comparing its ID
            # to the IDs of the current subfamily and family: an ID
            # starting with the ID of the current level is one level below
            if family_id.startswith(this_subfamily[0]):

                this_subsubfamily = (family_id, a.text)

            elif family_id.startswith(this_family[0]):

                this_subfamily = (family_id, a.text)
                this_subsubfamily = ('0', None)

            else:

                this_family = (family_id, a.text)
                this_subfamily = ('0', None)
                this_subsubfamily = ('0', None)

        elif a_title == a_receptor_title:

            # a receptor page belongs to the families seen most recently
            recpages.append((
                a.attrs['href'],
                this_family[1],
                this_subfamily[1],
                this_subsubfamily[1],
            ))

    prg = progress.Progress(len(recpages), 'Downloading HPMR data', 1)

    regene = re.compile(r'Param=([^&]+)&ProtId=(\d+)&ProtType=(\w+)')

    genes_curl = curl.Curl(
        urls.urls['hpmri']['genes_rescued'],
        silent = False,
        large = True,
    )

    for url, family, subfamily, subsubfamily in recpages:

        protein, prot_id, prot_type = regene.search(url).groups()
        # name of the page of this receptor within the archive
        fname = 'gene_%s-%s-%s.html' % (protein, prot_id, prot_type)

        prg.step(status = 'Processing `%s`' % fname)

        _log(
            'Accessing `%s` from `%s` (%s).' % (
                fname,
                genes_curl.cache_file_name,
                genes_curl.url,
            )
        )

        if fname not in genes_curl.result:

            _log('File `%s` not found in the archive.' % fname)
            continue

        soup = bs4.BeautifulSoup(
            genes_curl.result[fname].read(),
            'html.parser',
        )
        ints = soup.find('div', {'id': 'GeneInts'})

        if not ints:

            _log('No interactions: `%s`' % url)
            continue

        # the receptor name is in the span preceding the interactions;
        # a missing span is handled the same way as an unparseable name
        title_span = ints.find_previous_sibling('span')
        recname = (
            rerecname.search(title_span.text)
                if title_span is not None else
            None
        )
        recname = recname.groups()[0] if recname else 'Unknown'

        if recname == 'Unknown':

            _log('Could not find receptor name: `%s`' % url)
            continue

        recname_u = mapping.map_name0(recname, 'genesymbol', 'uniprot')

        if not recname_u:

            continue

        families[recname_u] = (
            family,
            subfamily,
            subsubfamily,
        )

        # each table cell is processed as one interaction record
        for td in ints.find_all('td'):

            interactors = []

            for span in td.find_all('span', {'class': 'IntRow'}):

                # tuples of role (`Receptor` or `Ligand`) and gene symbol
                int_match = reint.search(span.text)

                if int_match:

                    interactors.append(int_match.groups())

            references = []

            for ref in td.find_all(
                'a', {'title': 'click to open reference in new window'}
            ):

                # links without a recognizable reference ID are ignored
                # instead of aborting the processing of all pages
                ref_match = rerefid.search(ref.attrs['href'])

                if ref_match:

                    references.append(ref_match.groups()[0].strip())

            interactors_u = []

            for role, genesymbol in interactors:

                uniprot = (
                    mapping.map_name0(genesymbol, 'genesymbol', 'uniprot')
                )

                if uniprot:

                    interactors_u.append((role, uniprot))

            # the partner is a ligand if at least one mapped interactor
            # is not a receptor; `all` is True also for an empty list,
            # in that case no partners and hence no records are created
            partner_role = (
                'receptor'
                    if all(i[0] == 'Receptor' for i in interactors_u) else
                'ligand'
            )

            # receptor-receptor: the receptor of the page against each of
            # the other receptors; ligand-receptor: all receptors
            # (including the one of the page) vs. all ligands, each side
            # becoming a complex if it has more than one member
            receptors = (
                recname_u
                    if partner_role == 'receptor' else
                get_partner(
                    interactors_u,
                    'Receptor',
                    recname = recname_u,
                    references = references,
                )
            )

            partners = (
                {u[1] for u in interactors_u} - {recname_u}
                    if partner_role == 'receptor' else
                get_partner(
                    interactors_u,
                    'Ligand',
                    references = references,
                )
            )

            receptors = common.to_list(receptors)
            partners = common.to_list(partners)

            # receptor-receptor records with more than one partner
            # are labeled as ambiguous
            unambiguous = (
                partner_role == 'ligand' or
                (
                    len(receptors) == 1 and
                    len(partners) == 1
                )
            )

            for receptor, partner in itertools.product(receptors, partners):

                interactions.append(
                    HpmrInteraction(
                        receptor = receptor,
                        partner = partner,
                        partner_role = partner_role,
                        references = ';'.join(references),
                        unambiguous = unambiguous,
                    )
                )

            for entity in itertools.chain(receptors, partners):

                if hasattr(entity, 'components'):

                    complexes.add(entity)

    prg.terminate()

    result = {
        'interactions': interactions,
        'families': families,
    }

    _log('Saving HPMR data to cache file `%s`.' % cachefile)

    # closing the file guarantees the pickle is completely written to disk
    with open(cachefile, 'wb') as fp:

        pickle.dump(result, fp)

    return result
[docs]
def hpmr_complexes(use_cache = None):
    """
    Collects the complexes from the HPMR data into a dict with their
    string representations as keys.

    HPMR does not contain unambiguous protein complex data, and considering
    the resource is unmaintained, probably it never will. The dict built
    by `get_hpmr` has no "complexes" element, hence this function
    always returns an empty dict.

    Args
        use_cache (bool): Passed to `get_hpmr`.

    Returns
        (dict): String representations of complexes as keys and the
            complex objects as values.
    """

    data = get_hpmr(use_cache = use_cache)

    return {
        cplex.__str__(): cplex
        for cplex in data.get('complexes', ())
    }
[docs]
def hpmr_interactions(use_cache = None):
    """
    Provides the interactions from the HPMR data.

    Args
        use_cache (bool): Passed to `get_hpmr`.

    Returns
        The "interactions" element of the dict returned by `get_hpmr`.
    """

    return get_hpmr(use_cache = use_cache)['interactions']
[docs]
def hpmr_annotations(use_cache = None):
    """
    Compiles role and family annotations from the HPMR data.

    Both partners of each interaction are annotated, and also each entry
    of the "families" element of the data from `get_hpmr`.

    Args
        use_cache (bool): Passed to `get_hpmr`.

    Returns
        (dict): Entities as keys, sets of `HPMRAnnotation` tuples
            (role, mainclass, subclass, subsubclass) as values.
    """

    HPMRAnnotation = collections.namedtuple(
        'HPMRAnnotation',
        ('role', 'mainclass', 'subclass', 'subsubclass'),
    )

    hpmr_data = get_hpmr(use_cache = use_cache)
    families = hpmr_data['families']
    # classification of the entities missing from the families
    no_family = (None, None, None)
    result = collections.defaultdict(set)

    def annotation(role, entity):
        """
        Annotation record of `entity` with the given role and its
        family classification.
        """

        return HPMRAnnotation(*((role,) + families.get(entity, no_family)))

    for ia in hpmr_data['interactions']:

        # the first three fields of the interaction records
        receptor, partner_role, partner = ia[:3]

        # first partner is always a receptor
        # (because ligand pages simply don't work on HPMR webpage)
        receptor_annot = annotation('Receptor', receptor)
        # the second is either a ligand or another receptor
        # NOTE(review): the role here comes from the `partner_role` field,
        # which `get_hpmr` sets to lowercase `receptor` or `ligand`, while
        # the role above is capitalized -- confirm which form is expected
        partner_annot = annotation(partner_role, partner)

        result[receptor].add(receptor_annot)
        result[partner].add(partner_annot)

    # every entity with a family classification is a receptor
    for uniprot in families:

        result[uniprot].add(annotation('Receptor', uniprot))

    return dict(result)