# Source code for pypath.inputs.hpmr

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import os
import re
import collections
import itertools
import shutil

try:
    import cPickle as pickle
except:
    import pickle

import bs4

import pypath.share.curl as curl
import pypath.share.common as common
import pypath.share.progress as progress
import pypath.share.cache as cache
import pypath.share.settings as settings
import pypath.share.session as session_mod
import pypath.resources.urls as urls
import pypath.internals.intera as intera
import pypath.utils.mapping as mapping

# Module-level logger, registered under the name `hpmr_input`.
_logger = session_mod.Logger(name = 'hpmr_input')
# Shortcut to the logger's `_log` method; the functions below call it
# to report progress and skipped records.
_log = _logger._log


# One HPMR interaction record, as built by `get_hpmr`:
#   receptor     -- the receptor side of the interaction
#   partner_role -- 'receptor' or 'ligand' (role of `partner`)
#   partner      -- the interacting partner
#   references   -- reference identifiers joined by ';'
#   unambiguous  -- boolean flag computed in `get_hpmr`
HpmrInteraction = collections.namedtuple(
    'HpmrInteraction',
    'receptor partner_role partner references unambiguous',
)


def get_hpmr(use_cache = None):
    """
    Downloads ligand-receptor and receptor-receptor interactions from the
    Human Plasma Membrane Receptome database.

    The family hierarchy is read from the "browse" page, then the page of
    each receptor is read from the archive of gene pages. Gene symbols
    are translated to UniProt by `mapping.map_name0`; entities without
    a translation are skipped.

    Args
        use_cache (bool): Use the intermediate cache (pickle file of
            processed data). If not a bool, the value of the
            `use_intermediate_cache` setting is used instead.

    Returns
        (dict): Two elements: "interactions" (list of `HpmrInteraction`
            tuples) and "families" (dict of receptor identifiers to
            `(family, subfamily, subsubfamily)` name tuples).
    """

    def get_partner(interactors, typ, recname = None, references = None):
        """
        Picks the interactors of one role; a single one is returned as
        it is, several are wrapped into one `intera.Complex`. Returns
        None if there is no interactor of the requested role.

        typ : str
            `Receptor` or `Ligand`.
        """

        components = [i[1] for i in interactors if i[0] == typ]

        if typ == 'Receptor' and recname:

            # the receptor owning the page is part of the receptor side
            components.append(recname)

        if len(components) == 1:

            return components[0]

        elif len(components) > 1:

            return intera.Complex(
                components = components,
                sources = 'HPMR',
                references = references,
            )


    cachefile = cache.cache_item('hpmr_preprocessed')
    use_cache = (
        use_cache
            if isinstance(use_cache, bool) else
        settings.get('use_intermediate_cache')
    )

    if os.path.exists(cachefile) and use_cache:

        _log('Reading HPMR data from cache file `%s`.' % cachefile)

        # NOTE: unpickling executes arbitrary code from the file; the
        # cache file is expected to be one written by this function.
        with open(cachefile, 'rb') as fp:

            return pickle.load(fp)

    # NOTE(review): the `A-z` range also spans the punctuation between
    # `Z` and `a` ([, \, ], ^, _, `); left unchanged because narrowing
    # it could alter which names are matched.
    rerecname = re.compile(r'Receptor ([A-z0-9]+) interacts with:')
    reint = re.compile(r'(Receptor|Ligand) ([A-z0-9]+) -')
    rerefid = re.compile(r'list_uids=([- \.:,0-9A-z]+)')
    refamid = re.compile(r'.*FamId=([0-9\.]+)')
    regene = re.compile(r'Param=([^&]+)&ProtId=(\d+)&ProtType=(\w+)')

    a_family_title = 'Open Family Page'
    a_receptor_title = 'Open Receptor Page'
    a_titles = {a_family_title, a_receptor_title}

    interactions = []
    families = {}
    recpages = []

    c = curl.Curl(urls.urls['hpmri']['browse_rescued'])
    soup = bs4.BeautifulSoup(c.result, 'html.parser')

    # (family ID, name) of the most recently seen entry at each level
    this_family = ('0', None)
    this_subfamily = ('0', None)
    this_subsubfamily = ('0', None)

    for a in soup.find_all('a'):

        a_title = a.attrs['title'] if 'title' in a.attrs else None

        if a_title not in a_titles:

            continue

        if a_title == a_family_title:

            family_id = refamid.match(a.attrs['href']).groups()[0]

            # the level of a family link is decided by ID prefix: an ID
            # extending the current subfamily ID is a sub-subfamily, one
            # extending the current family ID is a subfamily, anything
            # else starts a new top level family
            if family_id.startswith(this_subfamily[0]):

                this_subsubfamily = (family_id, a.text)

            elif family_id.startswith(this_family[0]):

                this_subfamily = (family_id, a.text)
                this_subsubfamily = ('0', None)

            else:

                this_family = (family_id, a.text)
                this_subfamily = ('0', None)
                this_subsubfamily = ('0', None)

        elif a_title == a_receptor_title:

            # a receptor link belongs to the families seen last
            recpages.append((
                a.attrs['href'],
                this_family[1],
                this_subfamily[1],
                this_subsubfamily[1],
            ))

    prg = progress.Progress(len(recpages), 'Downloading HPMR data', 1)

    genes_curl = curl.Curl(
        urls.urls['hpmri']['genes_rescued'],
        silent = False,
        large = True,
    )

    for url, family, subfamily, subsubfamily in recpages:

        protein, prot_id, prot_type = regene.search(url).groups()
        fname = 'gene_%s-%s-%s.html' % (protein, prot_id, prot_type)

        prg.step(status = 'Processing `%s`' % fname)

        _log(
            'Accessing `%s` from `%s` (%s).' % (
                fname,
                genes_curl.cache_file_name,
                genes_curl.url,
            )
        )

        if fname not in genes_curl.result:

            _log('File `%s` not found in the archive.' % fname)
            continue

        soup = bs4.BeautifulSoup(
            genes_curl.result[fname].read(),
            'html.parser',
        )
        ints_div = soup.find('div', {'id': 'GeneInts'})

        if not ints_div:

            _log('No interactions: `%s`' % url)
            continue

        recname_match = rerecname.search(
            ints_div.find_previous_sibling('span').text
        )
        recname = recname_match.groups()[0] if recname_match else 'Unknown'

        if recname == 'Unknown':

            _log('Could not find receptor name: `%s`' % url)
            continue

        recname_u = mapping.map_name0(recname, 'genesymbol', 'uniprot')

        if not recname_u:

            continue

        families[recname_u] = (
            family,
            subfamily,
            subsubfamily,
        )

        # each table cell is processed as one interaction record
        for td in ints_div.find_all('td'):

            interactors = []

            for span in td.find_all('span', {'class': 'IntRow'}):

                int_match = reint.search(span.text)

                if int_match:

                    # (role, gene symbol)
                    interactors.append(int_match.groups())

            references = []

            for ref in td.find_all(
                'a',
                {'title': 'click to open reference in new window'}
            ):

                references.append(
                    rerefid.search(ref.attrs['href']).groups()[0].strip()
                )

            interactors_u = []

            for role, genesymbol in interactors:

                uniprot = (
                    mapping.map_name0(genesymbol, 'genesymbol', 'uniprot')
                )

                if uniprot:

                    interactors_u.append((role, uniprot))

            # `all` is True also for an empty list; in that case the
            # set of partners below is empty and nothing is recorded
            partner_role = (
                'receptor'
                    if all(i[0] == 'Receptor' for i in interactors_u) else
                'ligand'
            )

            receptors = (
                recname_u
                    if partner_role == 'receptor' else
                get_partner(
                    interactors_u,
                    'Receptor',
                    recname = recname_u,
                    references = references,
                )
            )

            partners = (
                {u[1] for u in interactors_u} - {recname_u}
                    if partner_role == 'receptor' else
                get_partner(
                    interactors_u,
                    'Ligand',
                    references = references,
                )
            )

            # presumably `to_list` turns a single value or a set into a
            # list -- TODO confirm in `pypath.share.common`
            receptors = common.to_list(receptors)
            partners = common.to_list(partners)

            unambiguous = (
                partner_role == 'ligand' or (
                    len(receptors) == 1 and
                    len(partners) == 1
                )
            )

            for receptor, partner in itertools.product(receptors, partners):

                interactions.append(
                    HpmrInteraction(
                        receptor = receptor,
                        partner = partner,
                        partner_role = partner_role,
                        references = ';'.join(references),
                        unambiguous = unambiguous,
                    )
                )

    prg.terminate()

    # complexes are deliberately not part of the result: only
    # interactions and families are exported and cached
    result = {
        'interactions': interactions,
        'families': families,
    }

    _log('Saving HPMR data to cache file `%s`.' % cachefile)

    with open(cachefile, 'wb') as fp:

        pickle.dump(result, fp)

    return result
def hpmr_complexes(use_cache = None):
    """
    Protein complexes from HPMR, as a dict with the string representation
    of each complex as key and the complex object as value.

    HPMR does not contain unambiguous protein complex data, and
    considering the resource is unmaintained, probably it never will.
    The complexes are looked up under the "complexes" key of the data
    from `get_hpmr`; that function builds only the "interactions" and
    "families" keys, hence the result here is an empty dict.

    Args
        use_cache (bool): Passed to `get_hpmr`.

    Returns
        (dict): Complexes by their string representation.
    """

    data = get_hpmr(use_cache = use_cache)

    return {
        cplex.__str__(): cplex
        for cplex in data.get('complexes', ())
    }
def hpmr_interactions(use_cache = None):
    """
    HPMR interactions: the "interactions" element of the data returned
    by `get_hpmr`.

    Args
        use_cache (bool): Passed to `get_hpmr`.
    """

    return get_hpmr(use_cache = use_cache)['interactions']
def hpmr_annotations(use_cache = None):
    """
    Role and family annotations of the entities in HPMR.

    Each interaction contributes one record for its receptor and one for
    its partner; additionally every entry of the "families" table gets a
    `Receptor` record.

    Role labels are capitalized (`Receptor`, `Ligand`). The interaction
    records carry the partner role in lowercase, while this function
    labels receptors with the literal `Receptor`; without normalization
    the same protein could end up with two records differing only in the
    case of the role.

    Args
        use_cache (bool): Passed to `get_hpmr`.

    Returns
        (dict): Sets of `HPMRAnnotation` tuples (role, mainclass,
            subclass, subsubclass); keys are the receptors and partners
            of the interactions and the keys of the "families" table.
    """

    HPMRAnnotation = collections.namedtuple(
        'HPMRAnnotation',
        ('role', 'mainclass', 'subclass', 'subsubclass'),
    )

    # classification of entities which are not in the families table
    no_family = (None, None, None)

    hpmr_data = get_hpmr(use_cache = use_cache)
    families = hpmr_data['families']
    annot = collections.defaultdict(set)

    for i in hpmr_data['interactions']:

        # positional access: receptor, partner role, partner
        receptor, partner_role, partner = i[0], i[1], i[2]

        # first partner is always a receptor
        # (because ligand pages simply don't work on HPMR webpage)
        annot[receptor].add(
            HPMRAnnotation(
                'Receptor',
                *families.get(receptor, no_family)
            )
        )

        # the second is either a ligand or another receptor
        annot[partner].add(
            HPMRAnnotation(
                partner_role.capitalize(),
                *families.get(partner, no_family)
            )
        )

    for uniprot, classes in iteritems(families):

        annot[uniprot].add(HPMRAnnotation('Receptor', *classes))

    return dict(annot)