Source code for pypath.inputs.comppi

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import itertools
import collections

import pypath.share.curl as curl
import pypath.share.session as session
import pypath.share.settings as settings
import pypath.resources.urls as urls
import pypath.utils.mapping as mapping
import pypath.utils.taxonomy as taxonomy

_logger = session.Logger(name = 'comppi_input')
_log = _logger._log


[docs] def comppi_interaction_locations(organism = 9606): """ Downloads and preprocesses protein interaction and cellular compartment association data from the ComPPI database. This data provides scores for occurrence of protein-protein interactions in various compartments. """ ComppiLocation = collections.namedtuple( 'ComppiLocation', [ 'location', 'score', ], ) ComppiInteraction = collections.namedtuple( 'ComppiInteraction', [ 'id_a', 'id_b', 'loc_a', 'loc_b', ], ) def process_locations(loc): return tuple( ComppiLocation(location = llloc[0], score = float(llloc[1])) for llloc in (lloc.split(':') for lloc in loc.split('|')) ) organisms = { 9606: 0, 7227: 1, 6239: 2, 4932: 3, } ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(organism) if ncbi_tax_id not in organisms: raise ValueError( 'Can not recognize organism: `%s`. ' 'Available organisms are human (9606), Drosophila (7227), ' 'C. elegans (6239) and S. cerevisiae (4932).' % str(organism) ) url = urls.urls['comppi']['url'] headers = [settings.get('user_agent')] # obtaining cookie c = curl.Curl(url, cache = False, req_headers = headers) cookie = ';'.join([ h.decode().split(':')[1].split(';')[0].strip() for h in c.resp_headers if h.startswith(b'Set-Cookie') ]) cookie_hdr = ['Cookie: %s' % cookie] _log('Cookie from ComPPI: %s' % cookie) # performing the download post = { 'fDlSet': 'comp', 'fDlSpec': '%u' % ncbi_tax_id, 'fDlMLoc': 'all', 'fDlSubmit': 'Download' } c = curl.Curl( url, req_headers = headers + cookie_hdr, post = post, large = True, silent = False, compr = 'gz', ) _ = next(c.result) for l in c.result: l = l.strip('\r\n').split('\t') organism_a = int(l[7]) organism_b = int(l[15]) if organism and (organism_a != organism or organism_b != organism): continue for uniprot1, uniprot2 in itertools.product( mapping.map_name(l[0], 'uniprot', 'uniprot'), mapping.map_name(l[8], 'uniprot', 'uniprot'), ): yield ComppiInteraction( id_a = uniprot1, id_b = uniprot2, loc_a = process_locations(l[2]), loc_b = process_locations(l[10]), )
[docs] def comppi_locations(organism = 9606, score_threshold = .7): result = collections.defaultdict(set) for iloc in comppi_interaction_locations(organism = organism): for label in ('a', 'b'): for loc in getattr(iloc, 'loc_%s' % label): if loc.location == 'N/A' or loc.score < score_threshold: continue result[getattr(iloc, 'id_%s' % label)].add(loc) return dict(result)