# Source code for pypath.inputs.compleat

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import csv

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.internals.intera as intera
import pypath.utils.mapping as mapping



# [docs]
def compleat_raw():
    """
    Downloads the Compleat protein complex table and parses it.

    The file is read as tab separated values without a header row; the
    column names are assigned here.

    :return:
        A list with one mapping per row, keyed by the column names
        listed in ``columns`` below.
    """

    # the column labels are supplied to the reader, hence the first line
    # of the file is treated as data
    columns = (
        'compleat_id',
        'member_count',
        'predicted',
        'functions',
        'functions2',
        'nothing',
        'sources',
        'name',
        'method',
        'organisms',
        'pubmeds',
        'members',
    )

    download = curl.Curl(
        urls.urls['compleat']['rescued'],
        large = True,
        silent = False,
    )

    reader = csv.DictReader(
        download.result,
        delimiter = '\t',
        fieldnames = columns,
    )

    return list(reader)




# [docs]
def compleat_complexes(predicted = True):
    """
    Builds protein complex objects from the Compleat records.

    Each record from ``compleat_raw`` is turned into an
    ``intera.Complex``; the whitespace separated member identifiers are
    translated from ``entrez`` to ``uniprot`` by ``mapping.map_name0``.
    Members without a translation are dropped, and records left without
    any member are skipped.

    :param bool predicted:
        If ``False``, records whose ``predicted`` field reads
        ``Predicted`` are left out.

    :return:
        A dict of ``intera.Complex`` objects, keyed by the string
        representation of the complexes. Records resulting in the same
        key are merged by the ``+=`` operator of the complex.
    """

    result = {}

    for record in compleat_raw():

        flag = record['predicted']
        is_predicted = bool(flag) and flag.strip() == 'Predicted'

        # skip the unwanted predicted records and the ones without members
        if (is_predicted and not predicted) or not record['members']:

            continue

        translated = (
            mapping.map_name0(member.strip(), 'entrez', 'uniprot')
            for member in record['members'].split()
        )
        components = [uniprot for uniprot in translated if uniprot]

        if not components:

            continue

        pubmeds = record['pubmeds']

        # the resource itself is always among the sources; the values of
        # the `sources` field are added only for the predicted records
        resources = {'Compleat'}

        if is_predicted:

            resources.update(record['sources'].split(','))

        this_complex = intera.Complex(
            components = components,
            sources = resources,
            references = pubmeds.split(',') if pubmeds else None,
            name = record['name'],
            ids = {'Compleat': record['compleat_id']},
        )

        key = this_complex.__str__()

        if key not in result:

            result[key] = this_complex

        else:

            result[key] += this_complex

    return result