# Source code for pypath.inputs.lambert2018

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import re
import collections

import pypath.inputs.common as inputs_common
import pypath.resources.urls as urls
import pypath.utils.mapping as mapping
import pypath.inputs.cell as cell_input
import pypath.share.common as common



# [docs]
def lambert2018_s1_raw():
    """
    Read the Lambert 2018 supplementary table ``s1`` as raw records.

    The file is obtained with ``cell_input.cell_supplementary`` from the
    ``s1`` and ``article`` URLs stored under ``urls.urls['lambert2018']``,
    then sheet ``1`` is read with ``inputs_common.read_xls``. The first two
    rows are consumed as headers; field names are derived from the second
    one.

    Returns:
        list: One ``Lambert2018Raw`` named tuple for each remaining row.
            Field names are the header labels with the characters
            ``-``, space, ``?``, ``;`` and ``:`` replaced by underscores,
            lowercased, and stripped of leading/trailing underscores and
            spaces; headers that end up empty are omitted. Values that
            are empty or ``#N/A`` are replaced by ``None``.
    """

    def process_field(f):
        # The helpers presumably turn float- and bool-like strings into
        # the corresponding Python values and leave the rest unchanged
        # -- TODO confirm against `pypath.share.common`.
        f = common.try_bool(common.try_float(f.strip()))

        # Missing value markers are normalized to None.
        return None if f in {'', '#N/A'} else f


    path = cell_input.cell_supplementary(
        supp_url = urls.urls['lambert2018']['s1'],
        article_url = urls.urls['lambert2018']['article'],
    )

    content = inputs_common.read_xls(path, sheet = 1)

    # Two header rows: the 4th label of the second row is overwritten by
    # the one from the first row (presumably the only place where that
    # column is labelled).
    h0, h1 = content.pop(0), content.pop(0)
    h1[3] = h0[3]

    record = collections.namedtuple(
        'Lambert2018Raw',
        [
            nn for nn in (
                re.sub('[- ?;:]', '_', n).lower().strip('_ ')
                for n in h1
            )
            if nn
        ]
    )

    nfields = len(record._fields)

    # Each row is truncated to the number of named fields.
    return [
        record(*(process_field(f) for f in r[:nfields]))
        for r in content
    ]




# [docs]
def lambert2018_annotations():
    """
    Build annotation records from the raw Lambert 2018 ``s1`` table.

    Each record of ``lambert2018_s1_raw`` is translated by
    ``mapping.map_name`` from ``genesymbol`` to ``uniprot`` identifiers,
    and one ``Lambert2018Annotation`` is stored for every identifier
    returned.

    Returns:
        dict: Keys are the identifiers returned by ``mapping.map_name``,
            values are sets of ``Lambert2018Annotation`` named tuples.
    """

    Lambert2018Annotation = collections.namedtuple(
        'Lambert2018Annotation',
        (
            'ensg',
            'genesymbol',
            'is_tf',
            'tf_assessment',
            'binding_mode',
            'binding_domain',
            'tf_disagree',
            'binding_disagree',
            'binding1',
            'binding2',
            'assessment1',
            'assessment2',
            'vaquerizas2009',
            'cisbp',
            'tfclass',
            'tfcat_annot',
            'tfcat_pmids',
            'go',
            'pdb',
        )
    )

    result = collections.defaultdict(set)

    for r in lambert2018_s1_raw():

        uniprots = mapping.map_name(r.name, 'genesymbol', 'uniprot')
        # Falsy values in this column are recorded as 'no'.
        vaquerizas = r.vaquerizas_2009_tf_classification or 'no'
        tfcat = r.tf_cat_classification

        if tfcat:

            # Drop the `PMIDS:...` part, unify the spelling of "TF", then
            # split the remainder at underscores and semicolons.
            tfcat_labels = re.split(
                '[_;]',
                re.sub(r'PMIDS:[\d;]+', '', tfcat).
                replace('tf', 'TF').
                replace('Transcription Factor', 'TF')
            )

            tfcat_annot = tuple(common.del_empty(sorted(
                a.strip() for a in tfcat_labels
            )))

            # First captured group, i.e. the string following `PMIDS:`.
            tfcat_pmids = common.re_safe_groups(
                r'PMIDS:([\d;]+)',
                tfcat.strip()
            )[0]

        else:

            tfcat_annot = ()
            tfcat_pmids = None

        # A lone '0' in place of the PMIDs is treated as no reference.
        tfcat_pmids = None if tfcat_pmids == '0' else tfcat_pmids

        for uniprot in uniprots:

            result[uniprot].add(
                Lambert2018Annotation(
                    ensg = r.id,
                    genesymbol = r.name,
                    is_tf = r.is_tf,
                    tf_assessment = r.tf_assessment,
                    binding_mode = r.binding_mode,
                    binding_domain = r.dbd,
                    tf_disagree = r.disagree_on_assessment == 'Disagree',
                    binding_disagree = r.disagree_on_binding == 'Disagree',
                    binding1 = r.binding1,
                    binding2 = r.binding2,
                    # `assesment` (sic): the attribute names are derived
                    # from the headers of the source table.
                    assessment1 = r.assesment1,
                    assessment2 = r.assesment2,
                    vaquerizas2009 = vaquerizas,
                    cisbp = r.cisbp_considers_it_as_a_tf,
                    tfclass = r.tfclass_considers_it_as_a_tf,
                    tfcat_annot = tfcat_annot,
                    tfcat_pmids = tfcat_pmids,
                    go = r.is_a_go_tf,
                    pdb = r.pdb,
                )
            )

    return dict(result)