Source code for pypath.core.intercell

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import re
import importlib as imp
import collections
import itertools

import numpy as np
import pandas as pd

import pypath.share.settings as settings
import pypath.share.common as common
import pypath.share.session as session
import pypath.core.annot as annot
import pypath.core.intercell_annot as intercell_annot
import pypath.core.network as network_mod
import pypath.internals.annot_formats as af



[docs]
class IntercellAnnotation(annot.CustomAnnotation):



[docs]
    def __init__(
            self,
            class_definitions = None,
            excludes = None,
            excludes_extra = None,
            cellphonedb_categories = None,
            baccin_categories = None,
            hpmr_categories = None,
            surfaceome_categories = None,
            gpcrdb_categories = None,
            icellnet_categories = None,
            build = True,
            composite_resource_name = None,
            **kwargs
        ):
        """
        Builds a database about roles of proteins and complexes in
        intercellular communication. The built-in category definitions
        defining the default contents of this database can be found in the
        ``pypath.core.intercell_annot`` module.

        :param tuple class_definitions:
            A series of annotation class definitions, each represented by
            an instance of ``pypath.internals.annot_formats.AnnotDef``.
            These definitions carry the attributes and instructions to
            populate the classes. If not provided (or empty),
            ``intercell_annot.annot_combined_classes`` is used.
        :param dict excludes:
            A dict with parent category names (strings) or category keys
            (tuples) as keys and sets of identifiers as values.
            The identifiers in this dict will be excluded from all the
            respective categories while building the database. E.g. if
            the UniProt ID `P00533` (EGFR) is in the set under the key of
            `adhesion` it will be excluded from the category `adhesion` and
            all its direct children. If not provided (or empty),
            ``intercell_annot.excludes`` is used.
        :param dict excludes_extra:
            Same kind of dict as `excludes` but it will be added to the
            built-in default. The built in and the provided extra sets
            will be merged. If you want to overwrite or modify the built-in
            sets provide your custom dict as `excludes`.
        :param cellphonedb_categories:
            Switch for the resource specific categories of CellPhoneDB.
            Evaluated for truthiness by ``add_cellphonedb_categories``;
            if ``None``, the value of the
            ``intercell_cellphonedb_categories`` setting is used. The
            parameters ``baccin_categories``, ``hpmr_categories``,
            ``surfaceome_categories``, ``gpcrdb_categories`` and
            ``icellnet_categories`` work the same way for their own
            resource and setting.
        :param bool build:
            Execute the build upon instantiation or set up an empty object
            the build can be executed on later.
        :param composite_resource_name:
            Passed on unchanged to ``annot.CustomAnnotation.__init__``.
        :param kwargs:
            Passed on unchanged to ``annot.CustomAnnotation.__init__``.
        """

        if not hasattr(self, '_log_name'):

            session.Logger.__init__(self, name = 'intercell')

        if not class_definitions:

            class_definitions = intercell_annot.annot_combined_classes

        if not excludes:

            excludes = intercell_annot.excludes

        # resource name -> value received as argument
        requested = (
            ('baccin', baccin_categories),
            ('cellphonedb', cellphonedb_categories),
            ('hpmr', hpmr_categories),
            ('surfaceome', surfaceome_categories),
            ('gpcrdb', gpcrdb_categories),
            ('icellnet', icellnet_categories),
        )

        self._resource_categories = {}

        for res, value in requested:

            if value is None:

                # not given explicitly: fall back to the settings
                value = settings.get('intercell_%s_categories' % res)

            self._resource_categories[res] = value

        annot.CustomAnnotation.__init__(
            self,
            class_definitions = class_definitions,
            excludes = excludes,
            excludes_extra = excludes_extra,
            build = build,
            composite_resource_name = composite_resource_name,
            **kwargs
        )




[docs]
    def reload(self):
        """
        Reloads the object from the module level.

        Reloads the ``af`` and ``annot`` modules and the module of this
        object's class, then assigns the reloaded class to this instance.
        The classes of the keys and values of ``self.classes`` are also
        replaced by the classes of the same name from the reloaded ``af``
        module.
        """

        # the modules this one builds on are reloaded first
        for dependency in (af, annot):

            imp.reload(dependency)

        modname = self.__class__.__module__
        mod = __import__(modname, fromlist = [modname.split('.')[0]])
        imp.reload(mod)

        self.__class__ = getattr(mod, self.__class__.__name__)

        for key, group in iteritems(self.classes):

            key.__class__ = getattr(af, key.__class__.__name__)
            group.__class__ = getattr(af, group.__class__.__name__)



    def set_classes(self):
        """
        Collects the class names and types and assigns a main class to
        each class in ``self.classes``.

        Sets the attributes ``class_names`` (all names listed in
        ``intercell_annot.class_types``), ``class_types`` (class name to
        class type) and ``main_classes`` (class to main class name, or
        ``None`` if no main class matches). The main class is the longest
        underscore separated prefix of the class name, the complete name
        included, which is among ``class_names``.
        """

        self.class_names = set(itertools.chain(
            *intercell_annot.class_types.values()
        ))
        self.class_types = dict(
            (cls, typ)
            for typ, ccls in intercell_annot.class_types.items()
            for cls in ccls
        )

        self.main_classes = {}

        for cls in set(self.classes.keys()):

            mainclass = None

            # assumes the keys of `self.classes` are strings -- TODO confirm
            cls_split = cls.split('_')

            # `+ 1` makes the last prefix the complete name, so a main
            # class is its own main class, the same way as in
            # `collect_classes`
            for j in range(len(cls_split) + 1):

                this_part = '_'.join(cls_split[:j])

                if this_part in self.class_names:

                    # no break here: a later, longer prefix takes precedence
                    mainclass = this_part

            self.main_classes[cls] = mainclass


    def add_classes_to_df(self):

        if not hasattr(self, 'df'):

            return

        self.df['mainclass'] = (
            np.array([self.main_classes[c] for c in self.df.category])
        )
        self.df['mainclass'] = self.df['mainclass'].astype('category')
        self.df['class_type'] = (
            np.array([
                (
                    self.class_types[c]
                        if c in self.class_types else
                    'sub'
                )
                for c in self.df.category
            ])
        )
        self.df['class_type'] = self.df['class_type'].astype('category')


    def collect_classes(self):
        """
        Builds the lookup tables describing the hierarchy and the labels
        of the classes in ``self.classes``.

        Sets the attributes ``class_names``, ``class_types``, ``children``
        (main class to set of classes), ``parents`` (class to main class),
        ``class_labels`` and ``resource_labels``. The classes listed under
        `misc` in ``intercell_annot.class_types`` get ``None`` as parent.
        For all others the main class is the longest underscore separated
        prefix of the class name (the complete name included) which is
        among ``class_names``, and the last underscore separated part is
        taken as the resource.
        """

        type_map = intercell_annot.class_types

        self.class_names = set(itertools.chain(*type_map.values()))
        self.class_types = {
            cls: typ
            for typ, members in type_map.items()
            for cls in members
        }

        self.children = collections.defaultdict(set)
        self.parents = {}
        self.class_labels = {}
        self.resource_labels = {}

        for cls in self.classes.keys():

            mainclass = None

            if cls in intercell_annot.class_types['misc']:

                self.parents[cls] = None

            else:

                parts = cls.split('_')

                for end in range(len(parts) + 1):

                    prefix = '_'.join(parts[:end])

                    if prefix in self.class_names:

                        # keep going, the longest matching prefix wins
                        mainclass = prefix

                self.children[mainclass].add(cls)
                self.parents[cls] = mainclass

                resource = parts[-1]

                # a resource label is due only if the last part of the
                # name is not already contained in the main class name
                if mainclass is not None and resource not in mainclass:

                    self.resource_labels[cls] = (
                        intercell_annot.get_resource_label(resource)
                    )

            self.class_labels[cls] = (
                intercell_annot.get_class_label(mainclass or cls)
            )



[docs]
    def make_df(self):
        """
        Creates the data frame by the ``make_df`` method of
        ``annot.CustomAnnotation``, then adds the causality and the
        location columns to it.
        """

        annot.CustomAnnotation.make_df(self)

        self.df_add_causality()
        self.df_add_locations()



    def load_from_pickle(self, pickle_file):
        """
        Loads the object from a pickle file by delegating to
        ``annot.CustomAnnotation.load_from_pickle``.

        :param pickle_file:
            Passed on unchanged as the ``pickle_file`` argument of the
            parent method.
        """

        annot.CustomAnnotation.load_from_pickle(
            self,
            pickle_file = pickle_file,
        )


    def df_add_causality(self):

        if not hasattr(self, 'df'):

            self.make_df()
            return

        for causality in ('transmitter', 'receiver'):

            self.df[causality] = [
                bool(getattr(self.classes[key], causality))
                for key in zip(
                    self.df.category,
                    self.df.parent,
                    self.df.database,
                )
            ]


    def df_add_locations(self, locations = None):
        """
        Adds one boolean column for each location to the data frame.

        The value is ``True`` where the `uniprot` column of the record is
        in the result of ``self.select(location)``. If the object has no
        ``df`` attribute, ``make_df`` is called instead and nothing else
        is done here.

        :param locations:
            Names of the locations, each used both as the argument of
            ``self.select`` and as the name of the new column. If not
            provided (or empty), `secreted`,
            `plasma_membrane_transmembrane` and
            `plasma_membrane_peripheral` are used.
        """

        if not hasattr(self, 'df'):

            self.make_df()
            return

        self._log('Adding location columns to data frame.')

        if not locations:

            locations = (
                'secreted',
                'plasma_membrane_transmembrane',
                'plasma_membrane_peripheral',
            )

        # all selections are done before the first column is added
        location_classes = {}

        for location in locations:

            location_classes[location] = self.select(location)

        for location, entities in iteritems(location_classes):

            in_location = []

            for uniprot in self.df.uniprot:

                in_location.append(uniprot in entities)

            self.df[location] = in_location


    def pre_build(self):
        """
        Runs ``pre_build`` of ``annot.CustomAnnotation``, then adds the
        resource specific extra categories to the class definitions.
        """

        annot.CustomAnnotation.pre_build(self)
        self.add_extra_categories()


    def add_extra_categories(self):

        self.add_cellphonedb_categories()
        self.add_baccin_categories()
        self.add_hpmr_categories()
        self.add_surfaceome_categories()
        self.add_gpcrdb_categories()
        self.add_icellnet_categories()


    def add_cellphonedb_categories(self):
        """
        Adds resource specific categories derived from the `CellPhoneDB`
        annotations to the class definitions.

        Nothing happens unless ``self._resource_categories['cellphonedb']``
        is true. One definition is created for each value of the
        `receptor_class` field (parent: `receptor`) and of the
        `secreted_class` field (parent: `ligand`), except for the values
        `secreted` and `receptor`. The definitions are appended to
        ``self._class_definitions_provided``.
        """

        if not self._resource_categories['cellphonedb']:

            return

        self.ensure_annotdb()

        cpdb = self.annotdb.annots['CellPhoneDB']
        cellphonedb_categories = []

        for mainclass in ('receptor', 'secreted'):

            attr = '%s_class' % mainclass
            parent = 'receptor' if mainclass == 'receptor' else 'ligand'

            for category in cpdb.get_values(attr):

                # these are the main classes themselves, not subclasses
                if category in {'secreted', 'receptor'}:

                    continue

                definition = af.AnnotDef(
                    name = category,
                    parent = parent,
                    resource = 'CellPhoneDB',
                    args = {
                        mainclass: bool,
                        attr: category,
                    },
                )
                cellphonedb_categories.append(definition)

        self._class_definitions_provided += tuple(cellphonedb_categories)


    def add_baccin_categories(self):
        """
        Adds resource specific categories derived from the `Baccin2019`
        annotations to the class definitions.

        Nothing happens unless ``self._resource_categories['baccin']`` is
        true. Definitions are created for each value of the `subclass`
        field except `other` and ``None``. Subclasses with `receptor` in
        their name get one definition with the parent `receptor`; the
        others get up to two, with the parents `cell_surface_ligand` and
        `ligand`, the selection being restricted by the `location` field.
        Definitions which select no record are omitted. The definitions
        are appended to ``self._class_definitions_provided``.
        """

        if not self._resource_categories['baccin']:

            return

        self.ensure_annotdb()
        baccin_categories = []
        baccin = self.annotdb.annots['Baccin2019']
        # NOTE(review): this mapping is not consulted anywhere below, its
        # keys are used directly as the value of the `location` field;
        # presumably `locations[location]` was intended -- confirm against
        # the values of the Baccin2019 `location` field before changing.
        locations = {
            'surface': {'membrane', 'both'},
            'secreted': {'secreted', 'both', 'ecm'},
        }

        subclasses = baccin.get_values('subclass') - {'other', None}

        for subclass in subclasses:

            receptor = 'receptor' in subclass
            name = subclass.replace('_receptor', '')

            for location in ('surface', 'secreted'):

                if receptor and location == 'secreted':

                    continue

                # a new dict in each iteration: the definitions keep a
                # reference to it, hence sharing one dict would let the
                # `secreted` iteration overwrite the location in the
                # definition created in the `surface` iteration
                args = {'subclass': subclass}

                if not receptor:

                    args['location'] = location

                members = baccin.select(**args)

                if not members:

                    continue

                if receptor:

                    parent = 'receptor'

                elif location == 'surface':

                    parent = 'cell_surface_ligand'

                else:

                    parent = 'ligand'

                baccin_categories.append(
                    af.AnnotDef(
                        name = name,
                        parent = parent,
                        resource = 'Baccin2019',
                        args = args,
                    )
                )

        self._class_definitions_provided += tuple(baccin_categories)


    def add_hpmr_categories(self):
        """
        Adds resource specific categories derived from the `HPMR`
        annotations to the class definitions.

        Nothing happens unless ``self._resource_categories['hpmr']`` is
        true. Each distinct combination of the first 2, 3, ... fields of
        the records is considered. The first field, in lowercase, is the
        parent; the further fields are the selection criteria. The name
        is built from these further values, omitting the first of them if
        there are more than one, in reverse order: runs of separator
        characters are replaced by underscores, the value is lowercased
        and `_receptors` is removed. Combinations with empty first or last
        value, or selecting no record, are skipped. The definitions are
        appended to ``self._class_definitions_provided``.
        """

        resep = re.compile(r'[- /\(\),]+')

        hpmr_categories = []

        if not self._resource_categories['hpmr']:

            return

        self.ensure_annotdb()

        hpmr = self.annotdb['HPMR']
        fields = hpmr.get_names()

        for depth in range(2, len(fields) + 1):

            combinations = set()

            for entity, records in iteritems(hpmr.annot):

                for record in records:

                    combinations.add(record[:depth])

            for values in combinations:

                if not values[0]:

                    continue

                this_values = values[1:depth]

                if not this_values[-1]:

                    continue

                args = dict(zip(fields[1:depth], this_values))

                members = hpmr.select(**args)
                parent = values[0].lower()

                if not members:

                    continue

                if len(this_values) > 1:

                    name_parts = this_values[1:]

                else:

                    name_parts = this_values

                labels = []

                for val in reversed(name_parts):

                    label = resep.sub('_', val).lower() if val else None

                    if label:

                        labels.append(
                            label.strip('_').replace('_receptors', '')
                        )

                hpmr_categories.append(
                    af.AnnotDef(
                        name = '_'.join(labels),
                        resource = 'HPMR',
                        args = args,
                        parent = parent,
                    )
                )

        self._class_definitions_provided += tuple(hpmr_categories)


    def add_gpcrdb_categories(self):
        """
        Adds resource specific categories derived from the `GPCRdb`
        annotations to the class definitions.

        Nothing happens unless ``self._resource_categories['gpcrdb']`` is
        true. Each distinct combination of the first 1, 2, ... fields of
        the records becomes one definition under the parent `receptor`,
        unless its first value is empty or it selects no record. The name
        is the underscore joined series of the values, each with runs of
        separator characters replaced by underscores and lowercased;
        `_receptors` is removed from it. The definitions are appended to
        ``self._class_definitions_provided``.
        """

        resep = re.compile(r'[- /\(\),]+')

        gpcrdb_categories = []

        if not self._resource_categories['gpcrdb']:

            return

        self.ensure_annotdb()

        gpcrdb = self.annotdb['GPCRdb']
        fields = gpcrdb.get_names()

        for depth in range(1, len(fields) + 1):

            combinations = set()

            for entity, records in iteritems(gpcrdb.annot):

                for record in records:

                    combinations.add(record[:depth])

            for values in combinations:

                if not values[0]:

                    continue

                this_values = values[:depth]
                args = dict(zip(fields[:depth], this_values))

                if not gpcrdb.select(**args):

                    continue

                labels = [
                    resep.sub('_', val).lower().strip('_')
                    for val in this_values
                ]
                name = '_'.join(labels).replace('_receptors', '')

                gpcrdb_categories.append(
                    af.AnnotDef(
                        name = name,
                        resource = 'GPCRdb',
                        args = args,
                        parent = 'receptor',
                    )
                )

        self._class_definitions_provided += tuple(gpcrdb_categories)


    def add_surfaceome_categories(self):
        """
        Adds resource specific categories derived from the `Surfaceome`
        annotations to the class definitions.

        Nothing happens unless ``self._resource_categories['surfaceome']``
        is true. For the main classes `Receptors`, `Transporters` and
        `Enzymes` each subclass is turned into one definition, except the
        ones which are ``None``, start with a digit or start with `Other`.
        The parent is `receptor`, `transporter` or `surface_enzyme`,
        respectively, or `ion_channel` if the name contains `ion_channel`.
        The definitions are appended to
        ``self._class_definitions_provided``.
        """

        resep = re.compile(r'[- /\(\),\.]+')
        recls = re.compile(r'_(?:transporters|receptors|ion_channels)')

        mainclasses = {
            'Receptors': 'receptor',
            'Transporters': 'transporter',
            'Enzymes': 'surface_enzyme',
        }

        if not self._resource_categories['surfaceome']:

            return

        self.ensure_annotdb()
        surfaceome = self.annotdb['Surfaceome']
        surfaceome_categories = []

        for mainclass, parent in mainclasses.items():

            subclasses = set()

            for records in surfaceome.annot.values():

                for record in records:

                    for sc in (record.subclasses or ()):

                        if (
                            record.mainclass == mainclass and
                            sc is not None and
                            not sc[0].isdigit()
                        ):

                            subclasses.add(sc)

            for subclass in subclasses:

                if subclass.startswith('Other'):

                    continue

                # subclass and main class together; the main class part
                # is needed below only for deciding about the parent
                full_name = '%s_%s' % (
                    resep.sub('_', subclass).lower().strip('_'),
                    mainclass.lower(),
                )

                if 'ion_channel' in full_name:

                    _parent = 'ion_channel'

                else:

                    _parent = parent

                surfaceome_categories.append(
                    af.AnnotDef(
                        name = recls.sub('', full_name),
                        resource = 'Surfaceome',
                        args = {
                            'mainclass': mainclass,
                            'subclasses': subclass,
                        },
                        parent = _parent,
                    )
                )

        self._class_definitions_provided += tuple(surfaceome_categories)


    def add_icellnet_categories(self):
        """
        Adds resource specific categories derived from the `ICELLNET`
        annotations to the class definitions.

        Nothing happens unless ``self._resource_categories['icellnet']``
        is true. For each distinct combination of the first three fields
        of the records definitions are created from the first two and
        from all three values, unless the last of these values is ``None``
        or the values select no record. The first value is the parent;
        the name is built from the other values by lowercasing, removing
        the dots and replacing the spaces by underscores. The definitions
        are appended to ``self._class_definitions_provided``.
        """

        icellnet_categories = []

        if not self._resource_categories['icellnet']:

            return

        self.ensure_annotdb()

        icellnet = self.annotdb['ICELLNET']
        names = icellnet.get_names()[:3]

        combinations = set()

        for records in icellnet.annot.values():

            for record in records:

                combinations.add(record[:3])

        for values in combinations:

            for depth in (2, 3):

                _values = values[:depth]

                if _values[-1] is None:

                    continue

                args = dict(zip(names[:depth], _values))

                if not icellnet.select(**args):

                    continue

                labels = [
                    val.lower().replace('.', '').replace(' ', '_')
                    for val in _values[1:]
                    if val is not None
                ]

                icellnet_categories.append(
                    af.AnnotDef(
                        name = '_'.join(labels),
                        resource = 'ICELLNET',
                        args = args,
                        parent = values[0],
                    )
                )

        self._class_definitions_provided += tuple(icellnet_categories)


    def post_load(self):
        """
        Creates the data frame by calling ``make_df``.
        """

        self.make_df()


    def __repr__(self):

        return (
            '<Intercell annotations: %s records about %s entities>' % (
                self.numof_records(),
                self.numof_entities(),
            )
        )


    @classmethod
    def filter_df(
            cls,
            annot_df,
            category = None,
            name = None,
            parent = None,
            database = None,
            scope = None,
            aspect = None,
            source = None,
            entities = None,
            entity_type = None,
            causality = None,
            topology = None,
            postfix = None,
        ):
        """
        Filters an annotation data frame.

        A query expression is assembled from the arguments and evaluated
        by the ``query`` method of the data frame.

        :param annot_df:
            The data frame to be filtered.
        :param category:
            Handed over to ``cls._process_query_args`` along with
            ``parent``, ``database``, ``scope``, ``aspect``, ``source``
            and ``entity_type``; presumably each is matched against the
            column of the same name -- confirm in that method.
        :param name:
            Used as ``category`` if that is not provided.
        :param entities:
            Handed over to ``cls._process_query_args``.
        :param causality:
            One or more terms for the query, combined by `or`.
        :param topology:
            One or more terms for the query, combined by `or`. The short
            names `pmtm`, `pmp` and `sec` are translated to
            `plasma_membrane_transmembrane`, `plasma_membrane_peripheral`
            and `secreted`.
        :param postfix:
            Appended to each of the causality and topology terms; also
            handed over to ``cls._process_query_args``.

        :return:
            The result of ``annot_df.query``, or ``annot_df`` itself if
            the arguments resulted no query expression.
        """

        category = category or name
        # a dict of all the arguments at this point, including `cls` and
        # `annot_df`; the statements below must not be moved above this
        args = locals()

        _topologies = {
            'pmtm': 'plasma_membrane_transmembrane',
            'pmp': 'plasma_membrane_peripheral',
            'sec': 'secreted',
        }

        # these three are processed separately from the rest of `args`
        entities = args.pop('entities')
        causality = args.pop('causality') or ()
        topology = args.pop('topology') or ()

        topology = [
            _topologies[top] if top in _topologies else top
            for top in common.to_set(topology)
        ]

        query = cls._process_query_args(
            df = annot_df,
            entities = entities,
            args = args,
            postfix = postfix,
        )

        if causality:

            query.append(cls._process_boolean_group_args(causality, postfix))

        if topology:

            query.append(cls._process_boolean_group_args(topology, postfix))

        # NOTE(review): the value assigned here is not used below
        args = cls._args_add_postfix(args, postfix)

        query = ' and '.join(query)

        return annot_df.query(query) if query else annot_df


    @staticmethod
    def _process_boolean_group_args(values, postfix):
        """
        Creates a query expression which is true if any of the terms
        is true.

        :param values:
            One or more terms, converted to a set by ``common.to_set``.
        :param postfix:
            If not empty, appended to each term.

        :return:
            The terms joined by `or`, enclosed in parentheses; an empty
            string if there are no terms.
        """

        if postfix:

            values = {
                '%s%s' % (val, postfix)
                for val in common.to_list(values)
            }

        expression = ' or '.join(common.to_set(values))

        # `and` binds tighter than `or`: without the parentheses, joining
        # this group to other conditions by `and` (as `filter_df` does)
        # would attach only one term of the group to those conditions
        return '(%s)' % expression if expression else expression



[docs]
    def network_df(
            self,
            annot_df = None,
            network = None,
            combined_df = None,
            network_args = None,
            annot_args = None,
            annot_args_source = None,
            annot_args_target = None,
            entities = None,
            only_directed = False,
            only_undirected = False,
            undirected_orientation = None,
            only_signed = None,
            only_effect = None,
            only_proteins = False,
            swap_undirected = True,
            entities_or = False,
            transmitter_receiver = False,
            only_generic = True,
            only_composite = True,
            only_functional = True,
            exclude_intracellular = True,
        ):
        """
        Combines the annotation data frame and a network data frame.
        Creates a ``pandas.DataFrame`` where each record is an interaction
        between a pair of molecular entities labeled by their annotations.

        The arguments are preprocessed here, the combination itself is
        done by ``annot.CustomAnnotation.network_df``. The dicts provided
        as ``annot_args``, ``annot_args_source`` and ``annot_args_target``
        are copied, the originals are not modified.

        annot_df : pandas.DataFrame,None
            The annotation data frame; by default the one returned by
            ``get_df`` is used.
        network : pypath.network.Network,pandas.DataFrame
            A ``pypath.network.Network`` object or a data frame with network
            data.
        combined_df : pandas.DataFrame
            Optional, a network data frame already combined with annotations
            for filtering only.
        network_args : dict,None
            Passed on unchanged to the parent method.
        annot_args : dict,None
            Parameters for filtering annotation classes; note, the defaults
            might include some filtering, provide an empty dict if you want
            no filtering at all; however this might result in huge data
            frame and consequently memory issues. Passed to the ``filtered``
            method. The keys `scope`, `source` and `aspect` are overwritten
            if ``only_generic``, ``only_composite`` and ``only_functional``
            are true, respectively.
        annot_args_source : dict,None
            Same as ``annot_args`` but only for the source side of the
            network connections.
        annot_args_target : dict,None
            Same as ``annot_args`` but only for the target side of the
            network connections.
        entities : set,None
            Limit the network only to these molecular entities.
        only_directed : bool
            Use only the directed interactions.
        only_undirected : bool
            Use only the undirected interactions. Specifically for retrieving
            and counting the interactions without direction information.
        undirected_orientation : str,None
            Ignore the direction at all interactions and make sure all of
            them have a uniform orientation. If `id`, all interactions will
            be oriented by the identifiers of the partners; if `category`,
            the interactions will be oriented by the categories of the
            partners.
        only_signed : bool
            Use only the interactions with effect sign.
        only_effect : int,None
            Use only the interactions with this effect. Either -1 or 1.
        only_proteins : bool
            Use only the interactions where each of the partners is a protein
            (i.e. not complex, miRNA, small molecule or other kind of entity).
        swap_undirected : bool
            Passed on unchanged to the parent method.
        entities_or : bool
            Passed on unchanged to the parent method.
        transmitter_receiver : bool
            On the source side only transmitters, on the target side only
            receivers.
        only_generic : bool
            Use only the generic classes. If specific classes allowed the
            size of the combined data frame might be huge.
        only_composite : bool
            Use only the composite classes. If resource_specific classes
            allowed the size of the combined data frame might be huge.
        only_functional : bool
            Use only the functional classes. Locational classes are often
            not relevant and they largely increase the size of the
            combined data frame.
        exclude_intracellular : bool
            Remove the intracellular parent class and its children. These
            classes are not relevant in intercellular signaling and having
            them largely increases the size of the combined data frame.
        """

        # an explicit test for None: the truth value of a data frame is
        # ambiguous, `annot_df or ...` raises ValueError if one is provided
        if annot_df is None:

            annot_df = self.get_df()

        if exclude_intracellular:

            if combined_df is None:

                annot_df = annot_df[annot_df.parent != 'intracellular']

            else:

                combined_df = combined_df.query(
                    'parent_a != "intracellular" and '
                    'parent_b != "intracellular"'
                )

        # copies, because keys are set below and the dicts of the caller
        # should remain intact
        annot_args = dict(annot_args or {})
        annot_args_source = dict(annot_args_source or {})
        annot_args_target = dict(annot_args_target or {})

        if only_generic:

            annot_args['scope'] = 'generic'

        if only_composite:

            annot_args['source'] = 'composite'

        if only_functional:

            annot_args['aspect'] = 'functional'

        if transmitter_receiver:

            annot_args_source['causality'] = 'transmitter'
            annot_args_target['causality'] = 'receiver'

        return annot.CustomAnnotation.network_df(
            self,
            annot_df = annot_df,
            network = network,
            combined_df = combined_df,
            network_args = network_args,
            annot_args = annot_args,
            annot_args_source = annot_args_source,
            annot_args_target = annot_args_target,
            entities = entities,
            only_directed = only_directed,
            only_undirected = only_undirected,
            only_signed = only_signed,
            only_effect = only_effect,
            only_proteins = only_proteins,
            swap_undirected = swap_undirected,
            entities_or = entities_or,
            undirected_orientation = undirected_orientation,
        )



    # `filter_interclass_network` became a synonym of `network_df`:
    # both names refer to the same function
    filter_interclass_network = network_df


    def update_summaries(self):
        """
        Rebuilds ``self.summaries``, a dict with one record of summary
        data for each class in ``self.classes``, except the ones with
        `resource_specific` as source. A record for the whole database is
        added under the key ``('Total', 'Total', 'OmniPath')``.
        """

        summaries = {}
        self.summaries = summaries

        for key, group in iteritems(self.classes):

            if group.source != 'resource_specific':

                summaries[key] = dict(
                    name = group.name,
                    aspect = group.aspect,
                    transmitter = group.transmitter,
                    receiver = group.receiver,
                    resources = self.resources_in_category(key),
                    n_proteins = group.n_proteins,
                    n_mirnas = group.n_mirnas,
                    n_complexes = group.n_complexes,
                )

        summaries[('Total', 'Total', 'OmniPath')] = dict(
            name = 'Total',
            aspect = '',
            transmitter = '',
            receiver = '',
            resources = self.all_resources(),
            n_proteins = self.numof_proteins(),
            n_mirnas = self.numof_mirnas(),
            n_complexes = self.numof_complexes(),
        )


    def summaries_tab(self, outfile = None, return_table = False):

        columns = (
            ('name', 'Category'),
            ('aspect', 'Aspect'),
            ('transmitter', 'Transmitter'),
            ('receiver', 'Receiver'),
            ('n_proteins', 'Proteins'),
            ('n_mirnas', 'miRNAs'),
            ('n_complexes', 'Complexes'),
            ('resources', 'Resources'),
        )

        tab = []
        tab.append([f[1] for f in columns])

        tab.extend([
            [
                (
                    ', '.join(self.summaries[key][f[0]])
                        if isinstance(self.summaries[key][f[0]], list) else
                    str(self.summaries[key][f[0]])
                )
                for f in columns
            ]
            for key in sorted(
                self.summaries.keys(),
                key = lambda k: k[0] if k[0] != 'Total' else 'zzzz',
            )
        ])

        if outfile:

            with open(outfile, 'w') as fp:

                fp.write('\n'.join('\t'.join(row) for row in tab))

        if return_table:

            return tab




[docs]
def init_db(**kwargs):
    """
    Creates a new ``IntercellAnnotation`` instance and assigns it to the
    module level variable ``db``, replacing any previous value.

    :param kwargs:
        Passed to ``IntercellAnnotation``.
    """

    global db

    db = IntercellAnnotation(**kwargs)




[docs]
def get_db(**kwargs):
    """
    Returns the module level ``db`` object; if it does not exist yet,
    it is created first by ``init_db``.

    :param kwargs:
        Passed to ``init_db``; not used if ``db`` already exists.
    """

    namespace = globals()

    if 'db' not in namespace:

        init_db(**kwargs)

    return namespace['db']