Source code for pypath.inputs.oma

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

from typing import Literal
import collections
import itertools

import pandas as pd

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.common as common
import pypath.inputs.common as inputs_common
import pypath.utils.taxonomy as taxonomy
import pypath.utils.mapping as mapping



[docs]
def oma_orthologs(
        organism_a: str | int = 'human',
        organism_b: str | int = 'mouse',
        id_type: str | None = None,
        rel_type: set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None = None,
        score: float | None = None,
        return_df: bool = False,
    ) -> list[tuple] | pd.DataFrame:
    """
    Retrieves pairwise relations between two genomes from the OMA
    (Orthologous Matrix) database (https://omabrowser.org/oma/home/).

    The records are downloaded page by page. Each record is expanded to
    the Cartesian product of the translated identifiers of its two
    entries, hence one OMA record may yield zero, one or more relations.

    Args:
        organism_a:
            Name or NCBI Taxonomy ID of the first organism.
        organism_b:
            Name or NCBI Taxonomy ID of the second organism.
        id_type:
            OMA by default uses UniProt entry IDs and sometimes other
            identifiers for genes. Set this parameter to control which
            ID type all the identifiers are to be mapped to.
        rel_type:
            Restrict relations to certain types.
        score:
            Lower threshold for similarity metric. Records with a score
            below this value are dropped; `None` (or zero) disables the
            filter.
        return_df:
            If True, returns a data frame instead of a list of tuples.

    Returns:
        A list with tuples of pairwise orthologous relationships or a data
        frame with the same records. In the data frame the fields of the
        two genes are spread into columns suffixed by `_a` and `_b`. If
        no record is found, the list or the data frame is empty.
    """

    OmaGene = collections.namedtuple(
        'OmaGene',
        (
            'id',
            'oma_group',
            'hog',
            'taxon',
            'chr',
            'start',
            'end',
            'strand',
            'main_isoform',
        )
    )

    OmaOrthology = collections.namedtuple(
            'OmaOrthology',
            (
                'a',
                'b',
                'rel_type',
                'dist',
                'score',
            ),
        )

    organism_a = taxonomy.ensure_ncbi_tax_id(organism_a)
    organism_b = taxonomy.ensure_ncbi_tax_id(organism_b)
    rel_type = common.to_set(rel_type)
    url = urls.urls['oma']['url']
    # the page size is used both in the query and in the stop condition,
    # the two must never diverge
    per_page = 1000
    page = 1
    # The records are first collected into a set to get rid of duplicates;
    # it is converted to a list at the end.
    result = set()

    while True:

        page_url = (
            f'{url}{organism_a}/{organism_b}/'
            f'?page={page}&per_page={per_page}'
        )
        c = curl.Curl(page_url, silent = False)

        if not c.result: break

        c.get_headers()
        # presumably the total number of records in the full result set;
        # if the header is missing, we rely on the empty page checks
        # to stop the iteration
        total = float(c.resp_headers_dict.get('x-total-count', 1e8))
        data = inputs_common.json_read(c.result)

        # an empty page means we are beyond the last record
        if not data: break

        for rec in data:

            if (
                (score and rec['score'] < score) or
                (rel_type and rec['rel_type'] not in rel_type)
            ):
                continue

            # one list of genes for each side of the relation: the ID
            # translation might result more than one identifiers
            a, b = (
                [
                    OmaGene(
                        id = id_,
                        oma_group = e['oma_group'],
                        hog = e['oma_hog_id'],
                        taxon = e['species']['taxon_id'],
                        chr = e['chromosome'],
                        start = int(e['locus']['start']),
                        end = int(e['locus']['end']),
                        strand = int(e['locus']['strand']),
                        main_isoform = e['is_main_isoform'],

                    )
                    for id_ in _id_translate(
                        id_ = e['canonicalid'],
                        taxon = e['species']['taxon_id'],
                        id_type = id_type,
                    )
                ]
                for e in (rec[f'entry_{ei}'] for ei in (1, 2))
            )

            result.update(
                {
                    OmaOrthology(
                        a = _a,
                        b = _b,
                        rel_type = rec['rel_type'],
                        dist = float(rec['distance']),
                        score = float(rec['score']),
                    )
                    for _a, _b in itertools.product(a, b)
                }
            )

        # all records are covered by the pages retrieved so far
        if page * per_page >= total: break

        page += 1

    result = list(result)

    if return_df:

        if not result:

            # without records the columns can not be inferred from
            # the data, we have to name them explicitly
            return pd.DataFrame(
                columns = (
                    [f'{field}_a' for field in OmaGene._fields] +
                    [f'{field}_b' for field in OmaGene._fields] +
                    [
                        field
                        for field in OmaOrthology._fields
                        if field not in ('a', 'b')
                    ]
                ),
            )

        result = pd.DataFrame(result)
        result = pd.concat(
            [
                pd.DataFrame(result.a.tolist()).add_suffix('_a', axis = 1),
                pd.DataFrame(result.b.tolist()).add_suffix('_b', axis = 1),
                result.drop(['a', 'b'], axis = 1),
            ],
            axis = 1,
        )

    return result




[docs]
def oma_table(
        organism_a: str | int = 'human',
        organism_b: str | int = 'mouse',
        id_type: str | None = None,
        rel_type: set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None = None,
        score: float | None = None,
        return_df: bool = False,
    ) -> dict[str, set[str]] | pd.DataFrame:
    """
    Translation table of orthologous gene pairs between two organisms from the
    OMA (Orthologous Matrix) database (https://omabrowser.org/oma/home/).

    Args:
        organism_a:
            Name or NCBI Taxonomy ID of the first organism.
        organism_b:
            Name or NCBI Taxonomy ID of the second organism.
        id_type:
            OMA by default uses UniProt entry IDs and sometimes other
            identifiers for genes. Set this parameter to control which
            ID type all the identifiers are to be mapped to.
        rel_type:
            Restrict relations to certain types.
        score:
            Lower threshold for similarity metric.
        return_df:
            If True, returns a data frame instead of a dict of sets.

    Returns:
        A dict with source organism identifiers as keys and sets of target
        organism identifiers as values; or a two column data frame with
        source organism-target organism identifier pairs.
    """

    # the arguments are passed one by one, so local variables of this
    # function can never leak into the call
    full = oma_orthologs(
        organism_a = organism_a,
        organism_b = organism_b,
        id_type = id_type,
        rel_type = rel_type,
        score = score,
        return_df = return_df,
    )

    if return_df:

        # only the identifier columns of the two genes
        return full[['id_a', 'id_b']]

    result = collections.defaultdict(set)

    for o in full:

        result[o.a.id].add(o.b.id)

    return result



def _id_translate(id_: str, taxon: int, id_type: str | None) -> set[str]:

    if not id_type: return {id_}

    s_id_type = (
        'ensg'
            if id_.startswith('ENS') else
        'uniprot-entry'
            if '_' in id_ else
        'uniprot'
    )

    uniprots = mapping.map_name(
        id_,
        s_id_type,
        'uniprot',
        ncbi_tax_id = taxon,
    )

    return mapping.map_names(
        uniprots,
        'uniprot',
        id_type,
        ncbi_tax_id = taxon,
    ) if uniprots else set()