Source code for pypath.inputs.oma

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

from typing import Literal
import collections
import itertools

import pandas as pd

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.common as common
import pypath.inputs.common as inputs_common
import pypath.utils.taxonomy as taxonomy
import pypath.utils.mapping as mapping


def oma_orthologs(
        organism_a: str | int = 'human',
        organism_b: str | int = 'mouse',
        id_type: str | None = None,
        rel_type: set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None = None,
        score: float | None = None,
        return_df: bool = False,
    ) -> list[tuple] | pd.DataFrame:
    """
    Retrieves pairwise relations between two genomes from the OMA
    (Orthologous Matrix) database (https://omabrowser.org/oma/home/).

    The records are downloaded page by page. Paging stops when a request
    yields no content, when a page contains no records, or when as many
    records have been received as the ``x-total-count`` response header
    reports.

    Args:
        organism_a:
            Name or NCBI Taxonomy ID of the first organism.
        organism_b:
            Name or NCBI Taxonomy ID of the second organism.
        id_type:
            OMA by default uses UniProt entry IDs and sometimes other
            identifiers for genes. Set this parameter to control which ID
            type all the identifiers are to be mapped to.
        rel_type:
            Restrict relations to certain types.
        score:
            Lower threshold for similarity metric. A falsy value (``None``
            or ``0``) disables the filter.
        return_df:
            If True, returns a data frame instead of a list of tuples.

    Returns:
        A list with tuples of pairwise orthologous relationships or a data
        frame with the same records. In the data frame the fields of the
        two genes are flattened into columns with ``_a`` and ``_b``
        suffixes; the frame has these columns also when no record matched.
    """

    OmaGene = collections.namedtuple(
        'OmaGene',
        (
            'id',
            'oma_group',
            'hog',
            'taxon',
            'chr',
            'start',
            'end',
            'strand',
            'main_isoform',
        )
    )

    OmaOrthology = collections.namedtuple(
        'OmaOrthology',
        (
            'a',
            'b',
            'rel_type',
            'dist',
            'score',
        ),
    )

    organism_a = taxonomy.ensure_ncbi_tax_id(organism_a)
    organism_b = taxonomy.ensure_ncbi_tax_id(organism_b)
    rel_type = common.to_set(rel_type)
    url = urls.urls['oma']['url']
    page = 1
    n_received = 0
    # Collected in a set so that identical records are stored only once;
    # converted to a list before returning.
    result = set()

    while True:

        page_url = f'{url}{organism_a}/{organism_b}/?page={page}&per_page=1000'
        c = curl.Curl(page_url, silent = False)

        if not c.result:

            break

        c.get_headers()
        # Without the header the total is unknown: keep paging until the
        # server returns nothing.
        total = float(
            c.resp_headers_dict.get('x-total-count', float('inf'))
        )
        data = inputs_common.json_read(c.result)

        if not data:

            break

        page += 1
        # Counting the records actually received (before filtering) makes
        # the stop condition independent of the page size the server
        # really applies.
        n_received += len(data)

        for rec in data:

            if (
                (score and rec['score'] < score) or
                (rel_type and rec['rel_type'] not in rel_type)
            ):

                continue

            # One ID in OMA may translate to several IDs of the requested
            # type, hence each side of the pair is a list of genes.
            a, b = (
                [
                    OmaGene(
                        id = id_,
                        oma_group = e['oma_group'],
                        hog = e['oma_hog_id'],
                        taxon = e['species']['taxon_id'],
                        chr = e['chromosome'],
                        start = int(e['locus']['start']),
                        end = int(e['locus']['end']),
                        strand = int(e['locus']['strand']),
                        main_isoform = e['is_main_isoform'],
                    )
                    for id_ in _id_translate(
                        id_ = e['canonicalid'],
                        taxon = e['species']['taxon_id'],
                        id_type = id_type,
                    )
                ]
                for e in (rec[f'entry_{ei}'] for ei in (1, 2))
            )

            result.update(
                {
                    OmaOrthology(
                        a = _a,
                        b = _b,
                        rel_type = rec['rel_type'],
                        dist = float(rec['distance']),
                        score = float(rec['score']),
                    )
                    for _a, _b in itertools.product(a, b)
                }
            )

        if n_received >= total:

            break

    result = list(result)

    if return_df:

        # Explicit column names keep the layout stable for empty results.
        pairs = pd.DataFrame(result, columns = OmaOrthology._fields)

        result = pd.concat(
            [
                pd.DataFrame(
                    pairs.a.tolist(),
                    columns = OmaGene._fields,
                ).add_suffix('_a'),
                pd.DataFrame(
                    pairs.b.tolist(),
                    columns = OmaGene._fields,
                ).add_suffix('_b'),
                pairs.drop(['a', 'b'], axis = 1),
            ],
            axis = 1,
        )

    return result
def oma_table(
        organism_a: str | int = 'human',
        organism_b: str | int = 'mouse',
        id_type: str | None = None,
        rel_type: set[Literal['1:1', '1:n', 'm:1', 'm:n']] | None = None,
        score: float | None = None,
        return_df: bool = False,
    ) -> dict[str, set[str]] | pd.DataFrame:
    """
    Translation table of orthologous gene pairs between two organisms from
    the OMA (Orthologous Matrix) database
    (https://omabrowser.org/oma/home/).

    All arguments are passed unchanged to ``oma_orthologs``.

    Args:
        organism_a:
            Name or NCBI Taxonomy ID of the first organism.
        organism_b:
            Name or NCBI Taxonomy ID of the second organism.
        id_type:
            OMA by default uses UniProt entry IDs and sometimes other
            identifiers for genes. Set this parameter to control which ID
            type all the identifiers are to be mapped to.
        rel_type:
            Restrict relations to certain types.
        score:
            Lower threshold for similarity metric.
        return_df:
            If True, returns a data frame instead of a dict.

    Returns:
        A dict with source organism identifiers as keys and sets of target
        organism identifiers as values; or a two column data frame
        (``id_a``, ``id_b``) with source organism-target organism
        identifier pairs.
    """

    # Arguments are forwarded by name: unlike `**locals()`, this can not
    # pick up unrelated local variables.
    full = oma_orthologs(
        organism_a = organism_a,
        organism_b = organism_b,
        id_type = id_type,
        rel_type = rel_type,
        score = score,
        return_df = return_df,
    )

    if return_df:

        return full[['id_a', 'id_b']]

    result = collections.defaultdict(set)

    for o in full:

        result[o.a.id].add(o.b.id)

    return result
def _id_translate(id_: str, taxon: int, id_type: str | None) -> set[str]:
    """
    Translates one identifier from OMA to the requested ID type.

    The translation goes through UniProt: the identifier is first mapped
    to UniProt, then from UniProt to ``id_type``. The type of the input
    identifier is guessed from its form: a leading ``ENS`` is taken as
    ``ensg``, an underscore as ``uniprot-entry``, anything else as
    ``uniprot``.

    Args:
        id_:
            The identifier to translate.
        taxon:
            NCBI Taxonomy ID passed to the ID mapping.
        id_type:
            The target ID type. If empty or None, no translation is done.

    Returns:
        A set of identifiers: the input itself if no ``id_type`` is
        given; an empty set if the identifier could not be mapped to
        UniProt; otherwise the outcome of the mapping to ``id_type``.
    """

    if not id_type:

        return {id_}

    if id_.startswith('ENS'):

        source_type = 'ensg'

    elif '_' in id_:

        source_type = 'uniprot-entry'

    else:

        source_type = 'uniprot'

    uniprots = mapping.map_name(
        id_,
        source_type,
        'uniprot',
        ncbi_tax_id = taxon,
    )

    if not uniprots:

        return set()

    return mapping.map_names(
        uniprots,
        'uniprot',
        id_type,
        ncbi_tax_id = taxon,
    )