Source code for pypath.inputs.ctdbase

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

import collections

from pypath.share import curl
from pypath.resources.urls import urls


# File names of the CTDbase data files, keyed by data type. The file name
# is interpolated into the `ctdbase` URL template by `_ctdbase_download`.
# Relation tables come first, followed by the vocabulary tables whose keys
# carry the `_vocabulary` suffix.
CTD_URLS = dict(
    chemical_gene='CTD_chem_gene_ixns.tsv.gz',
    chemical_disease='CTD_chemicals_diseases.tsv.gz',
    disease_pathway='CTD_diseases_pathways.tsv.gz',
    chemical_phenotype='CTD_pheno_term_ixns.tsv.gz',
    gene_disease='CTD_genes_diseases.tsv.gz',
    chemical_vocabulary='CTD_chemicals.tsv.gz',
    gene_vocabulary='CTD_genes.tsv.gz',
    disease_vocabulary='CTD_diseases.tsv.gz',
    pathway_vocabulary='CTD_pathways.tsv.gz',
    anatomy_vocabulary='CTD_anatomy.tsv.gz',
    phenotype_vocabulary='CTD_phenotypes.tsv.gz',
)


def _ctdbase_parse_field(value: str):
    """
    Converts one raw, tab separated field into its parsed form.

    Args:
        value: The text of a single field, without line terminator.

    Returns:
        `None` for an empty field; a list of lists if the field contains
        the "^" sub-separator (one inner list per "|" separated item);
        a list of stripped strings if it only contains "|"; otherwise
        the stripped string itself.
    """

    if value == '':
        return None

    has_sublist = '^' in value

    if '|' in value:

        items = value.split('|')

        if has_sublist:
            return [item.split('^') for item in items]

        return [item.strip() for item in items]

    if has_sublist:
        # a single item is still wrapped in a list, so the shape is the
        # same as for "|" separated items
        return [value.split('^')]

    return value.strip()


def _ctdbase_download(_type: str) -> list[tuple]:
    """
    Retrieves a CTDbase file and returns entries as a list of tuples.

    Lines starting with "#" are comments; a comment line containing tabs
    is taken as the header and provides the field names of the records.
    Data lines are skipped if they come before any header or if their
    number of fields differs from the number of field names.

    Args:
        _type: A key of `CTD_URLS`. A name without underscore is expanded
            to `<name>_vocabulary`.

    Returns:
        A list of `CTDEntry` named tuples, one for each parsed line. For
        the "gene_disease" type, entries without "DirectEvidence" are
        left out.
    """

    if '_' not in _type:
        _type = f'{_type}_vocabulary'
    url = urls['ctdbase']['url'] % CTD_URLS[_type]

    c = curl.Curl(
        url,
        silent=False,
        large=True,
        encoding="utf-8",
        default_mode="r",
        compressed=True,
        compr="gz",
    )

    entries = []
    fieldnames = None
    record = None

    for line in c.result:

        if line.startswith("#"):

            header = line.strip(" #\n").split("\t")

            if len(header) > 1:
                fieldnames = [name for name in header if name != '']
                record = collections.namedtuple('CTDEntry', fieldnames)

            continue

        if record is None:
            # without a header the fields cannot be named
            continue

        # remove the line terminator first, otherwise it stays attached
        # to the last field: an empty last field would not be recognised
        # as empty and the last sub-item would keep the newline
        data = line.rstrip("\r\n").split("\t")

        if len(data) != len(fieldnames):
            continue # some lines have missing fields and cannot be parsed

        entry = {
            fieldname: _ctdbase_parse_field(value)
            for fieldname, value in zip(fieldnames, data)
        }

        if _type == 'chemical_phenotype':

            entry = _modify_dict(
                entry,
                ('comentionedterms', ['name', 'id', 'source']),
                ('anatomyterms', ['sequenceorder', 'name', 'id']),
                ('inferencegenesymbols', ['name', 'id']),
                ('interactionactions', ['interaction', 'action']),
            )

        if _type == 'gene_disease' and entry['DirectEvidence'] is None:
            # only curated gene-disease relations are kept
            continue

        entries.append(record(**entry))

    return entries


def ctdbase_relations(relation_type: str) -> list[tuple]:
    """
    Downloads and parses one of the CTDbase relation tables.

    For the "gene_disease" relation type only the curated relations are
    kept, i.e. the ones with a value in the "DirectEvidence" field,
    because the non-curated relations are too numerous.

    Args:
        relation_type: The relation table to retrieve, one of
            'chemical_gene', 'chemical_disease', 'disease_pathway',
            'chemical_phenotype' or 'gene_disease'.

    Returns:
        The relations as a list of tuples.
    """

    relations = _ctdbase_download(relation_type)

    return relations
def ctdbase_vocabulary(vocabulary_type: str) -> list[tuple]:
    """
    Downloads and parses one of the CTDbase vocabulary tables.

    Args:
        vocabulary_type: The vocabulary to retrieve, one of 'chemical',
            'gene', 'disease', 'pathway', 'anatomy' or 'phenotype'.

    Returns:
        The vocabulary as a list of tuples.
    """

    vocabulary = _ctdbase_download(vocabulary_type)

    return vocabulary
def _modify_dict(_dict, *entry_pairs): for key, new_keys in entry_pairs: _dict[key] = _map_keys( new_keys, _dict[key] ) return _dict def _map_keys(keys, entry): if entry == None: return None result = list() for values in entry: temp_dict = dict() for key, value in zip(keys, values): temp_dict[key] = value result.append(temp_dict) return result