Source code for pypath.inputs.clinvar

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

"""
Variant data from the Clinvar database.
"""

from __future__ import annotations

import io
import csv
import ctypes
import collections

import pypath.share.curl as curl
import pypath.resources.urls as urls

csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))


[docs]
def clinvar_raw() -> list[tuple]:
    """
    Retrieves variant data from the Clinvar database.

    Returns:
        Variants as a list of named tuples.
    """

    Variant = collections.namedtuple(
            'Variant',
            [
                'allele',
                'type',
                'variant',
                'entrez',
                'genesymbol',
                'clinical_significance',
                'review_status',
                'rs',
                'phenotype_ids',
                'phenotypes',
                'otherids',
                'origin',
                'variation_id',
                'assembly',
                'chromosome',
                'chromosome_accession',
            ],
            defaults = None
        )

    url = urls.urls['clinvar']['url']
    c = curl.Curl(url, large = True, silent = False)
    c.gzfile.seek(1) # get rid of a stray `#` character

    response = csv.DictReader(
        io.TextIOWrapper(c.gzfile),
        dialect = 'excel-tab',
    )

    result = set()

    for row in response:

        phenotype_ids = tuple(row['PhenotypeIDS'].replace('|', ';').replace(',', ';').split(';'))
        phenotypes = tuple(row['PhenotypeList'].replace('|', ';').replace(',', ';').split(';'))
        otherids = tuple(row['OtherIDs'].replace('|', ';').replace(',', ';').split(';'))

        variant = Variant(
            allele = row['AlleleID'],
            type = row['Type'],
            variant = row['Name'],
            entrez = row['GeneID'],
            genesymbol = row['GeneSymbol'],
            clinical_significance = row['ClinicalSignificance'],
            review_status = row['ReviewStatus'],
            rs = row['RS# (dbSNP)'],
            phenotype_ids = phenotype_ids,
            phenotypes = phenotypes,
            otherids = None if otherids[0] == '-' else otherids,
            origin = row['OriginSimple'],
            variation_id = row['VariationID'],
            assembly = row['Assembly'],
            chromosome = row['Chromosome'],
            chromosome_accession = row['ChromosomeAccession'],
        )
        result.add(variant)

    return list(result)




[docs]
def clinvar_citations() -> list[tuple]:
    """
    Retrieves citation information of variants

    Returns:
        Citations as a list of named tuples.
    """

    Citation = collections.namedtuple(
        'Citation',
        [
            'allele',
            'variation_id',
            'nsv',
            'citation_source',
            'citation_id'
        ],
        defaults=None
    )

    url = urls.urls['clinvar']['url_citations']

    c = curl.Curl(url, large = True, silent = False)

    response = csv.DictReader(
        c.result,
        delimiter = '\t',
    )

    result = set()

    for row in response:
        
        citation = Citation(
            allele = row['#AlleleID'],
            variation_id = row['VariationID'],
            nsv = row['nsv'],
            citation_source = row['citation_source'],
            citation_id = row['citation_id']
        )

        result.add(citation)

    return list(result)