Source code for pypath.inputs.genecards

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import re
import bs4
import warnings

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.session as session
import pypath.share.settings as settings

# Module-level logger registered under the name `inputs.genecards`;
# `_log` is a shortcut to its logging method, used by the functions below.
_logger = session.Logger(name = 'inputs.genecards')
_log = _logger._log


# Matches any run of whitespace; used to collapse such runs into one space
# when the summary texts are assembled.
_respace = re.compile(r'\s+')
# Maps the beginning of a summary heading, as it is matched against the
# GeneCards page, to the short resource name used as key in the result.
_summary_sources = {
    'Gene Wiki': 'GeneWiki',
    'UniProtKB/Swiss-Prot': 'UniProt',
}


def genecards_datasheet(gene):
    """
    Downloads the GeneCards datasheet of one gene (protein).

    The connection and transfer timeouts are read from the settings keys
    ``genecards_datasheet_connect_timeout`` and
    ``genecards_datasheet_timeout``.

    :param str gene:
        A Gene Symbol or UniProt ID.

    :return:
        The HTML of the datasheet as string; ``None`` if the download
        finished with a status other than 0 or 200, in this case a
        message is also written to the log.
    """

    download = curl.Curl(
        urls.urls['genecards']['url'] % gene,
        silent = True,
        large = False,
        connect_timeout = settings.get(
            'genecards_datasheet_connect_timeout'
        ),
        timeout = settings.get('genecards_datasheet_timeout'),
    )

    # statuses 0 and 200 are the only ones accepted as success
    if download.status in {0, 200}:

        return download.result

    _log('Failed to retrieve gene card for ID `%s`.' % gene)

    return None
def genecards_soup(gene):
    """
    Downloads the GeneCards datasheet of one gene (protein) and parses it.

    :param str gene:
        A Gene Symbol or UniProt ID.

    :return:
        The datasheet as a ``bs4.BeautifulSoup`` object; ``None`` if no
        HTML could be retrieved.
    """

    page = genecards_datasheet(gene)

    if not page:

        return None

    # any warning raised while building the soup is silenced
    with warnings.catch_warnings():

        warnings.simplefilter('ignore')
        parsed = bs4.BeautifulSoup(page)

    return parsed
def genecards_summaries(gene):
    """
    Retrieves the summaries from a GeneCards datasheet.

    Each ``div.gc-subsection`` within the ``section#summaries`` element of
    the page is processed as one summary. Subsections without a heading,
    with an empty heading or without text content are ignored.

    :param str gene:
        A Gene Symbol or UniProt ID.

    :return:
        A dict with the resource names as keys and the summary texts as
        values. It is empty if the datasheet could not be retrieved or it
        contains no summaries section.
    """

    result = {}
    soup = genecards_soup(gene)

    if not soup:

        return result

    summaries = soup.select_one('section#summaries')

    if not summaries:

        return result

    for summary in summaries.select('div.gc-subsection'):

        heading = summary.select_one('h3')

        # `select_one` gives None if the subsection has no heading; such
        # a subsection can not be assigned to any resource
        if heading is None:

            continue

        title = heading.text.strip('\r\n ')

        # presumably placeholder and auxiliary subsections
        # ("No data...", "Additio...") -- these carry no summary
        if title[:7] in {'No data', 'Additio'}:

            continue

        # the text of every element in the subsection except the ones
        # which are the first child of their parent, joined together and
        # with whitespace normalized
        content = _respace.sub(
            ' ',
            ' '.join(
                par.text
                for par in summary.select(':not(:nth-child(1))')
            )
        ).strip('\n\r ')

        for gc_name, name in _summary_sources.items():

            if title.startswith(gc_name):

                title = name
                break

        # the first word of the heading serves as the resource name;
        # an empty or whitespace only heading yields no words at all
        words = title.split(maxsplit = 1)

        if not words:

            continue

        title = words[0]

        if content:

            result[title] = content

    return result