from future.utils import iteritems

import re
import bs4
import warnings

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.session as session
import pypath.share.settings as settings

# Module-level logger registered under the name 'inputs.genecards'.
# `_log` is bound to the logger's `_log` method; the functions in this
# module call it with a message string.
_logger = session.Logger(name = 'inputs.genecards')
_log = _logger._log

_respace = re.compile(r'\s+')
_summary_sources = {
    'Gene Wiki': 'GeneWiki',
    'UniProtKB/Swiss-Prot': 'UniProt',

def genecards_datasheet(gene):
    """
    Retrieves a gene (protein) datasheet from GeneCards.
    Returns HTML as string.

    :param str gene:
        A Gene Symbol or UniProt ID.

    :return:
        The ``result`` attribute of the ``curl.Curl`` download, or ``None``
        if the download status is neither 0 nor 200.
    """

    # The URL template is expected to contain one `%s`-style placeholder
    # which is filled with the gene identifier.
    url = urls.urls['genecards']['url'] % gene

    c = curl.Curl(
        url,
        silent = True,
        large = False,
        connect_timeout = settings.get('genecards_datasheet_connect_timeout'),
        timeout = settings.get('genecards_datasheet_timeout'),
    )

    if c.status not in {0, 200}:

        _log('Failed to retrieve gene card for ID `%s`.' % gene)
        return None

    return c.result
def genecards_soup(gene):
    """
    Retrieves a gene (protein) datasheet from GeneCards.
    Returns ``bs4.BeautifulSoup`` object.

    :param str gene:
        A Gene Symbol or UniProt ID.

    :return:
        A ``bs4.BeautifulSoup`` object built from the datasheet HTML, or
        ``None`` if ``genecards_datasheet`` returned nothing.
    """

    html = genecards_datasheet(gene)

    if not html:

        return None

    # No parser is passed to BeautifulSoup, so any warning it emits while
    # constructing the soup is silenced here.
    with warnings.catch_warnings():

        warnings.simplefilter('ignore')
        soup = bs4.BeautifulSoup(html)

    return soup
def genecards_summaries(gene):
    """
    Retrieves the summaries from a GeneCards datasheet.
    Returns a dict with the resource names as keys and the summary texts
    as values.

    :param str gene:
        A Gene Symbol or UniProt ID.

    :return:
        A dict of summary texts by resource name; empty if the datasheet
        could not be retrieved or has no ``section#summaries`` element.
    """

    result = {}

    soup = genecards_soup(gene)

    if not soup:

        return result

    summaries = soup.select_one('section#summaries')

    if not summaries:

        return result

    # NOTE(review): the two `.select(...)` calls below were truncated in the
    # source this was restored from; the receivers (`summaries`, `summary`)
    # are reconstructed from context -- confirm against upstream.
    for summary in summaries.select('div.gc-subsection'):

        heading = summary.select_one('h3')

        # A subsection without a heading can not be attributed to any
        # resource, skip it instead of failing on `None.text`.
        if heading is None:

            continue

        title = heading.text.strip('\r\n ')

        # An empty heading would make `split(...)[0]` below fail.
        if not title:

            continue

        # Placeholder and "Additional ..." subsections carry no summary.
        if title[:7] in {'No data', 'Additio'}:

            continue

        # Join the text of every element matched by the selector (i.e.
        # everything that is not a first child) and collapse whitespace.
        content = _respace.sub(
            ' ',
            ' '.join(
                par.text
                for par in summary.select(':not(:nth-child(1))')
            )
        ).strip('\n\r ')

        # Replace known GeneCards headings by their short resource name.
        for gc_name, name in iteritems(_summary_sources):

            if title.startswith(gc_name):

                title = name
                break

        # Only the first word of the (possibly replaced) title is the key.
        title = title.split(maxsplit = 1)[0]

        if content:

            result[title] = content

    return result