Source code for pypath.inputs.genecards

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import re
import bs4
import warnings

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.session as session
import pypath.share.settings as settings

_logger = session.Logger(name = 'inputs.genecards')
_log = _logger._log


_respace = re.compile(r'\s+')
_summary_sources = {
    'Gene Wiki': 'GeneWiki',
    'UniProtKB/Swiss-Prot': 'UniProt',
}



[docs]
def genecards_datasheet(gene):
    """
    Retrieves a gene (protein) datasheet from GeneCards.
    Returns HTML as string.
    
    :param str gene:
        A Gene Symbol or UniProt ID.
    """
    
    url = urls.urls['genecards']['url'] % gene
    
    c = curl.Curl(
        url,
        silent = True,
        large = False,
        connect_timeout = settings.get('genecards_datasheet_connect_timeout'),
        timeout = settings.get('genecards_datasheet_timeout'),
    )
    
    if c.status not in {0, 200}:
        
        _log('Failed to retrieve gene card for ID `%s`.' % gene)
        
        return None
    
    return c.result




[docs]
def genecards_soup(gene):
    """
    Retrieves a gene (protein) datasheet from GeneCards.
    Returns ``bs4.BeautifulSoup`` object.
    
    :param str gene:
        A Gene Symbol or UniProt ID.
    """
    
    html = genecards_datasheet(gene)
    
    if html:
        
        with warnings.catch_warnings():
            
            warnings.simplefilter('ignore')
            soup = bs4.BeautifulSoup(html)
        
        return soup




[docs]
def genecards_summaries(gene):
    """
    Retrieves the summaries from a GeneCards datasheet. Returns a dict with
    the resource names as keys and the summary texts as values.
    
    :param str gene:
        A Gene Symbol or UniProt ID.
    """
    
    result = {}
    
    soup = genecards_soup(gene)
    
    if not soup:
        
        return result
    
    summaries = soup.select_one('section#summaries')
    
    if summaries:
        
        for summary in summaries.select('div.gc-subsection'):
            
            title = summary.select_one('h3').text.strip('\r\n ')
            
            if title[:7] in {'No data', 'Additio'}:
                
                continue
            
            content = _respace.sub(
                ' ',
                ' '.join(
                    par.text
                    for par in summary.select(':not(:nth-child(1))')
                )
            ).strip('\n\r ')
            
            for gc_name, name in iteritems(_summary_sources):
                
                if title.startswith(gc_name):
                    
                    title = name
                    break
                
                title = title.split(maxsplit = 1)[0]
            
            if content:
                
                result[title] = content
    
    return result