Source code for pypath.inputs.expasy

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

import re

import pandas as pd

import pypath.share.curl as curl
import pypath.share.common as common
import pypath.resources.urls as urls


def expasy_enzyme_classes(
        return_df: bool = False,
    ) -> list[tuple] | pd.DataFrame:
    """
    Enzyme Commission (EC) class names from the ExPASy ``enzclass`` file.

    Every line of the downloaded file is tested against a pattern that
    expects up to three dot-separated numeric levels followed by a class
    name ending in a period. Lines that do not fit the pattern are
    skipped.

    Args:
        return_df:
            Wrap the records in a pandas data frame instead of returning
            the plain list.

    Returns:
        One four-element tuple per matching line: the three numeric levels
        of the class code as strings (``None`` for a level that is not
        given as digits), then the class name without the trailing
        period. With ``return_df`` the same records as a data frame with
        default integer column labels.
    """

    # The pattern is assembled from one fragment per field of the line:
    # first level, second level, third level, the dash standing for the
    # fourth level, and finally the name.
    class_line = re.compile(
        r'^(\d+)\.\s?'
        r'(?:(\d+))?-?\.\s*'
        r'(?:(\d+))?-?\.\s*'
        r'-?\s+'
        r'([-\w\(\)\s\+]+)\.$'
    )

    download = curl.Curl(
        urls.urls['expasy']['enzclass'],
        silent=False,
        large=True,
        encoding='utf-8',
        default_mode='r',
    )

    # `match` yields None for non-class lines; those are filtered out.
    matches = map(class_line.match, download.result)
    classes = [found.groups() for found in matches if found]

    if return_df:

        return pd.DataFrame(classes)

    return classes
def expasy_enzymes(return_df: bool = False) -> dict[str, dict] | pd.DataFrame:
    """
    Enzyme data from the ExPASy database.

    Parses the line-oriented ``enzymes`` flat file. The first
    whitespace-delimited token of each line is a line code; the rest of
    the line is its content. Recognised codes:

    - ``ID``: EC number, stored under ``ec``
    - ``DE``: description, stored under ``de``
    - ``CA``, ``CC``, ``AN``: collected into the lists ``ca``, ``cc``
      and ``an``
    - ``DR``: pairs of ``<accession>, <ENTRY_NAME>`` collected into
      ``uniprots`` and ``entries``
    - ``//``: end of a record

    Blank lines and lines with any other code are ignored. Records that
    reach their terminator without an EC number are discarded.

    Args:
        return_df:
            Return a data frame.

    Returns:
        A dict with EC numbers as keys and one dict per enzyme as values
        (keys ``uniprots``, ``entries``, ``ca``, ``cc``, ``an``, ``ec``
        and, if a DE line was present, ``de``); or, with ``return_df``,
        a data frame with one row per enzyme.
    """

    url = urls.urls['expasy']['enzymes']
    c = curl.Curl(
        url,
        silent = False,
        large = True,
        encoding = 'utf-8',
        default_mode = 'r',
        slow = True,
    )

    reid = re.compile(r'(\d+\.\d+\.\d+\.[\dn]+)')
    reup = re.compile(r'([\w]+), ([\w]+_\w+)')
    # Separators used to cut the joined CA and CC texts into items.
    reeq = re.compile(r'\(\d\)')
    recc = re.compile(r'-!-')

    def new_enzyme() -> dict[str, list]:
        """Fresh accumulator with an empty list for each multi-value field."""

        return {
            k: []
            for k in ('uniprots', 'entries', 'ca', 'cc', 'an')
        }

    result = []
    enzyme = new_enzyme()

    for line in c.result:

        fields = line.split(maxsplit = 1)

        if not fields:

            # Blank or whitespace-only line: nothing to parse.
            continue

        prefix = fields[0]
        content = fields[1] if len(fields) > 1 else ''

        if prefix == 'ID':

            ec = reid.match(content)

            if ec:

                enzyme['ec'] = ec.group(0)

        elif prefix == 'DE' and content:

            # NOTE(review): a record with several DE lines keeps only the
            # last one -- confirm whether descriptions can wrap.
            enzyme['de'] = content.strip('.\n ')

        elif prefix in {'CA', 'CC', 'AN'} and content:

            enzyme[prefix.lower()].append(content.strip('. \n'))

        elif prefix == 'DR':

            # A DR line without any recognisable pair adds nothing.
            for uniprot, entry in reup.findall(content):

                enzyme['uniprots'].append(uniprot)
                enzyme['entries'].append(entry)

        elif prefix == '//':

            if enzyme.get('ec'):

                for key, rexp in (('ca', reeq), ('cc', recc)):

                    # Continuation lines are joined first, then the text
                    # is split into items at the separators.
                    enzyme[key] = common.del_empty(
                        x.strip()
                        for x in rexp.split(' '.join(enzyme[key]))
                    )

                result.append(enzyme)

            # Start afresh after every terminator, including one that
            # closes a block without an ID. Otherwise the lines collected
            # from that block (e.g. CC lines preceding the first record)
            # would end up in the next enzyme.
            enzyme = new_enzyme()

    if return_df:

        return pd.DataFrame(result)

    return {e['ec']: e for e in result}