Source code for pypath.inputs.expasy
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of the `pypath` python module
#
# Copyright 2014-2023
# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
# Authors: see the file `README.rst`
# Contact: Dénes Türei (turei.denes@gmail.com)
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# https://www.gnu.org/licenses/gpl-3.0.html
#
# Website: https://pypath.omnipathdb.org/
#
from __future__ import annotations
import re
import pandas as pd
import pypath.share.curl as curl
import pypath.share.common as common
import pypath.resources.urls as urls
[docs]
def expasy_enzyme_classes(
        return_df: bool = False,
    ) -> list[tuple] | pd.DataFrame:
    """
    Retrieve the enzyme class hierarchy from ExPASy.

    Every downloaded line is tested against a pattern that captures
    up to three numeric EC levels followed by a class name; lines that
    do not match are ignored.

    Args:
        return_df:
            If true, wrap the records in a pandas data frame.

    Returns:
        A list of 4-tuples: the first, second and third EC level as
        strings (the second and third are `None` when the level is not
        given in the line), and the name of the class. With
        `return_df`, the same records as a data frame.
    """

    # One line describes one class: three dot-terminated levels, where
    # the second and third may be a dash instead of a number, then the
    # name terminated by a full stop.
    class_line = re.compile(
        r'^(\d+)\.\s?'
        r'(?:(\d+))?-?\.\s*'
        r'(?:(\d+))?-?\.\s*'
        r'-?\s+'
        r'([-\w\(\)\s\+]+)\.$'
    )

    download = curl.Curl(
        urls.urls['expasy']['enzclass'],
        silent=False,
        large=True,
        encoding='utf-8',
        default_mode='r',
    )

    matches = (class_line.match(row) for row in download.result)
    records = [hit.groups() for hit in matches if hit]

    if return_df:
        return pd.DataFrame(records)

    return records
[docs]
def expasy_enzymes(return_df: bool = False) -> dict[str, dict] | pd.DataFrame:
    """
    Enzyme data from the ExPASy database.

    The downloaded file is processed line by line. Each line starts with
    a line code (`ID`, `DE`, `AN`, `CA`, `CC`, `DR`) followed by its
    content; a `//` line terminates an entry. Lines with other codes are
    ignored.

    Args:
        return_df:
            Return a data frame.

    Returns:
        By default a dict with EC numbers as keys and one dict per enzyme
        as values. Each enzyme dict has the keys `ec` (from the `ID`
        line), `an`, `ca`, `cc` (lists of strings), `uniprots` and
        `entries` (parallel lists built from the `DR` lines), and `de`
        if the entry had a `DE` line. If `return_df` is true, the same
        records are returned as a pandas data frame, one row per enzyme.
    """

    url = urls.urls['expasy']['enzymes']
    c = curl.Curl(
        url,
        silent = False,
        large = True,
        encoding = 'utf-8',
        default_mode = 'r',
        slow = True,
    )

    reid = re.compile(r'(\d+\.\d+\.\d+\.[\dn]+)')
    reup = re.compile(r'([\w]+), ([\w]+_\w+)')
    # NOTE(review): this also matches single digits in parentheses inside
    # the reaction text, not only the leading reaction numbers; confirm
    # whether that is intended.
    reeq = re.compile(r'\(\d\)')
    recc = re.compile(r'-!-')

    def new_enzyme() -> dict:
        # Fresh record with an own empty list for each multi-value field.
        return {
            k: []
            for k in ('uniprots', 'entries', 'ca', 'cc', 'an')
        }

    result = []
    enzyme = new_enzyme()

    for raw in c.result:

        fields = raw.split(maxsplit = 1)

        if not fields:
            # Blank or whitespace-only line: nothing to parse.
            continue

        prefix = fields[0]
        # When present, the remainder is never empty: `split` without
        # a separator drops whitespace-only tails.
        payload = fields[1] if len(fields) > 1 else ''

        if prefix == '//':

            if enzyme.get('ec', None):

                # The `ca` and `cc` lines are joined into one string and
                # then split into individual items at their markers.
                for key, rexp in zip(('ca', 'cc'), (reeq, recc)):
                    enzyme[key] = common.del_empty(
                        x.strip()
                        for x in rexp.split(' '.join(enzyme[key]))
                    )

                result.append(enzyme)

            # Start over at every terminator, also when no entry was
            # stored: otherwise lines collected before the first `ID`
            # (e.g. a `CC` preamble) would end up in the first enzyme.
            enzyme = new_enzyme()

        elif not payload:
            # A line code without content carries no data.
            continue

        elif prefix == 'ID':

            ec = reid.match(payload)

            # Without a recognisable EC number the entry gets no `ec` key
            # and is discarded at its `//` terminator.
            if ec:
                enzyme['ec'] = ec.group(0)

        elif prefix == 'DE':
            # NOTE(review): if an entry has more than one `DE` line, only
            # the last one is kept; confirm whether they should be joined.
            enzyme['de'] = payload.strip('.\n ')

        elif prefix in {'CA', 'CC', 'AN'}:
            enzyme[prefix.lower()].append(payload.strip('. \n'))

        elif prefix == 'DR':
            # Pairwise appending keeps both lists aligned and tolerates
            # lines where the pattern finds nothing.
            for uniprot, entry in reup.findall(payload):
                enzyme['uniprots'].append(uniprot)
                enzyme['entries'].append(entry)

    return pd.DataFrame(result) if return_df else {e['ec']: e for e in result}