Source code for pypath.omnipath.server.run

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import sys
import os
import re
import copy
import collections
import itertools
import hashlib
import warnings
import contextlib

from pypath.share import session as session_mod

_logger = session_mod.Logger(name = 'server')
_log = _logger._log

try:
    import twisted.web.resource
    import twisted.web.server
    import twisted.internet.reactor
    TwistedWebResource = twisted.web.resource.Resource
    TwistedWebSite = twisted.web.server.Site
    TWISTED_NOT_DONE_YET = twisted.web.server.NOT_DONE_YET
    twisted_listen_tcp = twisted.internet.reactor.listenTCP
    twisted_run = twisted.internet.reactor.run
except ImportError:

    _log('No module `twisted` available. Necessary to run HTTP server.', -1)

    class TwistedWebResource: pass
    class TwistedWebSite: pass
    TWISTED_NOT_DONE_YET = None
    # the placeholders accept any arguments so that calls like
    # `twisted_listen_tcp(port, site)` do not fail without `twisted`
    twisted_listen_tcp = lambda *args, **kwargs: None
    twisted_run = lambda *args, **kwargs: None


import urllib
import json
import mimetypes

import pandas as pd
import numpy as np

import pypath.resources as resources
from pypath.omnipath.server import generate_about_page
import pypath.omnipath.server._html as _html
import pypath.resources.urls as urls
import pypath.resources as resources_mod
import pypath.share.common as common
import pypath_common._constants as _const
import pypath.core.intercell_annot as intercell_annot
import pypath.share.settings as settings
from pypath.share.common import flat_list
from pypath._metadata import __version__

if 'unicode' not in __builtins__:
    unicode = str

LICENSE_IGNORE = 'ignore'

def stop_server():

    # the reactor was imported at module level as `twisted.internet.reactor`
    twisted.internet.reactor.removeAll()

@contextlib.contextmanager
def ignore_pandas_copywarn():

    try:

        with warnings.catch_warnings():

            warnings.simplefilter('ignore', pd.errors.SettingWithCopyWarning)

            yield

    finally:

        pass

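
# Usage sketch for `ignore_pandas_copywarn` (the frame `tbl` and its
# columns below are hypothetical): the preprocessing methods of
# `TableServer` use it the same way when assigning derived columns to
# a slice of a data frame.
#
#     sub = tbl[tbl.ncbi_tax_id == 9606]
#
#     with ignore_pandas_copywarn():
#         sub['set_sources'] = [set(s.split(';')) for s in sub.sources]
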
class BaseServer(TwistedWebResource, session_mod.Logger):

    recomment = re.compile(r'<!--\s*Title:(.*?)-->')

    def __init__(self):

        if not hasattr(self, '_log_name'):

            session_mod.Logger.__init__(self, name = 'server')

        self._log('Initializing BaseServer.')

        self.htmls = ['info', 'error_page.html']
        self.welcome_message = (
            'Hello, this is the REST service of pypath %s. Welcome!\n'
            'For the descriptions of pathway resources go to `/info`.\n'
            'Available query types: interactions, enz_sub, complexes, \n'
            'annotations, intercell'
        ) % __version__

        self.isLeaf = True

        self._set_www_root()
        self._read_license_secret()
        self._res_ctrl = resources_mod.get_controller()

        TwistedWebResource.__init__(self)
        self._log('Twisted resource initialized.')
    def render_GET(self, request):

        response = []

        request.postpath = [i.decode('utf-8') for i in request.postpath if i]

        self._log(
            'Processing request: `%s` from `%s`; headers: [%s].' % (
                request.uri.decode('utf-8'),
                str(request.getClientAddress()),
                common.dict_str(request.getAllHeaders()),
            )
        )

        if not request.postpath:
            request.postpath = ['index.html']

        request.postpath[0] = self._query_type(request.postpath[0])

        self._set_headers(request)

        if (
            request.postpath and
            (
                hasattr(self, request.postpath[0]) or
                request.postpath[0] == 'error_page.html'
            ) and
            request.postpath[0][0] != '_'
        ):

            if request.postpath[0] == 'error_page.html':

                toCall = self._error_page

            else:

                self._process_postpath(request)
                toCall = getattr(self, request.postpath[0])

            if hasattr(toCall, '__call__'):

                self._log(
                    'Query type: `%s`; Arguments: [%s].' % (
                        request.postpath[0],
                        common.dict_str(request.args),
                    )
                )

                try:

                    response = toCall(request)
                    response = (
                        response.encode('utf-8')
                        if hasattr(response, 'encode') else
                        response
                    )
                    response = [response]

                except:

                    self._log(
                        'Error while rendering `%s`:' %
                        request.uri.decode('utf-8')
                    )
                    self._log_traceback()
                    raise

        else:

            local_path = self._local_path(request)

            if local_path:

                with open(local_path, 'rb') as fp:

                    response = [fp.read()]

                response = self._add_html_header(local_path, response)

        if not response:

            response = [
                (
                    'Not found: %s%s' % (
                        '/'.join(request.postpath),
                        ''
                        if len(request.args) == 0 else
                        '?%s' % '&'.join([
                            '%s=%s' % (
                                k.decode('utf-8'),
                                v[0].decode('utf-8')
                            )
                            for k, v in iteritems(request.args)
                            if v
                        ])
                    )
                ).encode('utf-8')
            ]

        request.setHeader('Content-Length', str(len(response[0])))
        request.write(response[0])

        self._log(
            'Finished serving request: `%s`.' % request.uri.decode('utf-8')
        )

        request.finish()

        return TWISTED_NOT_DONE_YET

    def render_POST(self, request):

        if (
            request.getHeader(b'content-type') and
            request.getHeader(b'content-type').startswith(b'application/json')
        ):

            post_content = request.content.getvalue()

            if post_content and post_content.strip():

                args_raw = json.loads(post_content)
                request.args = dict(
                    (
                        k.encode('utf-8'),
                        [v.encode('utf-8')]
                        if type(v) is not list else
                        [','.join(v).encode('utf-8')]
                    )
                    for k, v in iteritems(args_raw)
                )

        return self.render_GET(request)

    def _set_www_root(self):

        self.wwwbuiltin = os.path.join(
            session_mod.session().module_root,
            'data',
            'www',
        )
        self.wwwroot = settings.get('www_root')

        if not os.path.exists(self.wwwroot):

            self.wwwroot = self.wwwbuiltin

    def _local_path(self, request):

        if request.postpath and request.postpath[-1][0] in ('_', '.'):

            return

        for wwwroot in (self.wwwroot, self.wwwbuiltin):

            path = os.path.join(wwwroot, *request.postpath)

            if os.path.isfile(path):

                return path

    def _set_headers(self, request):

        for k, v in iteritems(request.args):

            request.args[k] = [b','.join(v)]

        request.setHeader('Cache-Control', 'Public')
        request.setHeader('Access-Control-Allow-Origin', '*')

        if '' in request.postpath:

            request.postpath.remove('')

        if not request.postpath:

            request.postpath = ['index.html']

        if request.postpath and request.postpath[0] == 'resources':

            request.args[b'format'] = [b'json']

        local_path = self._local_path(request)

        if local_path:

            format_ = mimetypes.guess_type(local_path)[0]
            format_ = (
                tuple(format_.split('/'))
                if format_ else
                ('text', 'plain')
            )

        elif (
            not request.postpath or
            request.postpath[0] in self.htmls or
            request.postpath[0] == 'error_page.html'
        ):

            format_ = ('text', 'html')

        elif (
            b'format' in request.args and
            request.args[b'format'][0] == b'json'
        ):

            format_ = ('application', 'json')

        elif request.postpath[0] == 'favicon.ico':

            format_ = ('image', 'vnd.microsoft.icon')

        else:

            request.args[b'format'] = [b'text']
            format_ = ('text', 'plain')

        request.setHeader(
            'Content-Type',
            '%s/%s%s' % (
                format_ + (
                    '; charset=utf-8' if format_[0] == 'text' else '',
                )
            )
        )

        request.args[b'header'] = (
            [b'1']
            if b'header' not in request.args else
            request.args[b'header']
        )

        self._set_fields(request)
        self._set_license(request)

    def _set_fields(self, req):

        synonyms = (
            self.field_synonyms
            if hasattr(self, 'field_synonyms') else
            {}
        )

        if b'fields' in req.args:

            used = set()
            fields_checked = []

            for field in req.args[b'fields'][0].decode('utf-8').split(','):

                field = synonyms[field] if field in synonyms else field

                if field not in used:

                    fields_checked.append(field)
                    used.add(field)

            req.args[b'fields'] = [','.join(fields_checked).encode('utf-8')]

        else:

            req.args[b'fields'] = []

    def _set_license(self, req):

        query_type = req.postpath[0] if req.postpath else None
        query_type = self._query_type(query_type)

        if (
            not hasattr(self, 'args_reference') or
            not query_type or
            query_type not in self.args_reference or
            'license' not in self.args_reference[query_type]
        ):

            return

        auth = False

        if b'password' in req.args:

            req_secret = hashlib.md5(req.args[b'password'][0]).hexdigest()
            auth = (
                self._license_secret is not None and
                self._license_secret == req_secret
            )

        # if someone sent a good password
        # why not ignore the licenses
        if auth:

            req.args[b'license'] = [b'ignore']

        # if the license level is not set,
        # or set to `ignore` without successful authentication,
        # we fall back to the default license level
        if (
            b'license' not in req.args or (
                not auth and
                req.args[b'license'][0] == b'ignore'
            )
        ):

            req.args[b'license'] = self._default_license

    def _process_postpath(self, req):

        if len(req.postpath) > 1:

            ids_left = [req.postpath[1].encode('utf-8')]
            ids_right = (
                [req.postpath[2].encode('utf-8')]
                if (
                    len(req.postpath) > 2 and
                    req.postpath[2].lower() not in {'and', 'or'}
                ) else
                None
            )
            left_right = (
                [b'OR']
                if req.postpath[-1].lower() not in {'and', 'or'} else
                [req.postpath[-1].encode('utf-8')]
            )

            if ids_right:

                if req.postpath[0] == 'enzsub':

                    req.args[b'enzymes'] = ids_left
                    req.args[b'substrates'] = ids_right

                else:

                    req.args[b'sources'] = ids_left
                    req.args[b'targets'] = ids_right

            else:

                req.args[b'partners'] = ids_left

            if req.postpath[0] == 'enzsub':

                req.args[b'enzyme_substrate'] = left_right

            else:

                req.args[b'source_target'] = left_right

    def _query_type(self, query_type):

        return (
            self.query_type_synonyms[query_type]
            if (
                hasattr(self, 'query_type_synonyms') and
                query_type in self.query_type_synonyms
            ) else
            query_type
        )

    def _add_html_header(self, local_path, response):

        if (
            local_path.endswith('html') or
            local_path.endswith('htm')
        ) and not response[0].startswith(b'<!DOCTYPE html>'):

            head_foot = [
                (
                    b'<!DOCTYPE html>\n<html lang="en">\n'
                    b'<head><title>%s</title></head>\n<body>\n'
                ),
                b'</body>\n</html>',
            ]

            for wwwroot in (self.wwwroot, self.wwwbuiltin):

                for i, part in enumerate(('header', 'footer')):

                    path = os.path.join(wwwroot, '_%s.html' % part)

                    if os.path.exists(path):

                        with open(path, 'rb') as fp:

                            head_foot[i] = fp.read()

            if b'%s' in head_foot[0]:

                title = self.recomment.search(response[0])
                title = title.groups()[0] if title else b'pypath server'
                head_foot[0] = head_foot[0] % title.strip()

            response[0] = head_foot[0] + response[0] + head_foot[1]

        return response

    def about(self, req):

        return self.welcome_message

    def info(self, req):

        if (
            b'format' in req.args and
            req.args[b'format'][0] == b'json' and
            hasattr(self, 'resources')
        ):

            return self.resources(req)

        rc = resources.get_controller()
        rc.update()

        return generate_about_page.generate_about_html(rc.data)

    def _root(self, req):

        return _html.main_page()

    def _parse_arg(self, arg):

        if isinstance(arg, list) and arg:
            arg = arg[0]
        if hasattr(arg, 'decode'):
            arg = arg.decode('utf-8')
        if hasattr(arg, 'lower'):
            arg = arg.lower()
        if hasattr(arg, 'isdigit') and arg.isdigit():
            arg = int(arg)
        if arg in _const.BOOLEAN_FALSE:
            arg = False
        if arg in _const.BOOLEAN_TRUE:
            arg = True

        return bool(arg)

    def _read_license_secret(self):

        self._license_secret = None
        path = settings.get('license_secret')

        if os.path.exists(path):

            self._log('Reading license unlocking secret from `%s`.' % path)

            with open(path, 'r') as fp:

                self._license_secret = fp.read().strip()

        self._default_license = [
            settings.get('server_default_license').encode('ascii')
        ]

    def _error_page(self, req):

        req.setResponseCode(500)

        return _html.http_500()

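
# A sketch of how `BaseServer.render_POST` is meant to be used (host, port
# and argument values here are illustrative): a JSON body is converted into
# the same `request.args` a GET query string would produce, so the call
# below is equivalent to GET /interactions?genesymbols=1.
#
#     import json
#     import urllib.request
#
#     req = urllib.request.Request(
#         'http://localhost:33333/interactions',
#         data = json.dumps({'genesymbols': '1'}).encode('ascii'),
#         headers = {'Content-Type': 'application/json'},
#     )
#     print(urllib.request.urlopen(req).read().decode('utf-8'))
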
class TableServer(BaseServer):

    query_types = {
        'annotations',
        'intercell',
        'interactions',
        'enz_sub',
        'enzsub',
        'ptms',
        'complexes',
        'about',
        'info',
        'queries',
        'annotations_summary',
        'intercell_summary',
    }
    data_query_types = {
        'annotations',
        'intercell',
        'interactions',
        'enzsub',
        'complexes',
    }
    list_fields = {
        'sources',
        'references',
        'isoforms',
    }
    int_list_fields = {
        'references',
        'isoforms',
    }
    field_synonyms = {
        'organism': 'ncbi_tax_id',
        'tfregulons_level': 'dorothea_level',
        'tfregulons_curated': 'dorothea_curated',
        'tfregulons_chipseq': 'dorothea_chipseq',
        'tfregulons_tfbs': 'dorothea_tfbs',
        'tfregulons_coexp': 'dorothea_coexp',
        'sources': 'resources',
        'databases': 'resources',
    }
    args_reference = {
        'interactions': {
            'header': None,
            'format': {'json', 'tab', 'text', 'tsv', 'table'},
            'license': {
                'ignore', 'academic',
                'non_profit', 'nonprofit',
                'for_profit', 'forprofit', 'commercial',
            },
            'password': None,
            'limit': None,
            'datasets': {
                'omnipath', 'tfregulons', 'dorothea', 'collectri',
                'tf_target', 'tf_mirna', 'lncrna_mrna',
                'kinaseextra', 'ligrecextra', 'pathwayextra',
                'mirnatarget', 'small_molecule',
            },
            'types': {
                'post_translational', 'transcriptional',
                'post_transcriptional', 'mirna_transcriptional',
                'lncrna_post_transcriptional', 'small_molecule_protein',
            },
            'sources': None,
            'resources': None,
            'databases': None,
            'targets': None,
            'partners': None,
            'genesymbols': _const.BOOLEAN_VALUES,
            'evidences': None,
            'extra_attrs': None,
            'fields': {
                'entity_type', 'references', 'sources',
                'tfregulons_level', 'tfregulons_curated',
                'tfregulons_chipseq', 'tfregulons_tfbs',
                'tfregulons_coexp',
                'dorothea_level', 'dorothea_curated',
                'dorothea_chipseq', 'dorothea_tfbs', 'dorothea_coexp',
                'type', 'ncbi_tax_id', 'databases', 'resources',
                'organism', 'curation_effort', 'datasets',
                'extra_attrs', 'evidences',
            },
            'tfregulons_levels': {'A', 'B', 'C', 'D', 'E'},
            'tfregulons_methods': {'curated', 'chipseq', 'coexp', 'tfbs'},
            'dorothea_levels': {'A', 'B', 'C', 'D', 'E'},
            'dorothea_methods': {'curated', 'chipseq', 'coexp', 'tfbs'},
            'organisms': {'9606', '10090', '10116'},
            'source_target': {'AND', 'OR', 'and', 'or'},
            'directed': _const.BOOLEAN_VALUES,
            'signed': _const.BOOLEAN_VALUES,
            'loops': _const.BOOLEAN_VALUES,
            'entity_types': {
                'protein', 'complex', 'mirna', 'lncrna',
                'small_molecule', 'drug', 'metabolite', 'lipid',
            },
        },
        'enzsub': {
            'header': None,
            'format': {'json', 'tab', 'text', 'tsv', 'table'},
            'license': {
                'ignore', 'academic',
                'non_profit', 'nonprofit',
                'for_profit', 'forprofit', 'commercial',
            },
            'password': None,
            'limit': None,
            'enzymes': None,
            'substrates': None,
            'partners': None,
            'genesymbols': _const.BOOLEAN_VALUES,
            'organisms': {'9606', '10090', '10116'},
            'databases': None,
            'resources': None,
            'residues': None,
            'modification': None,
            'types': None,
            'fields': {
                'sources', 'references', 'ncbi_tax_id', 'organism',
                'databases', 'resources', 'isoforms', 'curation_effort',
            },
            'enzyme_substrate': {'AND', 'OR', 'and', 'or'},
        },
        'annotations': {
            'header': None,
            'format': {'json', 'tab', 'text', 'tsv', 'table'},
            'license': {
                'ignore', 'academic',
                'non_profit', 'nonprofit',
                'for_profit', 'forprofit', 'commercial',
            },
            'password': None,
            'limit': None,
            'databases': None,
            'resources': None,
            'proteins': None,
            'fields': None,
            'genesymbols': _const.BOOLEAN_VALUES,
            'entity_types': {
                'protein', 'complex', 'mirna', 'lncrna',
                'small_molecule', 'drug', 'metabolite', 'lipid',
            },
        },
        'annotations_summary': {
            'header': None,
            'format': {'json', 'tab', 'text', 'tsv', 'table'},
            'databases': None,
            'resources': None,
            'fields': None,
            'cytoscape': _const.BOOLEAN_VALUES,
        },
        'intercell': {
            'header': None,
            'format': {'json', 'tab', 'text', 'tsv', 'table'},
            'license': {
                'ignore', 'academic',
                'non_profit', 'nonprofit',
                'for_profit', 'forprofit', 'commercial',
            },
            'password': None,
            'limit': None,
            'scope': {'specific', 'generic'},
            'aspect': {'functional', 'locational'},
            'source': {'resource_specific', 'composite'},
            'categories': None,
            'databases': None,
            'resources': None,
            'parent': None,
            'proteins': None,
            'fields': None,
            'entity_types': {
                'protein', 'complex', 'mirna', 'lncrna',
                'small_molecule', 'drug', 'metabolite', 'lipid',
            },
            'transmitter': _const.BOOLEAN_VALUES,
            'receiver': _const.BOOLEAN_VALUES,
            'trans': _const.BOOLEAN_VALUES,
            'rec': _const.BOOLEAN_VALUES,
            'secreted': _const.BOOLEAN_VALUES,
            'plasma_membrane_peripheral': _const.BOOLEAN_VALUES,
            'plasma_membrane_transmembrane': _const.BOOLEAN_VALUES,
            'sec': _const.BOOLEAN_VALUES,
            'pmp': _const.BOOLEAN_VALUES,
            'pmtm': _const.BOOLEAN_VALUES,
            'causality': {'transmitter', 'trans', 'receiver', 'rec', 'both'},
            'topology': {
                'secreted', 'sec',
                'plasma_membrane_peripheral', 'pmp',
                'plasma_membrane_transmembrane', 'pmtm',
            },
        },
        'intercell_summary': {
            'header': None,
            'format': {'json', 'tab', 'text', 'tsv', 'table'},
            'scope': {'specific', 'generic'},
            'aspect': {'functional', 'locational'},
            'source': {'resource_specific', 'generic'},
            'categories': None,
            'resources': None,
            'databases': None,
            'parent': None,
            'fields': None,
            'transmitter': _const.BOOLEAN_VALUES,
            'receiver': _const.BOOLEAN_VALUES,
            'trans': _const.BOOLEAN_VALUES,
            'rec': _const.BOOLEAN_VALUES,
            'secreted': _const.BOOLEAN_VALUES,
            'plasma_membrane_peripheral': _const.BOOLEAN_VALUES,
            'plasma_membrane_transmembrane': _const.BOOLEAN_VALUES,
            'sec': _const.BOOLEAN_VALUES,
            'pmp': _const.BOOLEAN_VALUES,
            'pmtm': _const.BOOLEAN_VALUES,
        },
        'complexes': {
            'header': None,
            'format': {'json', 'tab', 'text', 'tsv', 'table'},
            'license': {
                'ignore', 'academic',
                'non_profit', 'nonprofit',
                'for_profit', 'forprofit', 'commercial',
            },
            'password': None,
            'limit': None,
            'databases': None,
            'resources': None,
            'proteins': None,
            'fields': None,
        },
        'resources': {
            'license': {
                'ignore', 'academic',
                'non_profit', 'nonprofit',
                'for_profit', 'forprofit', 'commercial',
            },
            'format': {'json'},
            'datasets': {
                'interactions', 'interaction', 'network',
                'enzsub', 'enz_sub', 'enzyme-substrate',
                'annotations', 'annotation', 'annot',
                'intercell', 'complex', 'complexes',
            },
            'subtypes': None,
        },
        'queries': {
            'format': {'tab', 'text', 'tsv', 'table', 'json'},
        },
    }
    query_type_synonyms = {
        'interactions': 'interactions',
        'interaction': 'interactions',
        'network': 'interactions',
        'enz_sub': 'enzsub',
        'enz-sub': 'enzsub',
        'ptms': 'enzsub',
        'ptm': 'enzsub',
        'enzyme-substrate': 'enzsub',
        'enzyme_substrate': 'enzsub',
        'annotations': 'annotations',
        'annotation': 'annotations',
        'annot': 'annotations',
        'intercell': 'intercell',
        'intercellular': 'intercell',
        'inter_cell': 'intercell',
        'inter-cell': 'intercell',
        'complex': 'complexes',
        'complexes': 'complexes',
    }
    datasets_ = {
        'omnipath', 'tfregulons', 'dorothea', 'collectri', 'tf_target',
        'kinaseextra', 'ligrecextra', 'pathwayextra', 'mirnatarget',
        'tf_mirna', 'lncrna_mrna', 'small_molecule',
    }
    dorothea_methods = {'curated', 'coexp', 'chipseq', 'tfbs'}
    dataset2type = {
        'omnipath': 'post_translational',
        'tfregulons': 'transcriptional',
        'dorothea': 'transcriptional',
        'collectri': 'transcriptional',
        'tf_target': 'transcriptional',
        'kinaseextra': 'post_translational',
        'ligrecextra': 'post_translational',
        'pathwayextra': 'post_translational',
        'mirnatarget': 'post_transcriptional',
        'tf_mirna': 'mirna_transcriptional',
        'lncrna_mrna': 'lncrna_post_transcriptional',
        'small_molecule': 'small_molecule_protein',
    }
    interaction_fields = {
        'references', 'sources',
        'dorothea_level', 'dorothea_curated', 'dorothea_chipseq',
        'dorothea_tfbs', 'dorothea_coexp',
        'tfregulons_level', 'tfregulons_curated', 'tfregulons_chipseq',
        'tfregulons_tfbs', 'tfregulons_coexp',
        'type', 'ncbi_tax_id', 'databases', 'organism',
        'curation_effort', 'resources', 'entity_type',
        'datasets', 'extra_attrs', 'evidences',
    }
    enzsub_fields = {
        'references', 'sources', 'databases', 'isoforms',
        'organism', 'ncbi_tax_id', 'curation_effort', 'resources',
    }
    default_input_files = {
        'interactions': 'omnipath_webservice_interactions.tsv',
        'enzsub': 'omnipath_webservice_enz_sub.tsv',
        'annotations': 'omnipath_webservice_annotations.tsv',
        'complexes': 'omnipath_webservice_complexes.tsv',
        'intercell': 'omnipath_webservice_intercell.tsv',
    }
    default_dtypes = collections.defaultdict(
        dict,
        interactions = {
            'source': 'category',
            'target': 'category',
            'source_genesymbol': 'category',
            'target_genesymbol': 'category',
            'is_directed': 'int8',
            'is_stimulation': 'int8',
            'is_inhibition': 'int8',
            'consensus_direction': 'int8',
            'consensus_stimulation': 'int8',
            'consensus_inhibition': 'int8',
            'sources': 'category',
            'references': 'category',
            'dorothea_curated': 'category',
            'dorothea_chipseq': 'category',
            'dorothea_tfbs': 'category',
            'dorothea_coexp': 'category',
            'dorothea_level': 'category',
            'type': 'category',
            'ncbi_tax_id_source': 'int16',
            'ncbi_tax_id_target': 'int16',
            'entity_type_source': 'category',
            'entity_type_target': 'category',
            'curation_effort': 'int16',
            'extra_attrs': 'category',
            'evidences': 'category',
        },
        annotations = {
            'uniprot': 'category',
            'genesymbol': 'category',
            'entity_type': 'category',
            'source': 'category',
            'label': 'category',
            'value': 'category',
            'record_id': 'uint32',
        },
        enzsub = {
            'enzyme': 'category',
            'substrate': 'category',
            'enzyme_genesymbol': 'category',
            'substrate_genesymbol': 'category',
            'isoforms': 'category',
            'residue_type': 'category',
            'residue_offset': 'uint16',
            'modification': 'category',
            'sources': 'category',
            'references': 'category',
            'ncbi_tax_id': 'int16',
            'curation_effort': 'int32',
        },
        complexes = {
            'name': 'category',
            'stoichiometry': 'category',
            'sources': 'category',
            'references': 'category',
            'identifiers': 'category',
        },
        intercell = {
            'category': 'category',
            'database': 'category',
            'uniprot': 'category',
            'genesymbol': 'category',
            'parent': 'category',
            'aspect': 'category',
            'scope': 'category',
            'source': 'category',
            'entity_type': 'category',
            'consensus_score': 'uint16',
            'transmitter': 'bool',
            'receiver': 'bool',
            'secreted': 'bool',
            'plasma_membrane_transmembrane': 'bool',
            'plasma_membrane_peripheral': 'bool',
        },
    )
    # the annotation attributes served for the cytoscape app
    cytoscape_attributes = {
        ('Zhong2015', 'type'),
        ('MatrixDB', 'mainclass'),
        ('Matrisome', ('mainclass', 'subclass', 'subsubclass')),
        # ('TFcensus', 'in TFcensus'),
        ('Locate', ('location', 'cls')),
        (
            'Phosphatome',
            (
                'family',
                'subfamily',
                #'has_protein_substrates',
            )
        ),
        ('CancerSEA', 'state'),
        ('GO_Intercell', 'mainclass'),
        ('Adhesome', 'mainclass'),
        ('SignaLink3', 'pathway'),
        (
            'HPA_secretome',
            (
                'mainclass',
                #'secreted',
            )
        ),
        (
            'OPM',
            (
                'membrane',
                'family',
                #'transmembrane',
            )
        ),
        ('KEGG', 'pathway'),
        #(
            #'CellPhoneDB',
            #(
                ## 'receptor',
                ## 'peripheral',
                ## 'secreted',
                ## 'transmembrane',
                ## 'receptor_class',
                ## 'secreted_class',
            #)
        #),
        ('kinase.com', ('group', 'family', 'subfamily')),
        ('Membranome', ('membrane',)),
        #('CSPA', 'in CSPA'),
        #('MSigDB', 'geneset'),
        #('Integrins', 'in Integrins'),
        ('HGNC', 'mainclass'),
        ('CPAD', ('pathway', 'effect_on_cancer', 'cancer')),
        ('Signor', 'pathway'),
        ('Ramilowski2015', 'mainclass'),
        ('HPA_subcellular', 'location'),
        #('DisGeNet', 'disease'),
        ('Surfaceome', ('mainclass', 'subclasses')),
        ('IntOGen', 'role'),
        ('HPMR', ('role', 'mainclass', 'subclass', 'subsubclass')),
        #('CancerGeneCensus',
            #(
                ##'hallmark',
                ##'somatic',
                ##'germline',
                #'tumour_types_somatic',
                #'tumour_types_germline',
            #)
        #),
        #('DGIdb', 'category'),
        ('ComPPI', 'location'),
        ('Exocarta', 'vesicle'),
        ('Vesiclepedia', 'vesicle'),
        ('Ramilowski_location', 'location'),
        ('LRdb', ('role', 'cell_type')),
    }
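
    # The narrow and categorical dtypes in `default_dtypes` above keep the
    # loaded tables small in memory. A sketch of the effect (file name from
    # `default_input_files`; the exact saving depends on the data):
    #
    #     df = pd.read_csv(
    #         'omnipath_webservice_interactions.tsv',
    #         sep = '\t',
    #         dtype = TableServer.default_dtypes['interactions'],
    #     )
    #     df.memory_usage(deep = True).sum()  # far below the default dtypes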
    def __init__(
            self,
            input_files = None,
            only_tables = None,
            exclude_tables = None,
        ):
        """
        Server based on ``pandas`` data frames.

        :param dict input_files:
            Paths to tables exported by the ``pypath.websrvtab`` module.
        """

        session_mod.Logger.__init__(self, name = 'server')

        self._log('TableServer starting up.')

        self.input_files = copy.deepcopy(self.default_input_files)
        self.input_files.update(input_files or {})
        self.data = {}

        self.to_load = (
            self.data_query_types - common.to_set(exclude_tables)
            if only_tables is None else
            common.to_set(only_tables)
        )

        self._log('Datasets to load: %s.' % (', '.join(sorted(self.to_load))))

        self._read_tables()

        self._preprocess_interactions()
        self._preprocess_enzsub()
        self._preprocess_annotations()
        self._preprocess_complexes()
        self._preprocess_intercell()
        self._update_resources()

        BaseServer.__init__(self)
        self._log('TableServer startup ready.')
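
    # Construction sketch (the paths are hypothetical): serve only selected
    # tables from custom exports produced by `pypath.websrvtab`:
    #
    #     server = TableServer(
    #         input_files = {
    #             'interactions': '/data/omnipath_webservice_interactions.tsv',
    #         },
    #         only_tables = {'interactions'},
    #     )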
    def _read_tables(self):

        self._log('Loading data tables.')

        for name, fname in iteritems(self.input_files):

            if name not in self.to_load:
                continue

            fname_gz = f'{fname}.gz'
            fname = fname_gz if os.path.exists(fname_gz) else fname

            self._log('Loading dataset `%s` from file `%s`.' % (name, fname))

            if not os.path.exists(fname):

                self._log('Missing table: `%s`.' % fname)
                continue

            dtype = self.default_dtypes[name]

            self.data[name] = pd.read_csv(
                fname,
                sep = '\t',
                index_col = False,
                dtype = dtype,
            )

            self._log('Table `%s` loaded from file `%s`.' % (name, fname))

    def _network(self, req):

        hdr = ['nodes', 'edges', 'is_directed', 'sources']
        tbl = self.data['network']
        val = dict(zip(tbl.field, tbl.value))

        if b'format' in req.args and req.args[b'format'][0] == b'json':

            return json.dumps(val)

        else:

            return '%s\n%s' % (
                '\t'.join(hdr),
                '\t'.join([str(val[h]) for h in hdr]),
            )

    def _preprocess_interactions(self):

        if 'interactions' not in self.data:
            return

        self._log('Preprocessing interactions.')
        tbl = self.data['interactions']
        tbl['set_sources'] = pd.Series(
            [set(s.split(';')) for s in tbl.sources]
        )
        tbl['set_dorothea_level'] = pd.Series(
            [
                set(s.split(';')) if not pd.isnull(s) else set([])
                for s in tbl.dorothea_level
            ]
        )

    def _preprocess_enzsub(self):

        if 'enzsub' not in self.data:
            return

        self._log('Preprocessing enzyme-substrate relationships.')
        tbl = self.data['enzsub']
        tbl['set_sources'] = pd.Series(
            [set(s.split(';')) for s in tbl.sources]
        )

    def _preprocess_complexes(self):

        if 'complexes' not in self.data:
            return

        self._log('Preprocessing complexes.')
        tbl = self.data['complexes']
        tbl = tbl[~tbl.components.isna()]

        with ignore_pandas_copywarn():

            tbl['set_sources'] = [set(s.split(';')) for s in tbl.sources]
            tbl['set_proteins'] = [set(c.split('_')) for c in tbl.components]

        self.data['complexes'] = tbl

    def _preprocess_annotations_old(self):

        if 'annotations' not in self.data:
            return

        renum = re.compile(r'[-\d\.]+')

        def _agg_values(vals):

            result = (
                '#'.join(sorted(set(str(ii) for ii in vals)))
                if not all(
                    isinstance(i, (int, float)) or (
                        isinstance(i, str) and
                        i and (
                            i is None or
                            renum.match(i)
                        )
                    )
                    for i in vals
                ) else
                '<numeric>'
            )

            return result

        self._log('Preprocessing annotations.')

        self.data['annotations_summary'] = self.data['annotations'].groupby(
            ['source', 'label'],
        ).agg({'value': _agg_values}).reset_index(drop = False)

    def _preprocess_annotations(self):

        if 'annotations' not in self.data:
            return

        renum = re.compile(r'[-\d\.]+')

        self._log('Preprocessing annotations.')

        values_by_key = collections.defaultdict(set)

        # we need to do it this way as we are memory limited on the server
        # and pandas groupby is very memory intensive
        for row in self.data['annotations'].itertuples():

            value = (
                '<numeric>'
                if (
                    (
                        not isinstance(row.value, bool) and
                        isinstance(row.value, (int, float))
                    ) or
                    renum.match(row.value)
                ) else
                str(row.value)
            )

            values_by_key[(row.source, row.label)].add(value)

        for vals in values_by_key.values():

            if len(vals) > 1:
                vals.discard('<numeric>')

            vals.discard('')
            vals.discard('nan')

        self.data['annotations_summary'] = pd.DataFrame(
            list(
                (source, label, '#'.join(sorted(values)))
                for (source, label), values in iteritems(values_by_key)
            ),
            columns = ['source', 'label', 'value'],
        )

    def _preprocess_intercell(self):

        if 'intercell' not in self.data:
            return

        self._log('Preprocessing intercell data.')
        tbl = self.data['intercell']
        tbl.drop('full_name', axis = 1, inplace = True, errors = 'ignore')
        self.data['intercell_summary'] = tbl.filter(
            ['category', 'parent', 'database'],
        ).drop_duplicates()

    def _update_resources(self):

        self._log('Updating resource information.')

        self._resources_dict = collections.defaultdict(dict)

        res_ctrl = resources_mod.get_controller()

        for query_type in self.data_query_types:

            if query_type not in self.data:
                continue

            tbl = self.data[query_type]

            # finding out the name of the column with the resources,
            # as this is different across the tables
            for colname, argname in (
                ('database', 'databases'),
                ('sources', 'databases'),
                ('source', 'databases'),
                ('category', 'categories'),
            ):
                if colname in tbl.columns:
                    break

            # collecting all resource names
            values = sorted(set(
                itertools.chain(*(
                    val.split(';') for val in getattr(tbl, colname)
                ))
            ))

            for db in values:

                if 'license' not in self._resources_dict[db]:

                    license = res_ctrl.license(db)

                    if license is None:

                        msg = 'No license for resource `%s`.' % str(db)
                        self._log(msg)
                        raise RuntimeError(msg)

                    license_data = license.features
                    license_data['name'] = license.name
                    license_data['full_name'] = license.full_name
                    self._resources_dict[db]['license'] = license_data

                if 'queries' not in self._resources_dict[db]:

                    self._resources_dict[db]['queries'] = {}

                if query_type not in self._resources_dict[db]['queries']:

                    if query_type == 'interactions':

                        datasets = set()

                        for dataset in self.datasets_:

                            if dataset not in tbl.columns:
                                continue

                            for in_dataset, resources in zip(
                                getattr(tbl, dataset),
                                tbl.set_sources,
                            ):

                                if in_dataset and db in resources:

                                    datasets.add(dataset)
                                    break

                        self._resources_dict[db]['queries'][query_type] = {
                            'datasets': sorted(datasets),
                        }

                    elif query_type == 'intercell':

                        tbl_db = tbl[
                            (tbl.database == db) &
                            (tbl.scope == 'generic')
                        ]

                        self._resources_dict[db]['queries'][query_type] = {
                            'generic_categories': sorted(
                                set(tbl_db.category)
                            ),
                        }

                    else:

                        self._resources_dict[db]['queries'][query_type] = {}

            self.args_reference[query_type][argname] = values

        self._resources_dict = dict(self._resources_dict)

        self._log('Finished updating resource information.')

    def _check_args(self, req):

        result = []
        argname = req.postpath[0]
        ref = (
            self.args_reference['resources']
            if argname == 'databases' else
            self.args_reference[argname]
        )

        for arg, val in iteritems(req.args):

            arg = arg.decode('utf-8')

            if arg in ref:

                if not ref[arg] or not val:
                    continue

                val = (
                    {val[0]}
                    if type(val[0]) is int else
                    set(val[0].decode('utf-8').split(','))
                )

                unknowns = val - set(ref[arg])

                if unknowns:

                    result.append(
                        ' ==> Unknown values for argument `%s`: `%s`' % (
                            arg,
                            ', '.join(str(u) for u in unknowns)
                        )
                    )

            else:

                result.append(' ==> Unknown argument: `%s`' % arg)

        req.args[b'header'] = self._parse_arg(req.args[b'header'])

        if result:

            return (
                'Something is not entirely good:\n%s\n\n'
                'Please check the examples at\n'
                'https://github.com/saezlab/pypath\n'
                'and\n'
                'https://github.com/saezlab/DoRothEA\n'
                'If you are still experiencing issues contact us at\n'
                'https://github.com/saezlab/pypath/issues'
                '' % '\n'.join(result)
            )

    def queries(self, req):

        query_type = (
            req.postpath[1]
            if len(req.postpath) > 1 else
            'interactions'
        )
        query_type = self._query_type(query_type)
        query_param = (
            req.postpath[2]
            if len(req.postpath) > 2 else
            None
        )

        if query_type in self.args_reference:

            result = dict(
                (
                    k,
                    sorted(v) if isinstance(v, _const.LIST_LIKE) else v,
                )
                for k, v in self.args_reference[query_type].items()
            )

            if query_param is not None and query_param in result:

                result = {query_param: result[query_param]}

        else:

            result = {}
            result[query_type] = (
                'No possible arguments defined for '
                'query `%s` or no such query available.' % query_type
            )

        result = self._dict_set_to_list(result)

        if b'format' in req.args and req.args[b'format'][0] == b'json':

            return json.dumps(result)

        else:

            return 'argument\tvalues\n%s' % '\n'.join(
                '%s\t%s' % (
                    k,
                    ';'.join(v)
                    if isinstance(v, (list, set, tuple)) else
                    str(v)
                )
                for k, v in iteritems(result)
            )

    @classmethod
    def _dict_set_to_list(cls, dct):

        return dict(
            (
                key,
                (
                    sorted(val)
                    if isinstance(val, _const.LIST_LIKE) else
                    cls._dict_set_to_list(val)
                    if isinstance(val, dict) else
                    val
                )
            )
            for key, val in iteritems(dct)
        )

    def databases(self, req):

        query_type = (
            req.postpath[1]
            if len(req.postpath) > 1 else
            'interactions'
        )
        query_type = self._query_type(query_type)
        datasets = (
            set(req.postpath[2].split(','))
            if len(req.postpath) > 2 else
            None
        )
        tbl = (
            self.data[query_type]
            if query_type in self.data else
            self.data['interactions']
        )

        # filter for datasets
        if query_type == 'interactions':

            if datasets is not None:

                tbl = tbl.loc[tbl.type.isin(datasets)]

            else:

                datasets = self._get_datasets()

            result = {}

            for dataset in datasets:

                result[dataset] = sorted(set.union(
                    *tbl[tbl.type == dataset].set_sources
                ))

        else:

            result = {}
            result['*'] = sorted(set.union(*tbl.set_sources))

        if b'format' in req.args and req.args[b'format'][0] == b'json':

            return json.dumps(result)

        else:

            return 'dataset\tresources\n%s' % '\n'.join(
                '%s\t%s' % (k, ';'.join(v)) for k, v in iteritems(result)
            )

    def _get_datasets(self):

        return list(self.data['interactions'].type.unique())

    def datasets(self, req):

        query_type = (
            req.postpath[1]
            if len(req.postpath) > 1 else
            'interactions'
        )

        if query_type == 'interactions':

            result = self._get_datasets()

        else:

            result = []

        if b'format' in req.args and req.args[b'format'][0] == b'json':

            return json.dumps(result)

        else:

            return ';'.join(result)

    def interactions(
            self,
            req,
            datasets = {'omnipath'},
            databases = None,
            dorothea_levels = {'A', 'B'},
            organisms = {9606},
            source_target = 'OR',
        ):

        bad_req = self._check_args(req)

        if bad_req:
            return bad_req

        hdr = [
            'source', 'target',
            'is_directed', 'is_stimulation', 'is_inhibition',
            'consensus_direction',
            'consensus_stimulation',
            'consensus_inhibition',
        ]

        if b'source_target' in req.args:

            source_target = (
                req.args[b'source_target'][0].decode('utf-8').upper()
            )

        # changes the old "tfregulons" names to the new "dorothea"
        self._tfregulons_dorothea(req)

        if b'databases' in req.args:

            req.args[b'resources'] = req.args[b'databases']

        args = {}

        for arg in (
            'datasets', 'types', 'sources', 'targets', 'partners',
            'resources', 'organisms', 'dorothea_levels',
            'dorothea_methods',
        ):
            args[arg] = self._args_set(req, arg)

        # here adjust on the defaults, otherwise we would serve an
        # empty response by default
        if not args['types']:

            args['datasets'] = args['datasets'] or datasets

        # keep only valid dataset names
        args['datasets'] = args['datasets'] & self.datasets_

        args['organisms'] = set(
            int(t) for t in args['organisms'] if t.isdigit()
        )
        args['organisms'] = args['organisms'] or organisms

        # do not allow impossible values:
        # those would result in KeyError later
        args['dorothea_levels'] = (
            args['dorothea_levels'] or dorothea_levels
        )
        args['dorothea_methods'] = (
            args['dorothea_methods'] & self.dorothea_methods
        )

        # provide genesymbols: yes or no
        if (
            b'genesymbols' in req.args and
            self._parse_arg(req.args[b'genesymbols'])
        ):
            genesymbols = True
            hdr.insert(2, 'source_genesymbol')
            hdr.insert(3, 'target_genesymbol')
        else:
            genesymbols = False

        self._log('Processed arguments: [%s].' % common.dict_str(args))

        # starting from the entire dataset
        tbl = self.data['interactions']

        # filter by type
        if args['types']:

            tbl = tbl.loc[tbl.type.isin(args['types'])]

        # if partners are provided those will overwrite
        # sources and targets
        args['sources'] = args['sources'] or args['partners']
        args['targets'] = args['targets'] or args['partners']

        # then we filter by source and target,
        # which are matched against both standard names
        # and gene symbols
        if args['sources'] and args['targets'] and source_target == 'OR':

            tbl = tbl.loc[
                tbl.target.isin(args['targets']) |
                tbl.target_genesymbol.isin(args['targets']) |
                tbl.source.isin(args['sources']) |
                tbl.source_genesymbol.isin(args['sources'])
            ]

        else:

            if args['sources']:

                tbl = tbl.loc[
                    tbl.source.isin(args['sources']) |
                    tbl.source_genesymbol.isin(args['sources'])
                ]

            if args['targets']:

                tbl = tbl.loc[
                    tbl.target.isin(args['targets']) |
                    tbl.target_genesymbol.isin(args['targets'])
                ]

        # filter by datasets
        if args['datasets']:

            tbl = tbl.query(' or '.join(args['datasets']))

        # filter by organism
        tbl = tbl.loc[
            tbl.ncbi_tax_id_source.isin(args['organisms']) |
            tbl.ncbi_tax_id_target.isin(args['organisms'])
        ]

        dorothea_included = (
            'dorothea' in args['datasets'] or
            any(res.endswith('DoRothEA') for res in args['resources']) or
            (
                'transcriptional' in args['types'] and
                not args['datasets']
            )
        )

        # filter by DoRothEA confidence levels
        if dorothea_included and args['dorothea_levels']:

            tbl = tbl.loc[
                self._dorothea_dataset_filter(tbl, args) |
                [
                    bool(levels & args['dorothea_levels'])
                    for levels in tbl.set_dorothea_level
                ]
            ]

        # filter by databases
        if args['resources']:

            tbl = tbl.loc[
                [
                    bool(sources & args['resources'])
                    for sources in tbl.set_sources
                ]
            ]

        # filtering for entity types
        if b'entity_types' in req.args:

            entity_types = self._args_set(req, 'entity_types')

            if len(entity_types) == 1 and 'protein' in entity_types:

                # pandas is awful:
                tbl = tbl.loc[
                    np.logical_and(
                        tbl.entity_type_source.astype('string') == 'protein',
                        tbl.entity_type_target.astype('string') == 'protein',
                    )
                ]

            else:

                tbl = tbl.loc[
                    tbl.entity_type_source.isin(entity_types) |
                    tbl.entity_type_target.isin(entity_types)
                ]

        # filtering by DoRothEA methods
        if dorothea_included and args['dorothea_methods']:

            q = ['dorothea_%s' % m for m in args['dorothea_methods']]

            tbl = tbl.loc[
                self._dorothea_dataset_filter(tbl, args) |
                tbl[q].any(axis = 1)
            ]

        # filter directed & signed
        if (
            b'directed' not in req.args or
            self._parse_arg(req.args[b'directed'])
        ):

            tbl = tbl.loc[tbl.is_directed == 1]

        if (
            b'signed' in req.args and
            self._parse_arg(req.args[b'signed'])
        ):

            tbl = tbl.loc[np.logical_or(
                tbl.is_stimulation == 1,
                tbl.is_inhibition == 1
            )]

        # loops: remove by default
        if (
            b'loops' not in req.args or
            not self._parse_arg(req.args[b'loops'])
        ):

            # pandas is a disaster:
            tbl = tbl.loc[
                tbl.source.astype('string') != tbl.target.astype('string')
            ]

        req.args[b'fields'] = req.args[b'fields'] or [b'']

        _fields = [
            f
            for f in req.args[b'fields'][0].decode('utf-8').split(',')
            if f in self.interaction_fields
        ]

        for f in (b'evidences', b'extra_attrs'):

            if f in req.uri and f not in req.args[b'fields'][0]:

                _fields.append(f.decode('utf-8'))

        for f in _fields:

            if f == 'ncbi_tax_id' or f == 'organism':

                hdr.append('ncbi_tax_id_source')
                hdr.append('ncbi_tax_id_target')

            elif f == 'entity_type':

                hdr.append('entity_type_source')
                hdr.append('entity_type_target')

            elif f in {'databases', 'resources'}:

                hdr.append('sources')

            elif f == 'datasets':

                hdr.extend(
                    set(tbl.columns) &
                    self.args_reference['interactions']['datasets'] &
                    args['datasets']
                )

            else:

                hdr.append(f)

        license = self._get_license(req)
        tbl = self._filter_by_license_interactions(tbl, license)

        tbl = tbl.loc[:,hdr]

        return self._serve_dataframe(tbl, req)

    @classmethod
    def _dataset_included(cls, dataset: str, args: dict) -> bool:

        return (
            dataset in args['datasets'] or
            (
                not args['datasets'] and
                cls.dataset2type.get(dataset, None) in args['types']
            )
        )

    @classmethod
    def _dorothea_dataset_filter(cls, tbl: pd.DataFrame, args: dict):

        return (
            (
                # if the tf_target dataset is requested
                # we need to serve it including the parts which
                # don't fit the filters below
                cls._dataset_included('tf_target', args) &
                tbl.tf_target
            ) |
            (
                cls._dataset_included('collectri', args) &
                tbl.collectri
            ) |
            (tbl.type != 'transcriptional')
        )

    def _tfregulons_dorothea(self, req):

        for arg in (b'datasets', b'fields'):

            if arg in req.args:

                req.args[arg] = [
                    it.replace(b'tfregulons', b'dorothea')
                    for it in req.args[arg]
                ]

        for postfix in (b'levels', b'methods'):

            key = b'tfregulons_%s' % postfix
            new_key = b'dorothea_%s' % postfix

            if key in req.args and new_key not in req.args:

                req.args[new_key] = req.args[key]
                _ = req.args.pop(key)

    def enzsub(
            self,
            req,
            organisms = {9606},
            enzyme_substrate = 'OR',
        ):

        bad_req = self._check_args(req)

        if bad_req:
            return bad_req

        hdr = [
            'enzyme', 'substrate', 'residue_type',
            'residue_offset', 'modification',
        ]

        if b'enzyme_substrate' in req.args:

            enzyme_substrate = (
                req.args[b'enzyme_substrate'][0].decode('utf-8').upper()
            )

        if b'databases' in req.args:

            req.args[b'resources'] = req.args[b'databases']

        args = {}

        for arg in (
            'enzymes', 'substrates', 'partners',
            'resources', 'organisms', 'types', 'residues',
        ):
            args[arg] = self._args_set(req, arg)

        args['organisms'] = set(
            int(t) for t in args['organisms'] if t.isdigit()
        )
        args['organisms'] = args['organisms'] or organisms

        # provide genesymbols: yes or no
        if (
            b'genesymbols' in req.args and
            self._parse_arg(req.args[b'genesymbols'])
        ):
            genesymbols = True
            hdr.insert(2, 'enzyme_genesymbol')
            hdr.insert(3, 'substrate_genesymbol')
        else:
            genesymbols = False

        # starting from the entire dataset
        tbl = self.data['enzsub']

        # filter by type
        if args['types']:

            tbl = tbl.loc[tbl.modification.isin(args['types'])]

        # if partners are provided those will overwrite
        # enzymes and substrates
        args['enzymes'] = args['enzymes'] or args['partners']
        args['substrates'] = args['substrates'] or args['partners']

        # then we filter by enzyme and substrate,
        # which are matched against both standard names
        # and gene symbols
        if (
            args['enzymes'] and
            args['substrates'] and
            enzyme_substrate == 'OR'
        ):

            tbl = tbl.loc[
                tbl.substrate.isin(args['substrates']) |
                tbl.substrate_genesymbol.isin(args['substrates']) |
                tbl.enzyme.isin(args['enzymes']) |
                tbl.enzyme_genesymbol.isin(args['enzymes'])
            ]

        else:

            if args['enzymes']:

                tbl = tbl.loc[
                    tbl.enzyme.isin(args['enzymes']) |
                    tbl.enzyme_genesymbol.isin(args['enzymes'])
                ]

            if args['substrates']:

                tbl = tbl.loc[
                    tbl.substrate.isin(args['substrates']) |
                    tbl.substrate_genesymbol.isin(args['substrates'])
                ]

        # filter by organism
        tbl = tbl.loc[tbl.ncbi_tax_id.isin(args['organisms'])]

        # filter by databases
        if args['resources']:

            tbl = tbl.loc[
                [
                    bool(args['resources'] & sources)
                    for sources in tbl.set_sources
                ]
            ]

        if req.args[b'fields']:

            _fields = [
                f
                for f in req.args[b'fields'][0].decode('utf-8').split(',')
                if f in self.enzsub_fields
            ]

            for f in _fields:

                if f == 'ncbi_tax_id' or f == 'organism':

                    hdr.append('ncbi_tax_id')

                elif f in {'databases', 'resources'}:

                    hdr.append('sources')

                else:

                    hdr.append(f)

        license = self._get_license(req)
        tbl = self._filter_by_license_interactions(tbl, license)

        tbl = tbl.loc[:,hdr]

        return self._serve_dataframe(tbl, req)

    def ptms(self, req):

        req.postpath[0] = 'enzsub'

        return self.enzsub(req)

    def enz_sub(self, req):

        req.postpath[0] = 'enzsub'

        return self.enzsub(req)

    def annotations(self, req):

        bad_req = self._check_args(req)

        if bad_req:
            return bad_req

        if b'databases' in req.args:

            req.args[b'resources'] = req.args[b'databases']

        if (
            not settings.get('server_annotations_full_download') and
            not b'resources' in req.args and
            not b'proteins' in req.args
        ):

            return (
                'Downloading the entire annotations database by the REST '
                'API is not allowed because of its huge size (>1GB). '
                'We recommend querying a set of proteins or a few '
                'resources, depending on your interest. '
                'You can always download the full database from '
                'https://archive.omnipathdb.org/'
                'omnipath_webservice_annotations__recent.tsv'
            )

        # starting from the entire dataset
        tbl = self.data['annotations']

        hdr = tbl.columns

        # filtering for resources
        if b'resources' in req.args:

            resources = self._args_set(req, 'resources')
            tbl = tbl.loc[tbl.source.isin(resources)]

        # filtering for entity types
        if b'entity_types' in req.args:

            entity_types = self._args_set(req, 'entity_types')
            tbl = tbl.loc[tbl.entity_type.isin(entity_types)]

        # filtering for proteins
        if b'proteins' in req.args:

            proteins = self._args_set(req, 'proteins')
            tbl = tbl.loc[
                tbl.uniprot.isin(proteins) |
                tbl.genesymbol.isin(proteins)
            ]

        # provide genesymbols: yes or no
        if (
            b'genesymbols' in req.args and
            self._parse_arg(req.args[b'genesymbols'])
        ):
            genesymbols = True
            hdr.insert(1, 'genesymbol')
        else:
            genesymbols = False

        license = self._get_license(req)
        tbl = self._filter_by_license_annotations(tbl, license)

        tbl = tbl.loc[:,hdr]

        return self._serve_dataframe(tbl, req)

    def annotations_summary(self, req):

        bad_req = self._check_args(req)

        if bad_req:
            return bad_req

        if b'databases' in req.args:

            req.args[b'resources'] = req.args[b'databases']

        # starting from the entire dataset
        tbl = self.data['annotations_summary']

        hdr = tbl.columns

        # filtering for resources
        if b'resources' in req.args:

            resources = self._args_set(req, 'resources')
            tbl = tbl.loc[tbl.source.isin(resources)]

        cytoscape = (
            b'cytoscape' in req.args and
            self._parse_arg(req.args[b'cytoscape'])
        )

        tbl = tbl.loc[:,hdr]

        if cytoscape:

            tbl = tbl.set_index(['source', 'label'], drop = False)

            cytoscape_keys = {
                (source, label)
                for source, labels in self.cytoscape_attributes
                for label in (
                    labels if isinstance(labels, tuple) else (labels,)
                )
            } & set(tbl.index)

            tbl = tbl.loc[list(cytoscape_keys)]

        return self._serve_dataframe(tbl, req)

    def intercell(self, req):

        bad_req = self._check_args(req)

        if bad_req:
            return bad_req

        if b'databases' in req.args:

            req.args[b'resources'] = req.args[b'databases']

        # starting from the entire dataset
        tbl = self.data['intercell']

        hdr = tbl.columns

        # filtering for category types
        for var in (
            'aspect',
            'source',
            'scope',
            'transmitter',
            'receiver',
            'parent',
            'resources',
        ):

            if var.encode('ascii') in req.args:

                values = self._args_set(req, var)

                if var in {'resources', 'databases'}:

                    var = 'database'

                tbl = tbl.loc[getattr(tbl, var).isin(values)]

        for _long, short in (
            ('transmitter', 'trans'),
            ('receiver', 'rec'),
            ('secreted', 'sec'),
            ('plasma_membrane_peripheral', 'pmp'),
            ('plasma_membrane_transmembrane', 'pmtm'),
        ):

            this_arg = None
            _long_b = _long.encode('ascii')
            short_b = short.encode('ascii')

            if _long_b in req.args:

                this_arg = self._parse_arg(req.args[_long_b])

            elif short_b in req.args:

                this_arg = self._parse_arg(req.args[short_b])

            if this_arg is not None:

                tbl = tbl.loc[getattr(tbl, _long) == this_arg]

        if b'causality' in req.args:

            causality = self._args_set(req, 'causality')

            trans = causality & {'transmitter', 'trans', 'both'}
            rec = causality & {'receiver', 'rec', 'both'}

            tbl = (
                tbl.loc[tbl.transmitter | tbl.receiver]
                if trans and rec else
                tbl.loc[tbl.transmitter]
                if trans else
                tbl.loc[tbl.receiver]
                if rec else
                tbl
            )

        if b'topology' in req.args:

            topology = self._args_set(req, 'topology')

            query = ' or '.join(
                colname
                for enabled, colname in (
                    (topology & {'secreted', 'sec'}, 'secreted'),
                    (
                        topology & {'plasma_membrane_peripheral', 'pmp'},
                        'plasma_membrane_peripheral',
                    ),
                    (
                        topology & {'plasma_membrane_transmembrane', 'pmtm'},
                        'plasma_membrane_transmembrane',
                    ),
                )
                if enabled
            )

            if query:

                tbl = tbl.query(query)

        # filtering for categories
        if b'categories' in req.args:

            categories = self._args_set(req, 'categories')
            tbl = tbl.loc[tbl.category.isin(categories)]

        # filtering for entity types
        if b'entity_types' in req.args:

            entity_types = self._args_set(req, 'entity_types')
            tbl = tbl.loc[tbl.entity_type.isin(entity_types)]

        # filtering for proteins
        if b'proteins' in req.args:

            proteins = self._args_set(req, 'proteins')
            tbl = tbl.loc[
                np.logical_or(
                    tbl.uniprot.isin(proteins),
                    tbl.genesymbol.isin(proteins),
                )
            ]

        license = self._get_license(req)
        tbl = self._filter_by_license_intercell(tbl, license)

        tbl = tbl.loc[:,hdr]

        return self._serve_dataframe(tbl, req)

    def intercell_summary(self, req):

        bad_req = self._check_args(req)

        if bad_req:
            return bad_req

        if b'databases' in req.args:

            req.args[b'resources'] = req.args[b'databases']

        # starting from the entire dataset
        tbl = self.data['intercell_summary']

        hdr = tbl.columns

        # filtering for category types
        for var in (
            'aspect',
            'source',
            'scope',
            'transmitter',
            'receiver',
            'parent',
            'resources',
        ):

            if var.encode('ascii') in req.args:

                values = self._args_set(req, var)
                tbl = tbl.loc[getattr(tbl, var).isin(values)]

        # filtering for categories
        if b'categories' in req.args:

            categories = self._args_set(req, 'categories')
            tbl = tbl.loc[tbl.category.isin(categories)]

        tbl = tbl.loc[:,hdr]

        return self._serve_dataframe(tbl, req)

    def complexes(self, req):

        bad_req = self._check_args(req)

        if bad_req:
            return bad_req

        if b'databases' in req.args:

            req.args[b'resources'] = req.args[b'databases']

        # starting from the entire dataset
        tbl = self.data['complexes']

        hdr = list(tbl.columns)
        hdr.remove('set_sources')
        hdr.remove('set_proteins')

        # filtering for resources
        if b'resources' in req.args:

            resources = self._args_set(req, 'resources')

            tbl = tbl.loc[
                [
                    bool(sources & resources)
                    for sources in tbl.set_sources
                ]
            ]

        # filtering for proteins
        if b'proteins' in req.args:

            proteins = self._args_set(req, 'proteins')

            tbl = tbl.loc[
                [
                    bool(this_proteins & proteins)
                    for this_proteins in tbl.set_proteins
                ]
            ]

        license = self._get_license(req)
        tbl = self._filter_by_license_complexes(tbl, license)

        tbl = tbl.loc[:,hdr]

        return self._serve_dataframe(tbl, req)

    def resources(self, req):

        datasets = (
            {
                self._query_type(dataset.decode('ascii'))
                for dataset in req.args[b'datasets']
            }
            if b'datasets' in req.args else
            None
        )

        res_ctrl = resources_mod.get_controller()

        license = self._get_license(req)

        return json.dumps(
            dict(
                (k, v)
                for k, v in iteritems(self._resources_dict)
                if (
                    res_ctrl.license(k).enables(license) and
                    (
                        not datasets or
                        # the dataset names here are the query types
                        # collected in `_update_resources`
                        datasets & set(v['queries'].keys())
                    )
                )
            )
        )

    @staticmethod
    def _get_license(req):

        return req.args[b'license'][0].decode('utf-8')

    @classmethod
    def _filter_by_license_complexes(cls, tbl, license):

        return cls._filter_by_license(
            tbl = tbl,
            license = license,
            res_col = 'sources',
            simple = False,
            prefix_col = 'identifiers',
        )

    @classmethod
    def _filter_by_license_interactions(cls, tbl, license):

        return cls._filter_by_license(
            tbl = tbl,
            license = license,
            res_col = 'sources',
            simple = False,
            prefix_col = 'references',
        )

    @classmethod
    def _filter_by_license_annotations(cls, tbl, license):

        return cls._filter_by_license(
            tbl = tbl,
            license = license,
            res_col = 'source',
            simple = True,
        )

    @classmethod
    def _filter_by_license_intercell(cls, tbl, license):

        return cls._filter_by_license(
            tbl = tbl,
            license = license,
            res_col = 'database',
            simple = True,
        )

    @staticmethod
    def _filter_by_license(
            tbl,
            license,
            res_col,
            simple = False,
            prefix_col = None,
        ):

        def filter_resources(res):

            res = {
                r for r in res
                if res_ctrl.license(r).enables(license)
            }

            composite = [
                r for r in res
                if res_ctrl.license(r).name == 'Composite'
            ]

            if composite:

                composite_to_remove = {
                    comp_res
                    for comp_res in composite
                    if not res_ctrl.secondary_resources(comp_res, True) & res
                }

                res = res - composite_to_remove

            return res

        if license == LICENSE_IGNORE or tbl.shape[0] == 0:

            return tbl

        res_ctrl = resources_mod.get_controller()

        _res_col = getattr(tbl, res_col)

        if simple:

            bool_idx = [
                res_ctrl.license(res).enables(license)
                for res in _res_col
            ]

        else:

            _set_res_col = tbl.set_sources
            _res_to_keep = [
                filter_resources(ress)
                for ress in _set_res_col
            ]

            with ignore_pandas_copywarn():

                tbl[res_col] = [
                    ';'.join(sorted(ress))
                    for ress in _res_to_keep
                ]

            if prefix_col:

                _prefix_col = getattr(tbl, prefix_col)
                _new_prefix_col = [
                    ';'.join(sorted(
                        pref_res
                        for pref_res in pref_ress.split(';')
                        if (
                            pref_res.split(':', maxsplit = 1)[0] in
                            _res_to_keep[i]
                        )
                    ))
                    if isinstance(pref_ress, str) else
                    pref_ress
                    for i, pref_ress in enumerate(_prefix_col)
                ]

                with ignore_pandas_copywarn():

                    tbl[prefix_col] = _new_prefix_col

            bool_idx = [bool(res) for res in tbl[res_col]]

        tbl = tbl.loc[bool_idx]

        return tbl

    @classmethod
    def _serve_dataframe(cls, tbl, req):

        if b'limit' in req.args:

            limit = req.args[b'limit'][0].decode('utf-8')

            if limit.isdigit():

                limit = int(limit)
                tbl = tbl.head(limit)

        if b'format' in req.args and req.args[b'format'][0] == b'json':

            data_json = tbl.to_json(orient = 'records')
            # this is necessary because in the data frame we keep lists
            # as `;` separated strings, but in json it is nicer to serve
            # them as lists
            data_json = json.loads(data_json)

            for i in data_json:

                for k, v in iteritems(i):

                    if k in cls.list_fields:

                        i[k] = (
                            [
                                (
                                    int(f)
                                    if (
                                        k in cls.int_list_fields and
                                        f.isdigit()
                                    ) else
                                    f
                                )
                                for f in v.split(';')
                            ]
                            if isinstance(v, str) else
                            []
                        )

            return json.dumps(data_json)

        else:

            return tbl.to_csv(
                sep = '\t',
                index = False,
                header = bool(req.args[b'header']),
                chunksize = 2e5,
            )

    @staticmethod
    def _args_set(req, arg):

        arg = arg.encode('utf-8')

        return (
            set(req.args[arg][0].decode('utf-8').split(','))
            if arg in req.args else
            set()
        )

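
# Illustrative queries the `TableServer` endpoints above answer once the
# server is running (the identifiers are examples only):
#
#     /interactions?genesymbols=1&fields=sources,references
#     /enzsub?enzymes=P06239&fields=sources
#     /complexes?proteins=EGFR&format=json
#     /queries/interactions
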
class Rest(object):
    def __init__(
            self,
            port,
            serverclass = TableServer,
            start = True,
            **kwargs
        ):
        """
        Runs a webserver serving a `PyPath` instance listening
        to a custom port.

        :param int port:
            The port to listen to.
        :param serverclass:
            The class implementing the server.
        :param **kwargs:
            Arguments for initialization of the server class.
        """

        self.port = port

        _log('Creating the server class.')
        self.server = serverclass(**kwargs)
        _log('Server class ready.')

        if start:

            _log('Starting the twisted server.')
            self.start()
    def start(self):

        self.site = TwistedWebSite(self.server)
        _log('Site created.')

        twisted_listen_tcp(self.port, self.site)
        _log('Server going to listen on port %u from now.' % self.port)

        twisted_run()
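
# Typical startup sketch: serve the data tables over HTTP on a custom port
# (33333 here is arbitrary). With `start = True` (the default) the
# constructor blocks inside the twisted reactor loop.
#
#     from pypath.omnipath.server.run import Rest, TableServer
#
#     rest = Rest(33333, serverclass = TableServer)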