Source code for pypath.proteomicsdb

#!/usr/bin/env python2
# -*- coding: utf-8 -*-

#  This file is part of the `pypath` python module
#  Copyright (c) 2014-2016 - EMBL-EBI
#  File author(s): Dénes Türei
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#  Website:


from future.utils import iteritems
from past.builtins import xrange, range, reduce

import json
import base64
import re
import os
import sys

    import cPickle as pickle
    import pickle

    import ssl
except ImportError:
    sys.stdout.write("\t:: Error: no ssl support :(\n\n")

# from this module:
from pypath import dataio
from pypath.progress import Progress
from pypath.common import *

class ProteomicsDB(object):
    '''
    Client for downloading and processing data from ProteomicsDB.

    State kept on the instance:
    `tissues` (list of tissue annotations), `expression` (dict of dicts:
    sample name -> protein identifier -> expression value), `samples`
    (tissue ID -> list of sample names), `tissues_loaded` (set of tissue
    IDs already processed) and `result` (outcome of the last query).
    '''

    # Removes every character which is not a digit, a dot or a minus sign.
    # NOTE(review): this also removes the `e`/`E` of an exponent, so values
    # in scientific notation would be misread -- confirm the API never
    # returns them.
    _non_digit = re.compile(r'[^\d.-]+')

    def __init__(self, username, password, output_format='json'):
        '''
        This is an extensible class for downloading and processing data
        from ProteomicsDB. Now 2 of the 10 available APIs implemented
        here, but feel free to write functions for the other APIs.

        To find out more about ProteomicsDB, take a look at
        Wilhelm et al. 2014, Nature. A comprehensive description of the
        APIs is available on the ProteomicsDB website.

        @username : str
            Registered and API enabled user for ProteomicsDB.
            To have such a user, you need first to register, AND
            then write an e-mail to the address given on the webpage.
            In a couple of days the admins will enable the API for
            your user.
        @password : str
            Password of the user.
        @output_format : str
            Either 'json' or 'xml'. Some functions in this module
            process JSON further and give certain objects.
        '''
        # `base64.encodestring` is gone since Python 3.9, requires bytes on
        # Python 3 and breaks long tokens with newlines; `b64encode` has
        # none of these problems and works on Python 2 as well.
        credentials = '%s:%s' % (username, password)
        if not isinstance(credentials, bytes):
            credentials = credentials.encode('utf-8')
        token = base64.b64encode(credentials)
        if not isinstance(token, str):
            token = token.decode('ascii')
        self.auth = ['Authorization: Basic %s' % token]
        self.port = 443
        self.output_format = output_format
        # NOTE(review): the host name was missing from the reviewed copy of
        # this file and has been reconstructed -- confirm before release.
        self.host = 'https://www.proteomicsdb.org'
        self.tissues = []
        self.expression = {}
        self.samples = {}
        self.tissues_loaded = set()
        self.current_samples = set()
        self.result = None
        self.testurl = (
            "/proteomicsdb/logic/api/proteinpeptideresult.xsodata/"
            "InputParams(PROTEINFILTER='Q92769')/Results?"
            "$select=UNIQUE_IDENTIFIER,PROTEIN_NAME,START_POSITION,"
            "END_POSITION,PEPTIDE_SEQUENCE,PEPTIDE_MASS,Q_VALUE,RANK,SCORE,"
            "SEARCH_ENGINE&$filter=PEPTIDE_MASS%20gt%201000%20&$format=xml"
        )
        # Same API path as in `testurl` above.
        api_root = self.host + '/proteomicsdb/logic/api/'
        self.urls = {
            'tissues':
            api_root +
            'tissuelist.xsodata/CA_AVAILABLEBIOLOGICALSOURCES_API?'
            '$select=TISSUE_ID,TISSUE_NAME,TISSUE_GROUP_NAME,TISSUE_CATEGORY,'
            'SCOPE_ID,SCOPE_NAME,QUANTIFICATION_METHOD_ID,'
            'QUANTIFICATION_METHOD_NAME,MS_LEVEL&$format=%s',
            'proteinpertissue':
            api_root +
            'proteinspertissue.xsodata/InputParams(TISSUE_ID=%%27%s%%27,'
            'CALCULATION_METHOD=%u,SWISSPROT_ONLY=%u,NO_ISOFORM=%u)/Results?'
            '$select=ENTRY_NAME,UNIQUE_IDENTIFIER,DATABASE,PROTEIN_DESCRIPTION,'
            'PEPTIDES,SAMPLE_NAME,SAMPLE_DESCRIPTION,UNNORMALIZED_EXPRESSION,'
            'NORMALIZED_EXPRESSION&$format=%s'
        }

    @staticmethod
    def _is_open_file(obj):
        '''
        Tells if @obj looks like a file object open for reading.

        Replaces the Python 2 only `type(obj) is file` test, which raises
        NameError on Python 3.
        '''
        return hasattr(obj, 'read') and not getattr(obj, 'closed', False)

    def query(self, api, param, silent=True, large=False):
        '''
        Retrieves data from the API. The outcome is stored in
        `ProteomicsDB.result`, nothing is returned.

        @api : str
            Should be one of the 10 API sections available; it has to be
            a key of `ProteomicsDB.urls`.
        @param : tuple
            Tuple of the parameters according to the API.
        @silent : bool
            Passed to the curl wrapper function.
        @large : bool
            Passed to the curl wrapper function. If True, the file will be
            written to disk, and a file object open for reading is
            returned; if False, the raw data will be returned, in case of
            JSON, converted to python object, in case of XML, as a string.
        '''
        url = self.urls[api] % param
        # long timeout is given, because huge files (hundreds MB) take
        # time to load
        data = dataio.curl(
            url,
            req_headers=self.auth,
            silent=silent,
            timeout=1200,
            large=large)
        if self.output_format == 'json' and not large:
            self.result = self.get_json(data)
        else:
            self.result = data

    def get_json(self, reply):
        '''
        Parses the JSON text @reply and returns its ['d']['results'] member.
        '''
        return json.loads(reply)['d']['results']

    def get_tissues(self):
        '''
        Gets an annotated list of all tissues for which ProteomicsDB has
        expression data. Result stored in `ProteomicsDB.tissues`.
        '''
        self.query('tissues', (self.output_format, ))
        self.tissues = self.result

    def which_tissues(self, name, value):
        '''
        Returns the tissue annotations whose attribute @name equals @value.

        @name : str
            Key in the tissue annotation dicts, e.g. 'TISSUE_ID'.
        @value
            One accepted value, or a list, tuple or set of accepted values.

        The tissue list is downloaded first if it is empty.
        '''
        if len(self.tissues) == 0:
            self.get_tissues()
        if isinstance(value, (list, tuple, set, frozenset)):
            accepted = set(value)
        else:
            accepted = set([value])
        return [tis for tis in self.tissues if tis[name] in accepted]

    def get_proteins(self,
                     tissue_id,
                     calculation_method=0,
                     swissprot_only=1,
                     no_isoform=1):
        '''
        Downloads the proteins per tissue data of one tissue to disk.

        The query is attempted up to 3 times, until
        `ProteomicsDB.result` is a file object open for reading.

        @tissue_id : str
            Sent as the TISSUE_ID parameter of the API.
        @calculation_method : int
            Sent as the CALCULATION_METHOD parameter of the API.
        @swissprot_only : int
            Sent as the SWISSPROT_ONLY parameter of the API.
        @no_isoform : int
            Sent as the NO_ISOFORM parameter of the API.
        '''
        for _ in range(3):
            self.query(
                'proteinpertissue',
                (tissue_id, calculation_method, swissprot_only, no_isoform,
                 self.output_format),
                large=True)
            if self._is_open_file(self.result):
                break

    def get_pieces(self, size=20480, delimiters=('{', '}')):
        '''
        A generator for reading huge files (hundreds of MBs).
        Reads segments of @size from `ProteomicsDB.result`, searches for
        self-contained JSON objects, and yields a list of them for each
        segment. Once the file is exhausted, one empty list is yielded.

        @size : int
            Size to read at once (in Bytes).
        @delimiters : tuple
            Starting and closing delimiters. By default, these are
            curly braces, to return individual JSON objects of the
            largest possible size.
        '''
        import codecs

        opening, closing = delimiters
        decoder = None
        pending = ''
        while True:
            new = self.result.read(size)
            if len(new) == 0:
                yield []
                break
            # On Python 3 a binary mode file gives bytes, which iterate as
            # integers and never match the delimiters. The incremental
            # decoder copes with multi-byte characters split between reads.
            if isinstance(new, bytes) and not isinstance(new, str):
                if decoder is None:
                    decoder = codecs.getincrementaldecoder('utf-8')()
                new = decoder.decode(new)
            # The unfinished object of the previous segment is scanned
            # again from its first character, with the depth reset.
            buff = pending + new
            depth = 0
            start = None
            results = []
            for pos, char in enumerate(buff):
                if char == opening:
                    depth += 1
                if depth > 0 and start is None:
                    start = pos
                if char == closing:
                    depth -= 1
                    if depth == 0:
                        # slicing replaces character-wise concatenation
                        results.append(buff[start:pos + 1])
                        start = None
            pending = '' if start is None else buff[start:]
            yield results

    def get_expression(self, normalized=True, tissue_average=False):
        '''
        Extracts normalized or unnormalized expression data
        from previously downloaded data, stored on disk, and
        opened for reading in file object `ProteomicsDB.result`.
        Values are stored in `ProteomicsDB.expression`, the names of the
        samples seen in `ProteomicsDB.current_samples`.

        Nothing happens if `ProteomicsDB.result` is not an open file.
        Errors are reported on stdout and not raised; in either case the
        file is closed and `ProteomicsDB.result` is set to None.

        @normalized : bool
            Read normalized or unnormalized expression values.
        @tissue_average : bool
            Read and store data for each samples, or keep only
            the mean value per tissue. Accepted for compatibility, but
            not implemented: the value is ignored.
        '''
        result = self.result
        if not self._is_open_file(result):
            return
        field = ('NORMALIZED_EXPRESSION'
                 if normalized else 'UNNORMALIZED_EXPRESSION')
        try:
            # NOTE(review): rewinding to the beginning of the file is
            # reconstructed from a damaged line -- confirm.
            result.seek(0)
            self.current_samples = set()
            for pieces in self.get_pieces():
                for piece in pieces:
                    protein = json.loads(piece)
                    sample = protein['SAMPLE_NAME']
                    if sample not in self.expression:
                        self.expression[sample] = {}
                    self.current_samples.add(sample)
                    self.expression[sample][protein['UNIQUE_IDENTIFIER']] = \
                        float(self._non_digit.sub('', protein[field]))
        except Exception as err:
            # Best effort by design: one broken download should not stop
            # a long series of queries, hence only a report is written.
            sys.stdout.write('Error in get_expression: %s\n' % str(err))
            sys.stdout.write('Result type: %s\n' % str(type(result)))
            sys.stdout.write('Result mode: %s\n' %
                             str(getattr(result, 'mode', None)))
            sys.stdout.write('Result name: %s\n' %
                             str(getattr(result, 'name', None)))
            sys.stdout.flush()
        finally:
            # Closing also after errors, so the handle does not leak.
            result.close()
            self.result = None

    def tissues_x_proteins(self, normalized=True, tissues=None):
        '''
        For all tissues downloads the expression of all the proteins.
        In the result, a dict of dicts will hold the expression values
        of each proteins, grouped by samples.

        @normalized : bool
            Passed to `get_expression`.
        @tissues : collection or None
            Tissue IDs to process; None means all tissues. Tissues already
            in `ProteomicsDB.tissues_loaded` are skipped.
        '''
        self.get_tissues()
        tissues_selected = set(
            t['TISSUE_ID'] for t in self.tissues
            if tissues is None or t['TISSUE_ID'] in tissues
        ) - self.tissues_loaded
        prg = Progress(
            len(tissues_selected),
            'Downloading expression data',
            1,
            percent=False)
        for tis in tissues_selected:
            prg.step()
            sys.stdout.write('Querying tissue %s\n' % tis)
            sys.stdout.flush()
            self.get_proteins(tis)
            if not self._is_open_file(self.result):
                sys.stdout.write('\tFailed: %s\n' % tis)
                sys.stdout.flush()
                continue
            self.tissues_loaded.add(tis)
            self.get_expression(normalized)
            self.samples[tis] = uniqList(
                self.samples.get(tis, []) + list(self.current_samples))
            self.current_samples = set()
        prg.terminate()

    def save(self, outf=None):
        '''
        Pickles this object into the file @outf.

        @outf : str or None
            Path of the file; by default `cache/proteomicsdb.pickle`.

        `ProteomicsDB.result` is set to None before saving. The whole
        object is pickled, including the `auth` header with the encoded
        credentials, so the file should be handled accordingly.
        '''
        self.result = None
        if outf is None:
            outf = os.path.join('cache', 'proteomicsdb.pickle')
        with open(outf, 'wb') as fp:
            pickle.dump(self, fp)

    def load(self, pfile=None):
        '''
        Copies the attributes of a pickled object into this one.

        @pfile : str or None
            Path of the file; by default `cache/proteomicsdb.pickle`.
            A missing file is only reported on stdout.

        WARNING: unpickling can execute arbitrary code, load only files
        from trusted sources.
        '''
        if pfile is None:
            pfile = os.path.join('cache', 'proteomicsdb.pickle')
        if os.path.exists(pfile):
            with open(pfile, 'rb') as fp:
                loaded = pickle.load(fp)
            for key, val in loaded.__dict__.items():
                if not hasattr(val, '__call__'):
                    setattr(self, key, val)
            sys.stdout.write(
                '\t:: Loaded %u samples of %u tissues from file %s\n' %
                (len(self.expression), len(self.tissues_loaded), pfile))
        else:
            sys.stdout.write('\t:: File not found: %s\n' % pfile)
        sys.stdout.flush()

    def pandas_matrix(self):
        '''
        Returns expression data in a pandas matrix.
        Not implemented.
        '''
        pass