Source code for pypath.inputs.biogps

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems
from past.builtins import xrange, range

import re
import collections

import pandas as pd

import pypath.resources.urls as urls
import pypath.share.curl as curl
import pypath.share.common as common
import pypath.utils.taxonomy as taxonomy
import pypath.utils.mapping as mapping


BiogpsDataset = collections.namedtuple(
    'BiogpsDataset',
    (
        'organism',
        'label',
        'url',
    ),
)


[docs] def biogps_datasets(): """ List the expression profile datasets available from BioGPS. Returns (list): Named tuples with the label, organism and URL of the datasets. """ biogps_urls = urls.urls['biogps'] url = biogps_urls['url'] datasets = biogps_urls['datasets'] return [ BiogpsDataset( organism = label.split('_', maxsplit = 1)[0], label = label, url = url % fname, ) for label, fname in iteritems(datasets) ]
[docs] def biogps_download(dataset): """ Retrieve one BioGPS expression profile dataset. Args dataset (str,BiogpsDataset): Either the label of a dataset or a `BiogpsDataset` object. For a list of available datasets, see `biogps_datasets`. Returns (pandas.DataFrame): A data frame of expression values, columns are tissues, cell types or cell lines, rows are microarray probes representing transcripts. """ biogps_urls = urls.urls['biogps'] label = dataset.label if hasattr(dataset, 'label') else dataset url = biogps_urls['url'] % biogps_urls['datasets'][label] c = curl.Curl(url, silent = False, large = True) the_file = ( common.first(c.result.values()) if isinstance(c.result, dict) else c.result ) fileobj = the_file if hasattr(the_file, 'name') else c.fileobj sep = ',' if fileobj.name.endswith('csv') else '\t' header = next(the_file).split(sep) header = [ re.sub( '\W+', '_', re.sub('^\d+', '', h).strip('') ).strip('_') for h in header ] header[0] = 'probe' hdr_dup = {} for i in xrange(len(header)): field = header[i] header[i] = ( field if field not in hdr_dup else '%s_%u' % (field, hdr_dup[field]) ) hdr_dup[field] = hdr_dup.get(field, 0) hdr_dup[field] += 1 content = [ [ value if j == 0 else float(value) for j, value in enumerate(row.strip().split(sep)) ] for row in the_file ] df = pd.DataFrame(content, columns = header) return df
[docs] def biogps_download_all(organism = None, exclude = None, only = None): """ Downloads all expression data from BioGPS. Args organism (str,int): Restrict the download to this organism. Human, mouse and rat datasets are available. exclude (str,set): One or more datasets to exclude. Should be dataset labels as shown by `biogps_datasets`. only (str,set): Restrict the download only to these datasets. Returns (dict): Dict of pandas data frames, keys are dataset labels, values are data frames of expression values, columns are tissues, cell types or cell lines, rows are microarray probes representing transcripts. """ if organism: organism = taxonomy.ensure_common_name(organism).lower() exclude = common.to_set(exclude) only = common.to_set(only) datasets = biogps_datasets() result = {} for dataset in datasets: if ( ( not organism or dataset.organism == organism ) and ( not only or dataset.label in only ) and dataset not in exclude ): result[dataset.label] = biogps_download(dataset) return result
[docs] def biogps_annotations(organism = 9606, exclude = None, only = None): """ Expression data from BioGPS compiled in the annotation format commonly used in this module. Args organism (int,str): Name or ID of the organism. This function is able to process data for one organism at once. exclude (str,set): One or more datasets to exclude. Should be dataset labels as shown by `biogps_datasets`. only (str,set): Restrict the download only to these datasets. Returns (dict): Dict of annotations, keys are UniProt IDs, values are sets of named tuples of annotations. """ BiogpsAnnotation = collections.namedtuple( 'BiogpsAnnotation', ( 'dataset', 'sample', 'probe', 'expression', ), ) data = biogps_download_all( organism = organism, exclude = exclude, only = only, ) result = collections.defaultdict(set) for label, df in iteritems(data): for row in df.itertuples(index = False): uniprots = mapping.map_name( row[0], 'affy', 'uniprot', ncbi_tax_id = organism, ) for uniprot in uniprots: result[uniprot].update( { BiogpsAnnotation( dataset = label, sample = sample, probe = row[0], expression = expr, ) for sample, expr in zip(row._fields[1:], row[1:]) } ) return result