Source code for pypath.inputs.biomart

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

from future.utils import iteritems

import re
import os
import json
import collections

import pypath.share.session as session_mod
import pypath.share.common as common
import pypath_common.data as _data
import pypath.share.curl as curl
import pypath.share.settings as settings
import pypath.resources.urls as urls
import pypath.utils.taxonomy as taxonomy

_logger = session_mod.Logger(name = 'biomart_input')


# for mouse homologues: Filter name = "with_mmusculus_homolog"
_filter_xml_template = '<Filter name="%s" excluded="0"/>'
_attr_xml_template = '<Attribute name="%s" />'


def biomart_query(
        attrs: str | list[str],
        filters: str | list | None = None,
        transcript: bool = False,
        peptide: bool = False,
        gene: bool = False,
        dataset: str = 'hsapiens_gene_ensembl',
    ):
    """
    Query the Ensembl Biomart web service.

    Use https://www.ensembl.org/biomart/martview/ to check for attribute
    and dataset names.

    Args
        attrs:
            One or more Ensembl attribute names.
        filters:
            One or more Ensembl filter names. Each one is inserted into
            the query as a ``<Filter ... excluded="0"/>`` element.
        transcript:
            Include Ensembl transcript IDs in the result.
        peptide:
            Include Ensembl peptide IDs in the result.
        gene:
            Include Ensembl gene IDs in the result.
        dataset:
            An Ensembl dataset name.

    Yields:
        Named tuples (``EnsemblRecord``) with the requested attributes for
        each record returned by Ensembl Biomart. The optional gene,
        transcript and peptide ID fields come first, in this order,
        followed by the attributes in ``attrs``. Lines with a number of
        tab separated fields different from the number of requested
        attributes are skipped.
    """

    _attrs = []

    if gene:

        _attrs.append('ensembl_gene_id')

    if transcript:

        _attrs.append('ensembl_transcript_id')

    if peptide:

        _attrs.append('ensembl_peptide_id')

    _attrs.extend(common.to_list(attrs))
    filters = common.to_list(filters)

    record = collections.namedtuple(
        'EnsemblRecord',
        _attrs,
    )
    n_fields = len(record._fields)

    _logger._log(
        'Downloading data from Ensembl Biomart: '
        'dataset=`%s`, '
        '%s'
        'attributes=`%s`.' % (
            dataset,
            (
                'filters=`%s`, ' % ', '.join(filters)
                    if filters else
                ''
            ),
            ', '.join(_attrs),
        )
    )

    # line breaks and the indentation following them are removed from
    # the XML, so the query becomes one line that fits into the URL
    rewsp = re.compile(r'\n\s+')

    xml_template_path = _data.path('ensembl_biomart_query.xml')

    with open(xml_template_path, 'r', encoding = 'utf-8') as fp:

        xml_template = fp.read()

    filter_part = ''.join(
        _filter_xml_template % _filter
        for _filter in filters
    )
    attr_part = ''.join(
        _attr_xml_template % _attr
        for _attr in _attrs
    )

    xml_query = xml_template % (
        dataset,
        filter_part,
        attr_part,
    )
    xml_query = rewsp.sub('', xml_query)

    biomart_url = urls.urls['ensembl']['biomart_url'] % xml_query

    c = curl.Curl(
        biomart_url,
        req_headers = [settings.get('user_agent')],
        large = True,
        silent = False,
    )

    # NOTE(review): assumes `Curl.result` is None when the download
    # failed -- confirm against `pypath.share.curl`. Without this guard
    # the loop below raises an uninformative TypeError.
    if c.result is None:

        _logger._log(
            'Error: Failed to download data from Ensembl Biomart '
            '(no result returned).'
        )
        return

    success = False

    for line in c.result:

        _line = line.strip('\n\r').split('\t')

        # this tag is expected once in the response; if it is never
        # seen, the transfer is considered interrupted (see below)
        if _line[0] == '[success]':

            success = True
            continue

        if line.strip() and len(_line) == n_fields:

            yield record(*_line)

    if not success:

        _logger._log(
            'Error: Interrupted transfer while downloading data '
            'from Ensembl Biomart (missing `success` tag).'
        )
def biomart_homology(
        source_organism: int | str = 9606,
        target_organism: int | str = 10090,
        extra_fields: str | Iterable[str] = 'external_gene_name',
    ):
    """
    Retrieves orthology data from Ensembl Biomart.

    Args
        source_organism:
            Name or ID of the organism whose gene dataset is queried.
        target_organism:
            Name or ID of the organism the homologues are looked up in.
        extra_fields:
            One or more further Ensembl attribute names to include in
            the records, besides the homology related ones.

    Returns
        List of named tuples filtered to genes of the source organism
        having orthologues in the target organism, with homology
        related fields.

    Raises
        ValueError: If any of the organisms can not be translated to
            an Ensembl organism name.
    """

    def ensure_organism(organism):
        # translate to the name Ensembl uses in dataset, attribute
        # and filter names; fail loudly if that is not possible

        organism_ensembl = taxonomy.ensure_ensembl_name(organism)

        if not organism_ensembl:

            msg = 'Could not find Ensembl taxon ID for `%s`.' % str(organism)
            # the module level logger has to be used here: no bare
            # `_log` function exists in this module
            _logger._log(msg)
            raise ValueError(msg)

        return organism_ensembl


    source_organism = ensure_organism(source_organism)
    target_organism = ensure_organism(target_organism)

    homolog_attrs = [
        'homolog_ensembl_peptide',
        'homolog_ensembl_gene',
        'homolog_orthology_type',
        'homolog_orthology_confidence',
        'homolog_canonical_transcript_protein',
        'homolog_associated_gene_name',
    ]

    # the homology attributes are prefixed with the target organism name
    homolog_attrs = [
        '%s_%s' % (target_organism, attr)
        for attr in homolog_attrs
    ] + common.to_list(extra_fields)

    filters = ['with_%s_homolog' % target_organism]

    return list(
        biomart_query(
            attrs = homolog_attrs,
            filters = filters,
            transcript = True,
            gene = True,
            peptide = True,
            dataset = '%s_gene_ensembl' % source_organism,
        )
    )
def biomart_microarray_types(organism: int | str = 9606):
    """
    Retrieves a list of available microarray types for an organism.

    Args
        organism:
            Name or ID of an organism.

    Returns
        The records decoded from the JSON response, each extended with
        a `label` key: the vendor and the array name separated by a
        space, with the hyphens and underscores of the array name
        replaced by spaces.
    """

    ensembl_name = taxonomy.ensure_ensembl_name(organism)
    arraytypes_url = urls.urls['ensembl']['arraytypes'] % ensembl_name

    c = curl.Curl(
        arraytypes_url,
        req_headers = [settings.get('user_agent')],
    )

    array_types = json.loads(c.result)

    for array_type in array_types:

        array_name = re.sub('[-_]', ' ', array_type['array'])
        array_type.update(
            label = '%s %s' % (array_type['vendor'], array_name)
        )

    return array_types
def biomart_microarray(
        array_type: str,
        gene: bool = True,
        transcript: bool = False,
        peptide: bool = False,
        organism: int | str = 9606
    ):
    """
    Microarray probe identifier mappings.

    Args
        array_type:
            The microarray model, as shown on the BioMart webpage, or
            the corresponding code. For a full list of available
            identifiers see the ``biomart_microarray_types``.
        gene:
            Include the mapping to Ensembl gene IDs.
        transcript:
            Include the mapping to Ensembl transcript IDs.
        peptide:
            Include the mapping to Ensembl peptide IDs.
        organism:
            Name or ID of an organism.

    Returns
        A dictionary with Ensembl gene, transcript and peptide IDs as keys
        and sets of microarray probe IDs as values.

    Raises
        ValueError: If the array type, after normalization, is not among
            the array types listed for the organism.
    """

    organism = taxonomy.ensure_ensembl_name(organism)

    # labels of the known arrays in the form of BioMart attribute
    # names: lowercase, with underscores instead of spaces
    known_arrays = {
        at['label'].lower().replace(' ', '_')
        for at in biomart_microarray_types(organism = organism)
    }

    # the same normalization for the requested array, additionally
    # dropping the word `probe`
    _array_type = array_type.lower().replace('probe', '')
    _array_type = _array_type.strip().replace(' ', '_')

    if _array_type not in known_arrays:

        msg = 'No such array type in Ensembl BioMart: `%s` (%s).' % (
            array_type,
            _array_type
        )
        _logger._log(msg)
        raise ValueError(msg)

    # the kinds of Ensembl IDs requested by the caller
    id_types = tuple(
        id_type
        for id_type, requested in (
            ('gene', gene),
            ('transcript', transcript),
            ('peptide', peptide),
        )
        if requested
    )

    records = biomart_query(
        attrs = [_array_type],
        transcript = transcript,
        gene = gene,
        peptide = peptide,
        dataset = '%s_gene_ensembl' % organism,
    )

    probes_by_id = collections.defaultdict(set)

    for rec in records:

        probe_id = getattr(rec, _array_type)

        # records without probe ID carry no mapping
        if not probe_id:

            continue

        for id_type in id_types:

            ensembl_id = getattr(rec, 'ensembl_%s_id' % id_type)

            if ensembl_id:

                probes_by_id[ensembl_id].add(probe_id)

    return dict(probes_by_id)
def biomart_microarrays(
        organism: int | str = 9606,
        vendor: str | set[str] = None,
        gene: bool = True,
        transcript: bool = False,
        peptide: bool = False
    ):
    """
    Microarray probe identifier mappings for multiple microarrays.

    Retrieves probe mappings for all array types for one organism,
    optionally limited to one or more array vendors.

    Note: depending on the number of array models, it can take minutes
    to download the data.

    Args
        organism:
            Name or ID of an organism.
        vendor:
            One or more vendors. None means all vendors. For human,
            possible values are AFFY, ILLUMINA, AGILENT, CODELINK and
            PHALANX.
        gene:
            Include the mapping to Ensembl gene IDs.
        transcript:
            Include the mapping to Ensembl transcript IDs.
        peptide:
            Include the mapping to Ensembl peptide IDs.

    Returns
        A dictionary with Ensembl gene, transcript and peptide IDs as keys
        and sets of tuples with microarray types and probe IDs as values.
    """

    Probe = collections.namedtuple('Probe', ('array', 'probe'))

    array_types = biomart_microarray_types(organism = organism)
    # vendor names are matched in uppercase
    vendor = {v.upper() for v in common.to_set(vendor)}

    probes_by_id = collections.defaultdict(set)

    for at in array_types:

        # an empty vendor set means no restriction
        if vendor and at['vendor'] not in vendor:

            continue

        label = at['label']

        probe_map = biomart_microarray(
            array_type = label,
            organism = organism,
            gene = gene,
            transcript = transcript,
            peptide = peptide,
        )

        array_key = label.lower().replace(' ', '_')

        for ensembl_id, probe_ids in probe_map.items():

            for probe_id in probe_ids:

                probes_by_id[ensembl_id].add(
                    Probe(array = array_key, probe = probe_id)
                )

    return dict(probes_by_id)