#!/usr/bin/env python# -*- coding: utf-8 -*-## This file is part of the `pypath` python module## Copyright 2014-2023# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University## Authors: see the file `README.rst`# Contact: Dénes Türei (turei.denes@gmail.com)## Distributed under the GPLv3 License.# See accompanying file LICENSE.txt or copy at# https://www.gnu.org/licenses/gpl-3.0.html## Website: https://pypath.omnipathdb.org/#from__future__importannotationsfromfuture.utilsimportiteritemsimportreimportosimportjsonimportcollectionsimportpypath.share.sessionassession_modimportpypath.share.commonascommonimportpypath_common.dataas_dataimportpypath.share.curlascurlimportpypath.share.settingsassettingsimportpypath.resources.urlsasurlsimportpypath.utils.taxonomyastaxonomy_logger=session_mod.Logger(name='biomart_input')# for mouse homologues: Filter name = "with_mmusculus_homolog"_filter_xml_template='<Filter name="%s" excluded="0"/>'_attr_xml_template='<Attribute name="%s" />'
def biomart_query(
    attrs: str | list[str],
    filters: str | list | None = None,
    transcript: bool = False,
    peptide: bool = False,
    gene: bool = False,
    dataset: str = 'hsapiens_gene_ensembl',
):
    """
    Query the Ensembl Biomart web service.

    Use https://www.ensembl.org/biomart/martview/ to check for attribute
    and dataset names.

    Args:
        attrs: One or more Ensembl attribute names.
        filters: One or more Ensembl filter names.
        transcript: Include Ensembl transcript IDs in the result.
        peptide: Include Ensembl peptide IDs in the result.
        gene: Include Ensembl gene IDs in the result.
        dataset: An Ensembl dataset name.

    Yields:
        Named tuples (``EnsemblRecord``) with the requested attributes for
        each record returned by Ensembl Biomart. The fields are the
        Ensembl gene, transcript and peptide IDs (if requested, in this
        order) followed by ``attrs``; an attribute requested more than
        once appears only once, at its first position. Lines whose number
        of tab separated fields differs from the number of attributes are
        skipped.
    """

    _attrs = []

    if gene:
        _attrs.append('ensembl_gene_id')

    if transcript:
        _attrs.append('ensembl_transcript_id')

    if peptide:
        _attrs.append('ensembl_peptide_id')

    _attrs.extend(common.to_list(attrs))

    # `namedtuple` raises ValueError on repeated field names, e.g. when
    # `gene=True` and `attrs` also contains `ensembl_gene_id`; keep the
    # first occurrence of each attribute, preserving the order.
    _attrs = list(dict.fromkeys(_attrs))

    filters = common.to_list(filters)

    record = collections.namedtuple('EnsemblRecord', _attrs)

    _logger._log(
        'Downloading data from Ensembl Biomart: '
        'dataset=`%s`, '
        '%s'
        'attributes=`%s`.' % (
            dataset,
            ('filters=`%s`, ' % ', '.join(filters) if filters else ''),
            ', '.join(_attrs),
        )
    )

    # the XML template is stored in a human readable, indented form;
    # line breaks and the following indentation are removed below
    rewsp = re.compile(r'\n\s+')

    xml_template_path = _data.path('ensembl_biomart_query.xml')

    with open(xml_template_path, 'r', encoding='utf-8') as fp:

        xml_template = fp.read()

    filter_part = ''.join(
        _filter_xml_template % _filter
        for _filter in filters
    )
    attr_part = ''.join(
        _attr_xml_template % _attr
        for _attr in _attrs
    )

    xml_query = xml_template % (dataset, filter_part, attr_part)
    xml_query = rewsp.sub('', xml_query)

    biomart_url = urls.urls['ensembl']['biomart_url'] % xml_query

    c = curl.Curl(
        biomart_url,
        req_headers=[settings.get('user_agent')],
        large=True,
        silent=False,
    )

    # a line starting with `[success]` marks a complete transfer
    success = False

    for line in c.result:

        _line = line.strip('\n\r').split('\t')

        if _line[0] == '[success]':

            success = True
            continue

        if line.strip() and len(_line) == len(record._fields):

            yield record(*_line)

    if not success:

        _logger._log(
            'Error: Interrupted transfer while downlading data '
            'from Ensembl Biomart (missing `success` tag).'
        )
def biomart_homology(
    source_organism: int | str = 9606,
    target_organism: int | str = 10090,
    extra_fields: str | Iterable[str] = 'external_gene_name',
):
    """
    Retrieves orthology data from Ensembl Biomart.

    Args:
        source_organism: Name or ID of the organism whose gene dataset
            is queried.
        target_organism: Name or ID of the organism where the homologues
            are looked up.
        extra_fields: One or more further Ensembl attribute names,
            appended after the homology related attributes.

    Returns:
        List of named tuples filtered to genes of the source organism
        having orthologues in the target organism, with homology related
        fields.

    Raises:
        ValueError: If no Ensembl name could be found for one of the
            organisms.
    """

    def ensure_organism(organism):
        # Translate the organism to its Ensembl name; log and raise if
        # the translation yields nothing.

        organism_ensembl = taxonomy.ensure_ensembl_name(organism)

        if not organism_ensembl:

            msg = (
                'Could not find Ensembl taxon ID for `%s`.' % str(organism)
            )
            # the module level logger must be used here: a bare `_log`
            # is not defined and would raise NameError instead of the
            # intended ValueError
            _logger._log(msg)
            raise ValueError(msg)

        return organism_ensembl

    source_organism = ensure_organism(source_organism)
    target_organism = ensure_organism(target_organism)

    homolog_attrs = [
        'homolog_ensembl_peptide',
        'homolog_ensembl_gene',
        'homolog_orthology_type',
        'homolog_orthology_confidence',
        'homolog_canonical_transcript_protein',
        'homolog_associated_gene_name',
    ]

    # homology attributes are prefixed by the target organism name
    homolog_attrs = [
        '%s_%s' % (target_organism, attr)
        for attr in homolog_attrs
    ] + common.to_list(extra_fields)

    filters = ['with_%s_homolog' % target_organism]

    return list(
        biomart_query(
            attrs=homolog_attrs,
            filters=filters,
            transcript=True,
            gene=True,
            peptide=True,
            dataset='%s_gene_ensembl' % source_organism,
        )
    )
def biomart_microarray_types(organism: int | str = 9606):
    """
    Retrieves a list of available microarray types for an organism.

    Args:
        organism: Name or ID of an organism.

    Returns:
        The decoded JSON response; each item is extended with a
        ``label`` key: the vendor and the array name separated by a
        space, with dashes and underscores in the array name replaced
        by spaces.
    """

    organism = taxonomy.ensure_ensembl_name(organism)

    url = urls.urls['ensembl']['arraytypes'] % organism

    c = curl.Curl(url, req_headers=[settings.get('user_agent')])

    # assumes the response is a JSON list of dicts with `vendor` and
    # `array` keys -- TODO confirm against the Ensembl service
    result = json.loads(c.result)

    for r in result:

        # The space between vendor and array name is required: the
        # lowercase, underscore separated form of the label is used
        # as a BioMart attribute name (e.g. `affy_hg_u133a_2`).
        r['label'] = '%s %s' % (
            r['vendor'],
            re.sub('[-_]', ' ', r['array']),
        )

    return result
def biomart_microarray(
    array_type: str,
    gene: bool = True,
    transcript: bool = False,
    peptide: bool = False,
    organism: int | str = 9606,
):
    """
    Microarray probe identifier mappings.

    Args:
        array_type: The microarray model, as shown on the BioMart
            webpage, or the corresponding code. For a full list of
            available identifiers see the ``biomart_microarray_types``.
        gene: Include the mapping to Ensembl gene IDs.
        transcript: Include the mapping to Ensembl transcript IDs.
        peptide: Include the mapping to Ensembl peptide IDs.
        organism: Name or ID of an organism.

    Returns:
        A dictionary with Ensembl gene, transcript and peptide IDs as
        keys and sets of microarray probe IDs as values.

    Raises:
        ValueError: If the array type is not among the ones listed by
            ``biomart_microarray_types`` for the organism.
    """

    organism = taxonomy.ensure_ensembl_name(organism)

    # codes of the known arrays: lowercase labels, spaces to underscores
    known_codes = {
        entry['label'].lower().replace(' ', '_')
        for entry in biomart_microarray_types(organism=organism)
    }

    # bring the user provided name to the same form
    array_code = array_type.lower().replace('probe', '')
    array_code = array_code.strip().replace(' ', '_')

    if array_code not in known_codes:

        msg = 'No such array type in Ensembl BioMart: `%s` (%s).' % (
            array_type,
            array_code,
        )
        _logger._log(msg)
        raise ValueError(msg)

    records = biomart_query(
        attrs=[array_code],
        transcript=transcript,
        gene=gene,
        peptide=peptide,
        dataset='%s_gene_ensembl' % organism,
    )

    # names of the Ensembl ID fields requested by the caller
    requested = (
        ('gene', gene),
        ('transcript', transcript),
        ('peptide', peptide),
    )
    id_fields = tuple(
        'ensembl_%s_id' % level
        for level, wanted in requested
        if wanted
    )

    mapping = collections.defaultdict(set)

    for rec in records:

        probe_id = getattr(rec, array_code)

        if not probe_id:

            continue

        for id_field in id_fields:

            ensembl_id = getattr(rec, id_field)

            if ensembl_id:

                mapping[ensembl_id].add(probe_id)

    return dict(mapping)
def biomart_microarrays(
    organism: int | str = 9606,
    vendor: str | set[str] = None,
    gene: bool = True,
    transcript: bool = False,
    peptide: bool = False,
):
    """
    Microarray probe identifier mappings for multiple microarrays.

    Retrieves probe mappings for all array types for one organism,
    optionally limited to one or more array vendors.

    Note: depending on the number of array models, it can take minutes
    to download the data.

    Args:
        organism: Name or ID of an organism.
        vendor: One or more vendors. None means all vendors. For human,
            possible values are AFFY, ILLUMINA, AGILENT, CODELINK and
            PHALANX.
        gene: Include the mapping to Ensembl gene IDs.
        transcript: Include the mapping to Ensembl transcript IDs.
        peptide: Include the mapping to Ensembl peptide IDs.

    Returns:
        A dictionary with Ensembl gene, transcript and peptide IDs as
        keys and sets of tuples with microarray types and probe IDs as
        values.
    """

    probe_record = collections.namedtuple('Probe', ('array', 'probe'))

    available = biomart_microarray_types(organism=organism)

    # vendor names are compared in uppercase; an empty set means no
    # restriction by vendor
    vendors = {name.upper() for name in common.to_set(vendor)}

    mapping = collections.defaultdict(set)

    for array_info in available:

        if vendors and array_info['vendor'] not in vendors:

            continue

        label = array_info['label']

        probes_by_id = biomart_microarray(
            array_type=label,
            organism=organism,
            gene=gene,
            transcript=transcript,
            peptide=peptide,
        )

        array_code = label.lower().replace(' ', '_')

        for ensembl_id, probe_ids in probes_by_id.items():

            for probe_id in probe_ids:

                mapping[ensembl_id].add(
                    probe_record(array=array_code, probe=probe_id)
                )

    return dict(mapping)