Source code for pypath.internals.input_formats

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from __future__ import annotations

import copy

import pypath.share.settings as settings
import pypath.share.session as session
import pypath_common._constants as _const
import pypath.inputs.uniprot_idmapping as uniprot_idmapping
import pypath.inputs.unichem as unichem_input

_logger = session.Logger(name = 'input_formats')

__all__ = [
    'FileMapping',
    'PickleMapping',
    'NetworkInput',
    'ReadList',
    'UniprotListMapping',
    'ProMapping',
    'ArrayMapping',
    'BiomartMapping',
]


AC_QUERY = {
    'genesymbol': 'gene_primary',
    'genesymbol-syn': 'gene_synonym',
    'hgnc': 'xref_hgnc',
    'embl': 'xref_embl',
    'entrez': 'xref_geneid',
    'geneid': 'xref_geneid',
    'refseqp': 'xref_refseq',
    'enst': 'xref_ensembl',
    'uniprot-entry': 'id',
    'protein-name': 'protein_name',
    'gene-name': 'gene_names',
    'gene-orf': 'gene_orf',
    'gene-oln': 'gene_oln',
    'ec': 'ec',
}

AC_MAPPING = {
    'uniprot': 'UniProtKB',
    'uniprot-entry': 'UniProtKB',
    'embl': 'EMBL-GeneBank-DDBJ',
    'embl_id': 'EMBL-GeneBank-DDBJ_CDS',
    'pir': 'PIR',
    'entrez': 'GeneID',
    'gi': 'GI_number',
    'refseqp': 'RefSeq_Protein',
    'refseqn': 'RefSeq_Nucleotide',
    'ensembl': 'Ensembl',
    'ensp': 'Ensembl_Protein',
    'enst': 'Ensembl_Transcript',
    'ensg': 'Ensembl',
    'ensgp': 'Ensembl_Genomes_Protein',
    'ensgt': 'Ensembl_Genomes_Transcript',
    'hgnc': 'HGNC',
    'ensp_string': 'STRING',
    'genesymbol': 'Gene_Name',
}

BIOMART_MAPPING = {
    'hgnc_symbol': 'hgnc_symbol',
    'rnacentral': 'rnacentral',
    'hgnc_trans_name': 'hgnc_trans_name',
    'wikigene_name': 'wikigene_name',
    'gene_name': 'external_gene_name',
    'genesymbol': 'external_gene_name',
    'transcript_name': 'external_transcript_name',
    'gene_description': 'description',
    'gene_synonym': 'external_synonym',
    'interpro_description': 'interpro_description',
    'interpro': 'interpro',
    'interpro_short_description': 'interpro_short_description',
    'enst_biomart': 'ensembl_transcript_id',
    'ensg_biomart': 'ensembl_gene_id',
    'ensp_biomart': 'ensembl_peptide_id',
    'ensembl_gene_id': 'ensembl_gene_id',
    'ensembl_transcript_id': 'ensembl_transcript_id',
    'ensembl_peptide_id': 'ensembl_peptide_id',
    'uniprot': 'uniprotswissprot',
    'trembl': 'uniprotsptrembl',

}

PRO_MAPPING = {
    'alzforum': 'Alzforum_mut',
    'araport': 'Araport',
    'cgnc': 'CGNC',
    'dictybase': 'dictyBase',
    'dto': 'DTO',
    'ecocyc': 'EcoCyc',
    'ecogene': 'EcoGene',
    'ensembl_pro': 'Ensembl',
    'ensembl_bacteria': 'EnsemblBacteria',
    'flybase': 'FlyBase',
    'hgnc': 'HGNC',
    'iuphar_fam': 'IUPHARfam',
    'iuphar': 'IUPHARobj',
    'mgi': 'MGI',
    'mro': 'MRO',
    'ncbi_gene': 'NCBIGene',
    'pbd': 'PDB',
    'pombase': 'PomBase',
    'interpro': 'PRO',
    'reactome': 'Reactome',
    'rgd': 'RGD',
    'sgd': 'SGD',
    'tdr': 'TDR',
    'uniprot': 'UniProtKB',
    'uniprot-var': 'UniProtKB_VAR',
    'wormbase': 'WormBase',
    'zfin': 'ZFIN',
}

ARRAY_MAPPING = {
    'affy',
    'affymetrix',
    'illumina',
    'agilent',
    'codelink',
    'phalanx',
}

RAMP_MAPPING = {
    'cas': 'CAS',
    'cas_id': 'CAS',
    'lipidmaps': 'LIPIDMAPS',
    'en': 'EN',
    'enzymatic_nomenclature': 'EN',
    'genesymbol': 'gene_symbol',
    'pubchem_compound': 'pubchem',
    'pubchem_cid': 'pubchem',
}

HMDB_MAPPING = {
    'hmdb': 'accession',
    'pubchem_cid': 'pubchem_compound',
    'pubchem': 'pubchem_compound',
    'phenolexplorer': 'phenol_explorer_compound',
    'cas': 'cas_registry_number',
    'formula': 'chemical_formula',
    'inchi': 'inchi',
    'inchikey': 'inchikey',
    'hmdb_name': 'name',
    'hmdb_synonym': 'synonyms',
    'smiles': 'smiles',
    'iupac': 'traditional_iupac',
}


[docs] class MappingInput(object): _resource_id_types = {}
[docs] def __init__( self, type_, id_type_a, id_type_b, ncbi_tax_id = None, resource_id_type_a = None, resource_id_type_b = None, input_method = None, ): self.type = type_ self.id_type_a = id_type_a self.id_type_b = id_type_b self.resource_id_type_a = resource_id_type_a self.resource_id_type_b = resource_id_type_b self.ncbi_tax_id = ncbi_tax_id or settings.get('default_organism') self.input_method = input_method
def _resource_id_type(self, side): return self.resource_id_type( getattr(self, 'id_type_%s' % side), override = getattr(self, 'resource_id_type_%s' % side), ) @property def _resource_id_type_a(self): return self._resource_id_type(side = 'a') @property def _resource_id_type_b(self): return self._resource_id_type(side = 'b') @classmethod def resource_id_type(cls, id_type, override = None): return override or cls._resource_id_types.get(id_type, None) def __contains__(self, other: str) -> bool: return ( self.id_type_a == other or self.id_type_b == other or self._resource_id_type_a == other or self._resource_id_type_b == other ) def swap_sides(self): self.id_type_a, self.id_type_b = self.id_type_b, self.id_type_a self.resource_id_type_a, self.resource_id_type_b = ( self.resource_id_type_b, self.resource_id_type_a, ) @classmethod def possible( cls, id_type_a: str, id_type_b: str, ncbi_tax_id: int | None = None, ) -> bool: return all( ( id_type in cls._resource_id_types or id_type in cls._resource_id_types.values() ) for id_type in (id_type_a, id_type_b) )
[docs] class FileMapping(MappingInput):
[docs] def __init__( self, id_type_a, id_type_b, input_, col_a, col_b, separator = None, header = 0, ncbi_tax_id = None, entity_type = 'protein', ): MappingInput.__init__( self, type_ = 'file', id_type_a = id_type_a, id_type_b = id_type_b, ncbi_tax_id = ncbi_tax_id, ) self.input = input_ self.col_a = col_a self.col_b = col_b self.separator = separator self.header = header self.entity_type = entity_type self.input_args = {'organism': self.ncbi_tax_id}
def set_organism(self, ncbi_tax_id): other_organism = copy.deepcopy(self) other_organism.ncbi_tax_id = ncbi_tax_id if 'organism' in other_organism.input_args: other_organism.input_args['organism'] = ncbi_tax_id return other_organism @classmethod def possible( cls, id_type_a: str, id_type_b: str, ncbi_tax_id: int | None = None, ) -> bool: raise NotImplementedError
[docs] class UniprotMapping(MappingInput): _resource_id_type_b = 'accession' _resource_id_types = AC_QUERY
[docs] def __init__( self, id_type_a, id_type_b = 'uniprot', ncbi_tax_id = 9606, swissprot = 'true', ): """ Defines an ID conversion table to retrieve from UniProt. id_type : str Type of accession numbers you would like to translate. target_id_type : str Type of accession numbers you would like to translate to. tax : int NCBI Taxonomy ID of the organism of interest. swissprot : str Look for SwissProt or Trembl. Passed directly to UniProt`s `reviewed` parameter. `yes` or `no` To fetch Trembl and SwissProt together, set value to None. mapping : bool Get the data from UniProt`s programmatic access query interface, (uniprot.org/uniprot) or the batch retrieval/id mapping service (uniprot.org/mapping). These have slightly different APIs and capabilities. Some IDs can be obtained from the former, some from the latter. """ self.type = 'uniprot' MappingInput.__init__( self, type_ = 'uniprot', id_type_a = id_type_a, id_type_b = id_type_b, ncbi_tax_id = ncbi_tax_id, ) self.ncbi_tax_id = int(ncbi_tax_id) self.typ = 'protein' self.swissprot = swissprot
def set_organism(self, ncbi_tax_id): other_organism = copy.deepcopy(self) other_organism.ncbi_tax_id = ncbi_tax_id return other_organism @property def field(self): return AC_QUERY.get(self.id_type_a, (None,))[0] @property def subfield(self): return AC_QUERY.get(self.id_type_a, (None, None))[1]
[docs] @staticmethod def resource_id_type(id_type, override = None): """ For an ID type label used in pypath, returns the one used in the UniProt web service. If the label is not available in the built in list None is returned. Returns (str): The ID type label used by UniProt; None if the input label is not known. """ id_type = AC_QUERY.get(id_type, id_type) return id_type
@classmethod def possible( cls, id_type_a: str, id_type_b: str, ncbi_tax_id: int | None = None, ) -> bool: return all( ( id_type in cls._resource_id_types or id_type in cls._resource_id_types.values() or id_type == 'uniprot' or id_type.startswith('xref_') ) for id_type in (id_type_a, id_type_b) )
[docs] class UniprotListMapping(MappingInput): """ Provides parameters for downloading mapping table from UniProt `Upload Lists` webservice. :arg str id_type_a: Custom name for one of the ID types. :arg str id_type_b: Custom name for the other ID type. :arg str uniprot_id_type_a: This is the symbol the UniProt webservice uses for the first name type. These are included in the module and set automatically, the argument only gives a way to override this. :arg str uniprot_id_type_b: Same as above just for the other ID type. :arg bool swissprot: Download data only for SwissProt IDs. """ _resource_id_types = AC_MAPPING _from_uniprot = { 'uniprot': 'UniProtKB_AC-ID', 'swissprot': 'UniProtKB_AC-ID', 'trembl': 'UniProtKB_AC-ID', } _to_uniprot = { 'uniprot': 'UniProtKB', 'swissprot': 'UniProtKB-Swiss-Prot', 'trembl': 'UniProtKB', }
[docs] def __init__( self, id_type_a, id_type_b, uniprot_id_type_a = None, uniprot_id_type_b = None, ncbi_tax_id = 9606, swissprot = None, ): MappingInput.__init__( self, type_ = 'uniprot_list', id_type_a = id_type_a, id_type_b = id_type_b, ncbi_tax_id = ncbi_tax_id, resource_id_type_a = uniprot_id_type_a, resource_id_type_b = uniprot_id_type_b, ) self._set_swissprot(swissprot) self.ac_mapping = AC_MAPPING self._update_uniprot_types() self.entity_type = 'protein'
def set_organism(self, ncbi_tax_id): other_organism = copy.deepcopy(self) other_organism.ncbi_tax_id = ncbi_tax_id return other_organism def swap_sides(self): MappingInput.swap_sides(self) self._update_uniprot_types() def _update_uniprot_types(self): self.uniprot_id_type_a = self._resource_id_type_a self.uniprot_id_type_b = self._resource_id_type_b def _resource_id_type(self, side: str) -> str: uniprot_id_types = { 'a': self._from_uniprot, 'b': self._to_uniprot, }.get(side) id_type = getattr(self, f'id_type_{side}') return uniprot_id_types.get( id_type, self._resource_id_types.get(id_type, id_type) ) def _set_swissprot(self, swissprot: bool | None) -> None: values = {'swissprot': True, 'trembl': False, 'uniprot': True} if swissprot is None: swissprot = values.get( self.id_type_a, values.get(self.id_type_b, swissprot) ) self.swissprot = swissprot @classmethod def _uniprotkb_id_type(cls, id_type: str) -> bool: return id_type in cls._from_uniprot @classmethod def possible( cls, id_type_a: str, id_type_b: str, ncbi_tax_id: int | None = None, ) -> bool: id_type_a = cls._from_uniprot.get(id_type_a, id_type_a) id_type_a = cls._resource_id_types.get(id_type_a, id_type_a) id_type_b = cls._to_uniprot.get(id_type_b, id_type_b) id_type_b = cls._resource_id_types.get(id_type_b, id_type_b) pairs = uniprot_idmapping.idtypes() return (id_type_a, id_type_b) in pairs
[docs] class ProMapping(MappingInput): """ Provides parameters for mapping table from the Protein Ontology Consortium. :arg str id_type_a: Custom name for one of the ID types. :arg str id_type_b: Custom name for the other ID type. :arg str pro_id_type_a: This is the symbol PRO uses to label the IDs. These are included in the module and set automatically, the argument only gives a way to override this. :arg str pro_id_type_b: Same as above just for the other ID type. """ _resource_id_types = PRO_MAPPING
[docs] def __init__( self, id_type_a, id_type_b = None, pro_id_type_a = None, pro_id_type_b = None, ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC, ): to_pro = id_type_a != 'pro' id_type = id_type_a if to_pro else id_type_b pro_id_type = ( pro_id_type_a if to_pro else pro_id_type_b ) MappingInput.__init__( self, type_ = 'pro', id_type_a = 'pro', id_type_b = id_type, ncbi_tax_id = -1, resource_id_type_a = pro_id_type_a, resource_id_type_b = pro_id_type_b, ) self.to_pro = to_pro self.id_type = id_type self.pro_mapping = PRO_MAPPING self.pro_id_type = pro_id_type or self.pro_mapping[self.id_type_b] self.entity_type = 'protein'
@classmethod def possible( cls, id_type_a: str, id_type_b: str, ncbi_tax_id: int | None = None, ) -> bool: id_types = {id_type_a: None, id_type_b: None} return ( id_types.pop('pro', None) and ( list(id_types)[0] in self._resource_id_types or list(id_types)[0] in self._resource_id_types.values() ) )
[docs] class BiomartMapping(MappingInput): _resource_id_types = BIOMART_MAPPING
[docs] def __init__( self, id_type_a, id_type_b = None, transcript = False, biomart_id_type_a = None, biomart_id_type_b = None, ncbi_tax_id = 9606, ): MappingInput.__init__( self, type_ = 'biomart', id_type_a = id_type_a, id_type_b = id_type_b, ncbi_tax_id = ncbi_tax_id, resource_id_type_a = biomart_id_type_a, resource_id_type_b = biomart_id_type_b, ) self.biomart_id_type_a = self._resource_id_type_a self.biomart_id_type_b = self._resource_id_type_b self.attrs = ( self.biomart_id_type_a, self.biomart_id_type_b, ) self.biomart_mapping = BIOMART_MAPPING
[docs] class UnichemMapping(MappingInput): _resource_id_types = { id_type: id_type for id_type in unichem_input.unichem_sources().values() }
[docs] def __init__( self, id_type_a, id_type_b, ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC, ): """ Paramaters for UniChem based ID translation. Args: id_type_a: Custom name for one of the ID types. id_type_b: Custom name for the other ID type. """ MappingInput.__init__( self, type_ = 'unichem', id_type_a = id_type_a, id_type_b = id_type_b, ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC, )
[docs] class RampMapping(MappingInput): _resource_id_types = RAMP_MAPPING
[docs] def __init__( self, id_type_a, id_type_b, ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC, ): """ Paramaters for ID translation tables from the RaMP database. Args: id_type_a: Custom name for one of the ID types. id_type_b: Custom name for the other ID type. """ MappingInput.__init__( self, type_ = 'ramp', id_type_a = id_type_a, id_type_b = id_type_b, ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC, )
[docs] class HmdbMapping(MappingInput): _resource_id_types = HMDB_MAPPING
[docs] def __init__( self, id_type_a, id_type_b, ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC, ): """ Paramaters for ID translation tables from the Human Metabolome Database. Args: id_type_a: Custom name for one of the ID types. id_type_b: Custom name for the other ID type. """ MappingInput.__init__( self, type_ = 'hmdb', id_type_a = id_type_a, id_type_b = id_type_b, ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC, input_method = 'hmdb.metabolites_mapping', )
[docs] class ArrayMapping(MappingInput): """ Provides parameters for microarray probe mapping tables. :arg str id_type_a: Custom name for one of the ID types. :arg str id_type_b: Custom name for the other ID type. :arg str pro_id_type_a: This is the symbol PRO uses to label the IDs. These are included in the module and set automatically, the argument only gives a way to override this. :arg str pro_id_type_b: Same as above just for the other ID type. """ _resource_id_types = ARRAY_MAPPING
[docs] def __init__( self, id_type_a, id_type_b, ncbi_tax_id = 9606, ): MappingInput.__init__( self, type_ = 'array', id_type_a = self._get_id_type(id_type_a), id_type_b = self._get_id_type(id_type_b), ncbi_tax_id = ncbi_tax_id, resource_id_type_a = self._process_id_type(id_type_a), resource_id_type_b = self._process_id_type(id_type_b), ) self.ensembl_id = ( self.resource_id_type_a if self.id_type_a.startswith('ens') else self.resource_id_type_b ) self.array_id = ( self.resource_id_type_a if self.id_type_a in self._resource_id_types else self.resource_id_type_b ) self.entity_type = 'protein'
@classmethod def _process_id_type(cls, id_type: str, fail: bool = True): id_type = id_type.lower() id_type = 'affy' if id_type == 'affymetrix' else id_type id_type = 'ensg' if id_type == 'ensembl' else id_type if ( id_type not in cls._resource_id_types and id_type not in {'ensg', 'enst', 'ensp'} ): if fail: msg = ( 'Unknown ID type for microarray probe mapping: `%s`. ' 'Microarray ID types include `affy`, `illumina`, `agilent`, ' '`codelink` and `phalanx`, all these can be translated to ' 'Ensembl gene, transcript or peptide IDs: `ensg`, `enst` ' 'or `ensp`. If you translate to some other ID type, do it ' 'in multiple steps.' % str(id_type) ) _logger._log(msg) raise ValueError(msg) else: return None return id_type @classmethod def possible( cls, id_type_a: str, id_type_b: str, ncbi_tax_id: int | None = None, ) -> bool: return ( cls._process_id_type(id_type_a, fail = False) and cls._process_id_type(id_type_b, fail = False) )
[docs] class PickleMapping(MappingInput):
[docs] def __init__( self, id_type_a, id_type_b, fname, ncbi_tax_id = None, ): MappingInput.__init__( self, type_ = 'pickle', id_type_a = id_type_a, id_type_b = id_type_b, ncbi_tax_id = ncbi_tax_id, ) self.fname = fname
[docs] class NetworkInput:
[docs] def __init__( self, name = "unknown", separator = None, id_col_a = 0, id_col_b = 1, id_type_a = "uniprot", id_type_b = "uniprot", entity_type_a = "protein", entity_type_b = "protein", is_directed = False, sign = False, input = None, references = None, extra_edge_attrs = None, extra_node_attrs_a = None, extra_node_attrs_b = None, header = False, taxon_a = 9606, taxon_b = 9606, ncbi_tax_id = 9606, interaction_type = 'post_translational', positive_filters = None, negative_filters = None, mark_source = None, mark_target = None, input_args = None, curl_args = None, must_have_references = True, huge = False, resource = None, unique_fields = None, expand_complexes = None, data_model = None, allow_loops = None, only_default_organism = False, dataset = None, ): """ :param str mark_source: Creates a boolean vertex attribute and sets it True for the source vertex of directed interactions from this particular resource. :param str mark_target: Same as ``mark_source`` but for target vertices. """ self.entity_type_a = entity_type_a self.entity_type_b = entity_type_b self.id_col_a = id_col_a self.id_col_b = id_col_b self.id_type_a = id_type_a self.id_type_b = id_type_b self.is_directed = is_directed self.input = input self.extra_edge_attrs = extra_edge_attrs or {} self.extra_node_attrs_a = extra_node_attrs_a or {} self.extra_node_attrs_b = extra_node_attrs_b or {} self.name = name self.separator = separator self.header = header self.refs = references or None self.sign = sign self.taxon_a = taxon_a self.taxon_b = taxon_b self.ncbi_tax_id = ncbi_tax_id self.interaction_type = interaction_type self.positive_filters = positive_filters or [] self.negative_filters = negative_filters or [] self.input_args = input_args or {} self.curl_args = curl_args or {} self.must_have_references = must_have_references and bool(references) self.huge = huge self.resource = self.name if resource is None else resource self.mark_source = mark_source self.mark_target = mark_target self.unique_fields = unique_fields or set() self.expand_complexes = expand_complexes self.data_model = data_model self.allow_loops = allow_loops self.only_default_organism = only_default_organism self.dataset = dataset
def _field(self, value, cls): return value if isinstance(value, cls) else cls(compact = value)
[docs] class ReadList:
[docs] def __init__( self, name = 'unknown', separator = None, id_col = 0, id_type = 'uniprot', entity_type = 'protein', input = None, extra_attrs = None, header = False, ): self.entity_type = entity_type self.id_col = id_col self.id_type = id_type self.input = input self.extra_attrs = extra_attrs or {} self.name = name self.separator = separator self.header = header