#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of the `pypath` python module
#
# Copyright 2014-2023
# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
# Authors: see the file `README.rst`
# Contact: Dénes Türei (turei.denes@gmail.com)
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# https://www.gnu.org/licenses/gpl-3.0.html
#
# Website: https://pypath.omnipathdb.org/
#
from __future__ import annotations
import copy
import pypath.share.settings as settings
import pypath.share.session as session
import pypath_common._constants as _const
import pypath.inputs.uniprot_idmapping as uniprot_idmapping
import pypath.inputs.unichem as unichem_input
# Module level logger; ArrayMapping._process_id_type writes its
# "unknown ID type" message through it before raising.
_logger = session.Logger(name = 'input_formats')
# Names exported on `from <module> import *`.
# NOTE(review): 'NetworkInput' is not defined in the visible part of this
# file, while several classes defined below (UniprotMapping, UnichemMapping,
# RampMapping, HmdbMapping) are not listed -- confirm this is intended.
__all__ = [
'FileMapping',
'PickleMapping',
'NetworkInput',
'ReadList',
'UniprotListMapping',
'ProMapping',
'ArrayMapping',
'BiomartMapping',
]
# pypath ID type label -> resource side label; this table is the
# `_resource_id_types` of UniprotMapping. All values are plain strings.
# Presumably these are UniProt query field names -- TODO confirm.
AC_QUERY = {
'genesymbol': 'gene_primary',
'genesymbol-syn': 'gene_synonym',
'hgnc': 'xref_hgnc',
'embl': 'xref_embl',
'entrez': 'xref_geneid',
'geneid': 'xref_geneid',
'refseqp': 'xref_refseq',
'enst': 'xref_ensembl',
'uniprot-entry': 'id',
'protein-name': 'protein_name',
'gene-name': 'gene_names',
'gene-orf': 'gene_orf',
'gene-oln': 'gene_oln',
'ec': 'ec',
}
# pypath ID type label -> resource side label; this table is the
# `_resource_id_types` of UniprotListMapping, whose `possible` method looks
# the translated labels up in `uniprot_idmapping.idtypes()`.
AC_MAPPING = {
'uniprot': 'UniProtKB',
'uniprot-entry': 'UniProtKB',
'embl': 'EMBL-GeneBank-DDBJ',
'embl_id': 'EMBL-GeneBank-DDBJ_CDS',
'pir': 'PIR',
'entrez': 'GeneID',
'gi': 'GI_number',
'refseqp': 'RefSeq_Protein',
'refseqn': 'RefSeq_Nucleotide',
'ensembl': 'Ensembl',
'ensp': 'Ensembl_Protein',
'enst': 'Ensembl_Transcript',
'ensg': 'Ensembl',
'ensgp': 'Ensembl_Genomes_Protein',
'ensgt': 'Ensembl_Genomes_Transcript',
'hgnc': 'HGNC',
'ensp_string': 'STRING',
'genesymbol': 'Gene_Name',
}
# pypath ID type label -> resource side label; this table is the
# `_resource_id_types` of BiomartMapping.
# Presumably the values are BioMart attribute names -- TODO confirm.
BIOMART_MAPPING = {
'hgnc_symbol': 'hgnc_symbol',
'rnacentral': 'rnacentral',
'hgnc_trans_name': 'hgnc_trans_name',
'wikigene_name': 'wikigene_name',
'gene_name': 'external_gene_name',
'genesymbol': 'external_gene_name',
'transcript_name': 'external_transcript_name',
'gene_description': 'description',
'gene_synonym': 'external_synonym',
'interpro_description': 'interpro_description',
'interpro': 'interpro',
'interpro_short_description': 'interpro_short_description',
'enst_biomart': 'ensembl_transcript_id',
'ensg_biomart': 'ensembl_gene_id',
'ensp_biomart': 'ensembl_peptide_id',
'ensembl_gene_id': 'ensembl_gene_id',
'ensembl_transcript_id': 'ensembl_transcript_id',
'ensembl_peptide_id': 'ensembl_peptide_id',
'uniprot': 'uniprotswissprot',
'trembl': 'uniprotsptrembl',
}
# pypath ID type label -> resource side label; this table is the
# `_resource_id_types` of ProMapping, which also indexes it directly to
# find the default `pro_id_type`.
PRO_MAPPING = {
'alzforum': 'Alzforum_mut',
'araport': 'Araport',
'cgnc': 'CGNC',
'dictybase': 'dictyBase',
'dto': 'DTO',
'ecocyc': 'EcoCyc',
'ecogene': 'EcoGene',
'ensembl_pro': 'Ensembl',
'ensembl_bacteria': 'EnsemblBacteria',
'flybase': 'FlyBase',
'hgnc': 'HGNC',
'iuphar_fam': 'IUPHARfam',
'iuphar': 'IUPHARobj',
'mgi': 'MGI',
'mro': 'MRO',
'ncbi_gene': 'NCBIGene',
# NOTE(review): the key 'pbd' looks like a typo of 'pdb' -- confirm before
# changing, callers may already rely on this spelling.
'pbd': 'PDB',
'pombase': 'PomBase',
# NOTE(review): 'interpro' pointing to 'PRO' looks suspicious -- confirm.
'interpro': 'PRO',
'reactome': 'Reactome',
'rgd': 'RGD',
'sgd': 'SGD',
'tdr': 'TDR',
'uniprot': 'UniProtKB',
'uniprot-var': 'UniProtKB_VAR',
'wormbase': 'WormBase',
'zfin': 'ZFIN',
}
# Set (not a dict) of microarray ID type labels; this is the
# `_resource_id_types` of ArrayMapping. ArrayMapping._process_id_type
# rewrites 'affymetrix' to 'affy' before testing membership here.
ARRAY_MAPPING = {
'affy',
'affymetrix',
'illumina',
'agilent',
'codelink',
'phalanx',
}
# pypath ID type label -> resource side label; this table is the
# `_resource_id_types` of RampMapping. Several pypath labels are aliases
# of the same RaMP label (e.g. 'cas' and 'cas_id').
RAMP_MAPPING = {
'cas': 'CAS',
'cas_id': 'CAS',
'lipidmaps': 'LIPIDMAPS',
'en': 'EN',
'enzymatic_nomenclature': 'EN',
'genesymbol': 'gene_symbol',
'pubchem_compound': 'pubchem',
'pubchem_cid': 'pubchem',
}
# pypath ID type label -> resource side label; this table is the
# `_resource_id_types` of HmdbMapping.
# Presumably the values are field names of HMDB records -- TODO confirm.
HMDB_MAPPING = {
'hmdb': 'accession',
'pubchem_cid': 'pubchem_compound',
'pubchem': 'pubchem_compound',
'phenolexplorer': 'phenol_explorer_compound',
'cas': 'cas_registry_number',
'formula': 'chemical_formula',
'inchi': 'inchi',
'inchikey': 'inchikey',
'hmdb_name': 'name',
'hmdb_synonym': 'synonyms',
'smiles': 'smiles',
'iupac': 'traditional_iupac',
}
[docs]
class FileMapping(MappingInput):
    """
    Parameters of an ID translation table that is read from a file.

    :arg str id_type_a:
        Custom name for one of the ID types.
    :arg str id_type_b:
        Custom name for the other ID type.
    :arg input_:
        Stored as ``self.input``; presumably a file name or the name of
        an input method -- confirm against the code reading the table.
    :arg col_a:
        Column of the ``id_type_a`` identifiers (presumably an index).
    :arg col_b:
        Column of the ``id_type_b`` identifiers (presumably an index).
    :arg separator:
        Field separator, stored unchanged.
    :arg header:
        Header setting, stored unchanged (presumably the number of
        header lines -- confirm).
    :arg ncbi_tax_id:
        NCBI Taxonomy ID of the organism, forwarded to ``MappingInput``.
    :arg str entity_type:
        Type of the molecular entities, `protein` by default.
    """

    def __init__(
            self,
            id_type_a,
            id_type_b,
            input_,
            col_a,
            col_b,
            separator = None,
            header = 0,
            ncbi_tax_id = None,
            entity_type = 'protein',
        ):

        super().__init__(
            type_ = 'file',
            id_type_a = id_type_a,
            id_type_b = id_type_b,
            ncbi_tax_id = ncbi_tax_id,
        )

        self.input = input_
        self.col_a, self.col_b = col_a, col_b
        self.separator = separator
        self.header = header
        self.entity_type = entity_type
        # the organism is read back from the instance, i.e. in the form
        # the base class stored it
        self.input_args = {'organism': self.ncbi_tax_id}

    def set_organism(self, ncbi_tax_id):
        """
        Returns a deep copy of this object with the organism replaced;
        the original instance is left untouched.
        """

        clone = copy.deepcopy(self)
        clone.ncbi_tax_id = ncbi_tax_id

        # keep the input arguments in sync, but only if the key exists
        if 'organism' in clone.input_args:

            clone.input_args['organism'] = ncbi_tax_id

        return clone

    @classmethod
    def possible(
            cls,
            id_type_a: str,
            id_type_b: str,
            ncbi_tax_id: int | None = None,
        ) -> bool:
        """
        Not implemented for file based tables: always raises
        ``NotImplementedError``.
        """

        raise NotImplementedError
[docs]
class UniprotMapping(MappingInput):
    """
    Parameters of an ID translation table retrieved from UniProt.
    """

    _resource_id_type_b = 'accession'
    _resource_id_types = AC_QUERY

    def __init__(
            self,
            id_type_a,
            id_type_b = 'uniprot',
            ncbi_tax_id = 9606,
            swissprot = 'true',
        ):
        """
        Defines an ID conversion table to retrieve from UniProt.

        id_type_a : str
            Type of accession numbers you would like to translate.
        id_type_b : str
            Type of accession numbers you would like to translate to.
        ncbi_tax_id : int
            NCBI Taxonomy ID of the organism of interest; converted
            to `int`.
        swissprot : str
            Look for SwissProt or Trembl. Stored unchanged in
            ``self.swissprot``; the default is the string `true`.
            Presumably passed on to UniProt's `reviewed` parameter, and
            None fetches Trembl and SwissProt together -- confirm against
            the code performing the query.
        """

        self.type = 'uniprot'

        MappingInput.__init__(
            self,
            type_ = 'uniprot',
            id_type_a = id_type_a,
            id_type_b = id_type_b,
            ncbi_tax_id = ncbi_tax_id,
        )

        self.ncbi_tax_id = int(ncbi_tax_id)
        self.typ = 'protein'
        self.swissprot = swissprot

    def set_organism(self, ncbi_tax_id):
        """
        Returns a deep copy of this object with the organism replaced.
        """

        other_organism = copy.deepcopy(self)
        other_organism.ncbi_tax_id = ncbi_tax_id

        return other_organism

    def _field_and_subfield(self):
        """
        Looks up `id_type_a` in `AC_QUERY` and returns a
        `(field, subfield)` pair; both are None for unknown ID types.
        """

        entry = AC_QUERY.get(self.id_type_a)

        # The values of `AC_QUERY` are plain strings: the whole string is
        # the field, and there is no subfield. Indexing the string would
        # yield single characters.
        if entry is None or isinstance(entry, str):

            return entry, None

        # tolerate sequence valued entries, padding the missing items
        return (tuple(entry) + (None, None))[:2]

    @property
    def field(self):
        """
        The UniProt side label of `id_type_a`; None if not known.
        """

        return self._field_and_subfield()[0]

    @property
    def subfield(self):
        """
        The subfield belonging to `id_type_a`; None if there is none.
        """

        return self._field_and_subfield()[1]

    @staticmethod
    def resource_id_type(id_type, override = None):
        """
        For an ID type label used in pypath, returns the one used in the
        UniProt web service. If the label is not available in the built in
        list, the label itself is returned unchanged.

        Args
            id_type (str): An ID type label.
            override: Accepted for compatibility, currently ignored.

        Returns
            (str): The ID type label used by UniProt; the input label
                if it is not known.
        """

        return AC_QUERY.get(id_type, id_type)

    @classmethod
    def possible(
            cls,
            id_type_a: str,
            id_type_b: str,
            ncbi_tax_id: int | None = None,
        ) -> bool:
        """
        Tells if both ID types are known to this resource: each must be
        a key or a value of the built in table, or `uniprot`, or start
        with `xref_`. The organism is not considered.
        """

        return all(
            (
                id_type in cls._resource_id_types or
                id_type in cls._resource_id_types.values() or
                id_type == 'uniprot' or
                id_type.startswith('xref_')
            )
            for id_type in (id_type_a, id_type_b)
        )
[docs]
class UniprotListMapping(MappingInput):
    """
    Parameters for retrieving a mapping table from the UniProt
    `Upload Lists` webservice.

    :arg str id_type_a:
        Custom name for one of the ID types.
    :arg str id_type_b:
        Custom name for the other ID type.
    :arg str uniprot_id_type_a:
        The symbol the UniProt webservice uses for the first ID type.
        These are built into the module and set automatically, the
        argument is only a way to override them.
    :arg str uniprot_id_type_b:
        Same as above, for the other ID type.
    :arg int ncbi_tax_id:
        NCBI Taxonomy ID of the organism, human (9606) by default.
    :arg bool swissprot:
        Download data only for SwissProt IDs. If None, the value is
        derived from the ID types.
    """

    _resource_id_types = AC_MAPPING
    # labels of the UniProt ID types when they are on the source side...
    _from_uniprot = {
        'uniprot': 'UniProtKB_AC-ID',
        'swissprot': 'UniProtKB_AC-ID',
        'trembl': 'UniProtKB_AC-ID',
    }
    # ...and when they are on the target side
    _to_uniprot = {
        'uniprot': 'UniProtKB',
        'swissprot': 'UniProtKB-Swiss-Prot',
        'trembl': 'UniProtKB',
    }

    def __init__(
            self,
            id_type_a,
            id_type_b,
            uniprot_id_type_a = None,
            uniprot_id_type_b = None,
            ncbi_tax_id = 9606,
            swissprot = None,
        ):

        super().__init__(
            type_ = 'uniprot_list',
            id_type_a = id_type_a,
            id_type_b = id_type_b,
            ncbi_tax_id = ncbi_tax_id,
            resource_id_type_a = uniprot_id_type_a,
            resource_id_type_b = uniprot_id_type_b,
        )

        self._set_swissprot(swissprot)
        self.ac_mapping = AC_MAPPING
        self._update_uniprot_types()
        self.entity_type = 'protein'

    def set_organism(self, ncbi_tax_id):
        """
        Returns a deep copy of this object with the organism replaced.
        """

        clone = copy.deepcopy(self)
        clone.ncbi_tax_id = ncbi_tax_id

        return clone

    def swap_sides(self):
        """
        Swaps the two sides by the base class method, then refreshes
        the UniProt specific ID type attributes.
        """

        super().swap_sides()
        self._update_uniprot_types()

    def _update_uniprot_types(self):
        """
        Copies the resource side ID types into the `uniprot_id_type_a`
        and `uniprot_id_type_b` attributes.
        """

        self.uniprot_id_type_a, self.uniprot_id_type_b = (
            self._resource_id_type_a,
            self._resource_id_type_b,
        )

    def _resource_id_type(self, side: str) -> str:
        """
        The webservice label of the ID type on one side (`a` or `b`).
        The UniProt specific labels take precedence over the generic
        table; unknown labels are returned unchanged.
        """

        by_side = {
            'a': self._from_uniprot,
            'b': self._to_uniprot,
        }
        uniprot_labels = by_side.get(side)
        label = getattr(self, f'id_type_{side}')
        generic = self._resource_id_types.get(label, label)

        return uniprot_labels.get(label, generic)

    def _set_swissprot(self, swissprot: bool | None) -> None:
        """
        Sets the `swissprot` attribute. If None is provided, the value
        is looked up by the ID type of side `a`, then of side `b`; if
        neither is a UniProt ID type, it remains None.
        """

        defaults = {'swissprot': True, 'trembl': False, 'uniprot': True}

        if swissprot is None:

            by_side_b = defaults.get(self.id_type_b, swissprot)
            swissprot = defaults.get(self.id_type_a, by_side_b)

        self.swissprot = swissprot

    @classmethod
    def _uniprotkb_id_type(cls, id_type: str) -> bool:
        """
        Tells if the label is one of the UniProt ID types.
        """

        return id_type in cls._from_uniprot

    @classmethod
    def possible(
            cls,
            id_type_a: str,
            id_type_b: str,
            ncbi_tax_id: int | None = None,
        ) -> bool:
        """
        Tells if the pair of ID types, once translated to the labels of
        the webservice, is among the pairs provided by
        `uniprot_idmapping.idtypes`. The organism is not considered.
        """

        source = cls._from_uniprot.get(id_type_a, id_type_a)
        source = cls._resource_id_types.get(source, source)
        target = cls._to_uniprot.get(id_type_b, id_type_b)
        target = cls._resource_id_types.get(target, target)

        return (source, target) in uniprot_idmapping.idtypes()
[docs]
class ProMapping(MappingInput):
    """
    Provides parameters for mapping table from the Protein Ontology
    Consortium.

    :arg str id_type_a:
        Custom name for one of the ID types.
    :arg str id_type_b:
        Custom name for the other ID type.
    :arg str pro_id_type_a:
        This is the symbol PRO uses to label the IDs.
        These are included in the module and set
        automatically, the argument only gives a way to override this.
    :arg str pro_id_type_b:
        Same as above just for the other ID type.
    :arg ncbi_tax_id:
        Accepted for a uniform interface; it is not used, the base class
        always receives -1.
    """

    _resource_id_types = PRO_MAPPING

    def __init__(
            self,
            id_type_a,
            id_type_b = None,
            pro_id_type_a = None,
            pro_id_type_b = None,
            ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC,
        ):

        # the table is always stored with `pro` on side `a`; `to_pro`
        # records if the caller asked for the opposite direction
        to_pro = id_type_a != 'pro'
        id_type = id_type_a if to_pro else id_type_b
        pro_id_type = (
            pro_id_type_a if to_pro else pro_id_type_b
        )

        MappingInput.__init__(
            self,
            type_ = 'pro',
            id_type_a = 'pro',
            id_type_b = id_type,
            ncbi_tax_id = -1,
            resource_id_type_a = pro_id_type_a,
            resource_id_type_b = pro_id_type_b,
        )

        self.to_pro = to_pro
        self.id_type = id_type
        self.pro_mapping = PRO_MAPPING
        # without an override the label comes from the built in table;
        # this raises KeyError for an ID type missing from the table
        self.pro_id_type = pro_id_type or self.pro_mapping[self.id_type_b]
        self.entity_type = 'protein'

    @classmethod
    def possible(
            cls,
            id_type_a: str,
            id_type_b: str,
            ncbi_tax_id: int | None = None,
        ) -> bool:
        """
        Tells if a translation between the two ID types is available
        from PRO: one of them must be `pro`, and the other one a key or
        a value of the built in table. The organism is not considered.
        """

        if id_type_a == 'pro':

            other = id_type_b

        elif id_type_b == 'pro':

            other = id_type_a

        else:

            return False

        return (
            other in cls._resource_id_types or
            other in cls._resource_id_types.values()
        )
[docs]
class BiomartMapping(MappingInput):
    """
    Parameters of an ID translation table of type `biomart`.

    :arg str id_type_a:
        Custom name for one of the ID types.
    :arg str id_type_b:
        Custom name for the other ID type.
    :arg bool transcript:
        Accepted, but not used within this class.
    :arg str biomart_id_type_a:
        Overrides the resource side label of the first ID type.
    :arg str biomart_id_type_b:
        Same as above, for the other ID type.
    :arg int ncbi_tax_id:
        NCBI Taxonomy ID of the organism, human (9606) by default.
    """

    _resource_id_types = BIOMART_MAPPING

    def __init__(
            self,
            id_type_a,
            id_type_b = None,
            transcript = False,
            biomart_id_type_a = None,
            biomart_id_type_b = None,
            ncbi_tax_id = 9606,
        ):

        super().__init__(
            type_ = 'biomart',
            id_type_a = id_type_a,
            id_type_b = id_type_b,
            ncbi_tax_id = ncbi_tax_id,
            resource_id_type_a = biomart_id_type_a,
            resource_id_type_b = biomart_id_type_b,
        )

        # resource side labels of the two sides, as resolved by the
        # base class
        side_a = self._resource_id_type_a
        self.biomart_id_type_a = side_a
        side_b = self._resource_id_type_b
        self.biomart_id_type_b = side_b
        self.attrs = (side_a, side_b)
        self.biomart_mapping = BIOMART_MAPPING
[docs]
class UnichemMapping(MappingInput):
    """
    Parameters of an ID translation table of type `unichem`.
    """

    # every label returned by `unichem_sources` maps to itself; note
    # that the function is called when the class body is executed
    _resource_id_types = {
        source: source
        for source in unichem_input.unichem_sources().values()
    }

    def __init__(
            self,
            id_type_a,
            id_type_b,
            ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC,
        ):
        """
        Parameters for UniChem based ID translation.

        Args:
            id_type_a:
                Custom name for one of the ID types.
            id_type_b:
                Custom name for the other ID type.
            ncbi_tax_id:
                Accepted for a uniform interface; not used, the table
                is always created as not organism specific.
        """

        super().__init__(
            type_ = 'unichem',
            id_type_a = id_type_a,
            id_type_b = id_type_b,
            ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC,
        )
[docs]
class RampMapping(MappingInput):
    """
    Parameters of an ID translation table of type `ramp`.
    """

    _resource_id_types = RAMP_MAPPING

    def __init__(
            self,
            id_type_a,
            id_type_b,
            ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC,
        ):
        """
        Parameters for ID translation tables from the RaMP database.

        Args:
            id_type_a:
                Custom name for one of the ID types.
            id_type_b:
                Custom name for the other ID type.
            ncbi_tax_id:
                Accepted for a uniform interface; not used, the table
                is always created as not organism specific.
        """

        super().__init__(
            type_ = 'ramp',
            id_type_a = id_type_a,
            id_type_b = id_type_b,
            ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC,
        )
[docs]
class HmdbMapping(MappingInput):
    """
    Parameters of an ID translation table of type `hmdb`.
    """

    _resource_id_types = HMDB_MAPPING

    def __init__(
            self,
            id_type_a,
            id_type_b,
            ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC,
        ):
        """
        Parameters for ID translation tables from the
        Human Metabolome Database.

        Args:
            id_type_a:
                Custom name for one of the ID types.
            id_type_b:
                Custom name for the other ID type.
            ncbi_tax_id:
                Accepted for a uniform interface; not used, the table
                is always created as not organism specific.
        """

        super().__init__(
            type_ = 'hmdb',
            id_type_a = id_type_a,
            id_type_b = id_type_b,
            ncbi_tax_id = _const.NOT_ORGANISM_SPECIFIC,
            input_method = 'hmdb.metabolites_mapping',
        )
[docs]
class ArrayMapping(MappingInput):
    """
    Provides parameters for microarray probe mapping tables.

    :arg str id_type_a:
        Name of one of the ID types: either a microarray ID type
        (`affy`, `illumina`, `agilent`, `codelink`, `phalanx`) or an
        Ensembl one (`ensg`, `enst`, `ensp`). The labels are case
        insensitive; `affymetrix` and `ensembl` are accepted as synonyms
        of `affy` and `ensg`.
    :arg str id_type_b:
        Name of the other ID type, same values as above.
    :arg int ncbi_tax_id:
        NCBI Taxonomy ID of the organism, human (9606) by default.
    """

    _resource_id_types = ARRAY_MAPPING

    def __init__(
            self,
            id_type_a,
            id_type_b,
            ncbi_tax_id = 9606,
        ):

        # NOTE(review): `_get_id_type` and the `resource_id_type_a/b`
        # attributes used below are not defined in this class; presumably
        # they are provided by `MappingInput` -- TODO confirm.
        MappingInput.__init__(
            self,
            type_ = 'array',
            id_type_a = self._get_id_type(id_type_a),
            id_type_b = self._get_id_type(id_type_b),
            ncbi_tax_id = ncbi_tax_id,
            resource_id_type_a = self._process_id_type(id_type_a),
            resource_id_type_b = self._process_id_type(id_type_b),
        )

        # whichever side starts with `ens` is the Ensembl side...
        self.ensembl_id = (
            self.resource_id_type_a
                if self.id_type_a.startswith('ens') else
            self.resource_id_type_b
        )
        # ...and whichever is a microarray label is the probe side
        self.array_id = (
            self.resource_id_type_a
                if self.id_type_a in self._resource_id_types else
            self.resource_id_type_b
        )
        self.entity_type = 'protein'

    @classmethod
    def _process_id_type(cls, id_type: str, fail: bool = True):
        """
        Normalizes an ID type label: converts it to lower case and
        resolves the synonyms `affymetrix` and `ensembl`.

        Args:
            id_type: An ID type label.
            fail: Raise an error for unknown labels instead of
                returning None.

        Returns:
            The normalized label; None if the label is unknown and
            `fail` is False.

        Raises:
            ValueError: If the label is unknown and `fail` is True; the
                message is also sent to the module logger.
        """

        id_type = id_type.lower()
        id_type = {'affymetrix': 'affy', 'ensembl': 'ensg'}.get(
            id_type,
            id_type,
        )

        if (
            id_type in cls._resource_id_types or
            id_type in {'ensg', 'enst', 'ensp'}
        ):

            return id_type

        if not fail:

            return None

        msg = (
            'Unknown ID type for microarray probe mapping: `%s`. '
            'Microarray ID types include `affy`, `illumina`, `agilent`, '
            '`codelink` and `phalanx`, all these can be translated to '
            'Ensembl gene, transcript or peptide IDs: `ensg`, `enst` '
            'or `ensp`. If you translate to some other ID type, do it '
            'in multiple steps.' % str(id_type)
        )
        _logger._log(msg)

        raise ValueError(msg)

    @classmethod
    def possible(
            cls,
            id_type_a: str,
            id_type_b: str,
            ncbi_tax_id: int | None = None,
        ) -> bool:
        """
        Tells if both ID types are known microarray or Ensembl ID types.
        The organism is not considered.

        NOTE(review): this does not check that exactly one side is a
        microarray and the other an Ensembl ID type.
        """

        # `bool` makes the result match the annotation: the bare `and`
        # chain would return a label or None
        return bool(
            cls._process_id_type(id_type_a, fail = False) and
            cls._process_id_type(id_type_b, fail = False)
        )
[docs]
class PickleMapping(MappingInput):
    """
    Parameters of an ID translation table of type `pickle`.

    :arg str id_type_a:
        Custom name for one of the ID types.
    :arg str id_type_b:
        Custom name for the other ID type.
    :arg fname:
        Stored as ``self.fname``; presumably the path of the pickle
        file -- confirm against the code loading the table.
    :arg ncbi_tax_id:
        NCBI Taxonomy ID of the organism, forwarded to ``MappingInput``.
    """

    def __init__(
            self,
            id_type_a,
            id_type_b,
            fname,
            ncbi_tax_id = None,
        ):

        super().__init__(
            type_ = 'pickle',
            id_type_a = id_type_a,
            id_type_b = id_type_b,
            ncbi_tax_id = ncbi_tax_id,
        )

        self.fname = fname
[docs]
class ReadList:
    """
    Plain container of the parameters describing a list of identifiers
    to be read from an input.

    :arg str name:
        Name of the list, `unknown` by default.
    :arg separator:
        Field separator, stored unchanged.
    :arg id_col:
        Column of the identifiers (presumably an index), 0 by default.
    :arg str id_type:
        ID type of the identifiers, `uniprot` by default.
    :arg str entity_type:
        Type of the molecular entities, `protein` by default.
    :arg input:
        The input, stored unchanged (presumably a file name or the name
        of an input method -- confirm against the reader).
    :arg dict extra_attrs:
        Extra attributes; any falsy value is replaced by a new, empty
        dict.
    :arg header:
        Header setting, stored unchanged; False by default.
    """

    def __init__(
            self,
            name = 'unknown',
            separator = None,
            id_col = 0,
            id_type = 'uniprot',
            entity_type = 'protein',
            input = None,
            extra_attrs = None,
            header = False,
        ):

        self.name = name
        self.input = input
        self.separator = separator
        self.header = header
        self.id_col = id_col
        self.id_type = id_type
        self.entity_type = entity_type
        # a fresh dict for each instance, never a shared default
        self.extra_attrs = extra_attrs if extra_attrs else {}