Source code for pypath.inputs.mirbase
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of the `pypath` python module
#
# Copyright 2014-2023
# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
# Authors: see the file `README.rst`
# Contact: Dénes Türei (turei.denes@gmail.com)
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# https://www.gnu.org/licenses/gpl-3.0.html
#
# Website: https://pypath.omnipathdb.org/
#
from future.utils import iteritems
import re
import collections
import pypath.inputs.uniprot as uniprot_input
import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.share.common as common
import pypath_common._constants as _const
import pypath.utils.taxonomy as taxonomy
[docs]
def get_mirbase_aliases(organism = 9606):
    """
    Downloads and processes mapping tables from miRBase.

    Args:
        organism: Either a key of ``taxonomy.mirbase_taxids`` (the default
            is 9606), or a string which is used directly as the miRBase
            organism prefix, e.g. ``hsa``.

    Returns:
        A tuple of two dicts, ``(mat, mir)``. Both map an identifier from
        the first column of the aliases table to the set of names listed
        in its second column. ``mat`` holds the identifiers starting with
        ``MIMAT``, ``mir`` holds all other identifiers.

    Raises:
        ValueError: If ``organism`` is not a string and is not a key of
            ``taxonomy.mirbase_taxids``.
    """

    # `tuple(...)` lets `isinstance` work whatever container type
    # `CHAR_TYPES` is; unlike an exact `type()` comparison, it also
    # accepts subclasses of the string types.
    if isinstance(organism, tuple(_const.CHAR_TYPES)):

        mborganism = organism

    elif organism not in taxonomy.mirbase_taxids:

        # `%s` instead of `%u`: the latter raises TypeError for values
        # which are not numbers, hiding the intended ValueError.
        raise ValueError(
            'Organism not known: %s. Try to pass miRBase '
            'taxon prefix as string, e.g. `hsa`.' % organism
        )

    else:

        mborganism = taxonomy.mirbase_taxids[organism]

    mat = {}
    mir = {}

    url = urls.urls['mirbase']['aliases']
    c = curl.Curl(url, silent = False, large = True)

    for line in c.result:

        fields = line.strip().strip(';').split('\t')

        # blank or truncated lines carry no identifier-name pair
        if len(fields) < 2:

            continue

        identifier, names = fields[0], fields[1]

        # The organism code is everything before the first hyphen of the
        # name; comparing the whole code (not only its first three
        # characters) makes codes of any length work and avoids matching
        # other organisms whose code merely starts with the same letters.
        if names.split('-', 1)[0] != mborganism:

            continue

        target = mat if identifier.startswith('MIMAT') else mir
        target.setdefault(identifier, set()).update(names.split(';'))

    return mat, mir
[docs]
def mirbase_mature(organism = 9606):
    """
    Iterates the ``MIMAT`` entries of the miRBase aliases table.

    This is a generator: ``get_mirbase_aliases`` is called only when the
    iteration starts.

    Args:
        organism: Passed to ``get_mirbase_aliases`` unchanged.

    Yields:
        Tuples of two strings: an identifier from the first dict returned
        by ``get_mirbase_aliases`` and one of the names in its set; one
        tuple for each identifier-name pair.
    """

    # only the first (`MIMAT`) table is needed here
    mat, _ = get_mirbase_aliases(organism)

    for mimat, names in iteritems(mat):

        for name in names:

            yield mimat, name
[docs]
def mirbase_precursor(organism = 9606):
    """
    Iterates the non-``MIMAT`` entries of the miRBase aliases table.

    This is a generator: ``get_mirbase_aliases`` is called only when the
    iteration starts.

    Args:
        organism: Passed to ``get_mirbase_aliases`` unchanged.

    Yields:
        Tuples of two strings: an identifier from the second dict returned
        by ``get_mirbase_aliases`` and one of the names in its set; one
        tuple for each identifier-name pair.
    """

    # only the second (non-`MIMAT`) table is needed here
    _, mir = get_mirbase_aliases(organism)

    for identifier, names in iteritems(mir):

        for name in names:

            yield identifier, name
[docs]
def mirbase_precursor_to_mature(organism = 9606):
    """
    Maps the names yielded by ``mirbase_precursor`` to the first elements
    of the pairs yielded by ``mirbase_ids``.

    The two sources are connected by the identifier which is the first
    element of the ``mirbase_precursor`` pairs and the second element of
    the ``mirbase_ids`` pairs.

    This is a generator: nothing is loaded until the iteration starts.

    Args:
        organism: Passed to ``mirbase_ids`` and ``mirbase_precursor``
            unchanged.

    Yields:
        Tuples of a name from ``mirbase_precursor`` and a first element
        from ``mirbase_ids`` sharing the same connecting identifier.
    """

    # connecting identifier -> first elements of the `mirbase_ids` pairs
    mature_by_precursor = collections.defaultdict(set)

    for mature_id, precursor_id in mirbase_ids(organism):

        mature_by_precursor[precursor_id].add(mature_id)

    # name -> connecting identifiers
    precursors_by_name = collections.defaultdict(set)

    for precursor_id, name in mirbase_precursor(organism):

        precursors_by_name[name].add(precursor_id)

    for name, precursor_ids in iteritems(precursors_by_name):

        for precursor_id in precursor_ids:

            # `.get` avoids adding empty entries to the defaultdict
            for mature_id in mature_by_precursor.get(precursor_id, ()):

                yield name, mature_id
[docs]
def mirbase_ids(organism = 9606):
    """
    Pairs the ``MIMAT`` identifiers with identifiers from the other
    (non-``MIMAT``) alias table of ``get_mirbase_aliases``.

    The names of the ``MIMAT`` entries are reduced by removing ``*``,
    ``-3p`` and ``-5p``; the names of the other entries are extended by
    variants with a trailing ``-<digits>`` part removed. The two tables
    are then joined on these names by ``common.join_dicts``.

    This is a generator: nothing is loaded until the iteration starts.

    Args:
        organism: Passed to ``get_mirbase_aliases`` unchanged.

    Yields:
        Tuples of a ``MIMAT`` identifier and one of the values assigned to
        it by the join.
    """

    # NOTE(review): the `A-z` range also covers the ASCII punctuation
    # between `Z` and `a`; presumably `A-Za-z` was meant -- confirm.
    re_numbered_suffix = re.compile(r'([-A-z]*[-]?\d+[a-z]*)(-\d*)')

    def strip_arm_markers(mature_name):

        for marker in ('*', '-3p', '-5p'):

            mature_name = mature_name.replace(marker, '')

        return mature_name

    mature_aliases, other_aliases = get_mirbase_aliases(organism)

    # add the suffix-less variant of each name next to the original names
    extended = {}

    for identifier, names in iteritems(other_aliases):

        trimmed = set(re_numbered_suffix.sub(r'\1', name) for name in names)
        extended[identifier] = set.union(trimmed, names)

    by_name = common.swap_dict(extended)

    mature_names = {
        identifier: set(strip_arm_markers(name) for name in names)
        for identifier, names in iteritems(mature_aliases)
    }

    # count how many names match as they are and how many after lowercasing
    exact_hits = 0
    lower_hits = 0

    for names in mature_names.values():

        for name in names:

            exact_hits += name in by_name
            lower_hits += name.lower() in by_name

    # use the lowercase names only if those result in more matches
    if exact_hits < lower_hits:

        mature_names = {
            identifier: set(name.lower() for name in names)
            for identifier, names in iteritems(mature_names)
        }

    joined = common.join_dicts(mature_names, by_name)

    for mature_id, targets in iteritems(joined):

        # a single simple value is handled like a one element collection
        if type(targets) in _const.SIMPLE_TYPES:

            targets = [targets]

        for target in targets:

            yield mature_id, target
[docs]
def mirbase_mature_all(organism = 9606):
    """
    Collects the first element of every pair yielded by ``mirbase_ids``.

    Args:
        organism: Passed to ``mirbase_ids`` unchanged.

    Returns:
        A list with one item for each pair, in the order of iteration;
        repeated values are kept.
    """

    firsts = []

    for pair in mirbase_ids(organism = organism):

        firsts.append(pair[0])

    return firsts
[docs]
def mirbase_precursor_all(organism = 9606):
    """
    Collects the second element of every pair yielded by ``mirbase_ids``.

    Args:
        organism: Passed to ``mirbase_ids`` unchanged.

    Returns:
        A list with one item for each pair, in the order of iteration;
        repeated values are kept.
    """

    seconds = []

    for pair in mirbase_ids(organism = organism):

        seconds.append(pair[1])

    return seconds