Source code for pypath.inputs.matrisome

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

import collections

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.utils.mapping as mapping
import pypath.inputs.common as inputs_common


[docs] def matrisome_annotations(organism = 9606): """ Downloads MatrisomeDB 2.0, a database of extracellular matrix proteins. Returns dict where keys are UniProt IDs and values are tuples of classes, subclasses and notes. """ MatrisomeAnnotation = collections.namedtuple( 'MatrisomeAnnotation', ['mainclass', 'subclass', 'subsubclass'] ) tax_names = { 10090: ('Murine', 'mm'), 9606: ('Human', 'hs'), } url = urls.urls['matrisome']['url_rescued'] % tax_names[organism][1] c = curl.Curl(url, large = True, silent = False) xlsname = c.fname del(c) raw = inputs_common.read_xls(xlsname)[1:] result = collections.defaultdict(set) for r in raw: uniprots = set(r[7].split(':')) uniprots.discard('') if not uniprots: continue uniprots = mapping.map_names(uniprots, 'uniprot', 'uniprot') for uniprot in uniprots: result[uniprot].add( MatrisomeAnnotation( mainclass = r[0].strip(), subclass = r[1].strip(), subsubclass = r[10].strip() or None, ) ) return dict(result)
def __matrisome_annotations_2(): """ This I made only to find out why certain proteins are missing from this output. I will contact Matrisome people to ask why. """ url = urls.urls['matrisome']['url_dl'] c = curl.Curl(url, large = True, silent = False) _ = next(c.result) return set(r.split(',')[1] for r in c.result)