Source code for pypath.inputs.complexportal

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  This file is part of the `pypath` python module
#
#  Copyright 2014-2023
#  EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University
#
#  Authors: see the file `README.rst`
#  Contact: Dénes Türei (turei.denes@gmail.com)
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      https://www.gnu.org/licenses/gpl-3.0.html
#
#  Website: https://pypath.omnipathdb.org/
#

from future.utils import iteritems

import collections

import bs4

import pypath.share.curl as curl
import pypath.resources.urls as urls
import pypath.internals.intera as intera


def complexportal_complexes(organism = 9606, return_details = False):
    """
    Complex dataset from IntAct (Complex Portal).

    Downloads the per-organism zip archive from the URL configured under
    ``urls.urls['complex_portal']['url']``, parses every file of the
    archive with BeautifulSoup and builds one ``intera.Complex`` for each
    ``interaction`` element that has at least one UniProt component.

    See more:
    http://www.ebi.ac.uk/intact/complex/
    http://nar.oxfordjournals.org/content/early/2014/10/13/nar.gku975.full.pdf

    :param int organism:
        Numeric organism identifier. Only the organisms listed in the
        internal ``spec`` mapping are supported (currently 9606, which
        maps to the ``Homo_sapiens`` archive).
    :param bool return_details:
        If ``True``, a list of dicts with the raw details of each complex
        (``uniprots``, ``pdbs``, ``pubmeds``, ``fullname``, ``names``,
        ``description``) is returned along with the complexes.

    :return:
        A dict of ``intera.Complex`` objects keyed by their string
        representation; complexes with identical string representation
        are merged by the ``+=`` operator. If ``return_details`` is
        ``True``, a tuple of this dict and the list of detail dicts.

    :raises KeyError:
        If ``organism`` is not among the supported organisms.
    """

    spec = {9606: 'Homo_sapiens'}

    # Fail early, before any download is attempted, with a message that
    # tells which organisms are available. The exception type is the same
    # as what the plain ``spec[organism]`` lookup would raise.
    if organism not in spec:

        raise KeyError(
            'ComplexPortal: unsupported organism `%s`; supported: %s' % (
                organism,
                ', '.join(str(o) for o in sorted(spec)),
            )
        )

    zipurl = '%s/%s.zip' % (
        urls.urls['complex_portal']['url'],
        spec[organism],
    )
    c = curl.Curl(zipurl, large = True, silent = False)

    complexes = {}
    details = []
    name_key = 'complex recommended name'
    pdb_dbs = {'wwpdb', 'rcsb pdb', 'pdbe'}
    id_dbs = {'intact', 'reactome'}

    # `c.result` maps the names of the files in the archive to their
    # contents; only the contents are needed here.
    for _xmlname, xml in c.result.items():

        # NOTE(review): tag names are queried in lowercase below,
        # presumably because `html.parser` lowercases them -- confirm
        # before switching to another parser.
        soup = bs4.BeautifulSoup(xml, 'html.parser')

        # interactor ID (local to this file) -> UniProt ID
        interactors = {}

        for i in soup.find_all('interactor'):

            primaryref = i.find('primaryref')

            # interactors without a primary reference can not be
            # translated to UniProt, hence are ignored
            if primaryref is None:

                continue

            if primaryref.attrs.get('db') == 'uniprotkb':

                interactors[i.attrs['id']] = primaryref.attrs['id']

        for i in soup.find_all('interaction'):

            description = ''
            pubmeds = []
            names = {}
            pdbs = []
            uniprots = []
            ids = collections.defaultdict(set)

            for a in i.find_all('attribute'):

                if a.attrs.get('name') == 'curated-complex':

                    description = a.text

            for sr in i.find_all('secondaryref'):

                db = sr.attrs.get('db')

                if db == 'pubmed':

                    pubmeds.append(sr.attrs['id'])

                elif db == 'wwpdb':

                    pdbs.append(sr.attrs['id'])

            for pr in i.find_all('primaryref'):

                if pr.attrs.get('db') in pdb_dbs:

                    pdbs.append(pr.attrs['id'])

            # identifiers of the complex itself: only the secondary
            # references of the first `xref` element of the interaction
            # are considered, and only those marked as identity
            xref = i.find('xref')

            if xref is not None:

                for sr in xref.find_all('secondaryref'):

                    if (
                        sr.attrs.get('reftype') == 'identity' and
                        sr.attrs.get('db') in id_dbs
                    ):

                        ids[sr.attrs['db']].add(sr.attrs['id'])

            # unique values in a deterministic order
            pubmeds = sorted(set(pubmeds))
            pdbs = sorted(set(pdbs))

            fullname_tag = i.find('fullname')
            fullname = None if fullname_tag is None else fullname_tag.text

            for a in i.find_all('alias'):

                # aliases are keyed by their type; one without type
                # can not be stored
                if 'type' in a.attrs:

                    names[a.attrs['type']] = a.text

            for intref in i.find_all('interactorref'):

                uniprot = interactors.get(intref.text)

                # not a UniProt interactor, or an identifier starting
                # with `PRO`: not used as a component
                if uniprot is None or uniprot.startswith('PRO'):

                    continue

                # keep only the part before the first hyphen
                # (presumably this removes the isoform suffix)
                uniprots.append(uniprot.split('-')[0])

            # complexes without any UniProt component are discarded
            if not uniprots:

                continue

            if pdbs:

                ids['PDB'].update(pdbs)

            cplex = intera.Complex(
                components = uniprots,
                name = names.get(name_key),
                references = set(pubmeds),
                sources = 'ComplexPortal',
                ids = ids,
            )

            key = cplex.__str__()

            if key in complexes:

                # the same complex encountered again: merge the records
                complexes[key] += cplex

            else:

                complexes[key] = cplex

            details.append({
                'uniprots': uniprots,
                'pdbs': pdbs,
                'pubmeds': pubmeds,
                'fullname': fullname,
                'names': names,
                'description': description
            })

    if return_details:

        return complexes, details

    return complexes